* config/arm/arm.c (arm_print_operand): Fix invalid alignment

[pf3gnuchains/gcc-fork.git] / gcc / config / arm / arm.c
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c

index eaac1cf..ba081d1 100644 (file)
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -1,6 +1,6 @@
  /* Output routines for GCC for ARM.
     Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
-   2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
+   2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
     Free Software Foundation, Inc.
     Contributed by Pieter `Tiggr' Schoenmakers (rcpieter@win.tue.nl)
     and Martin Simmons (@harleqn.co.uk).
@@ -164,6 +164,8 @@ static bool arm_xscale_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool
  static bool arm_9e_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
  static bool arm_rtx_costs (rtx, int, int, int, int *, bool);
  static int arm_address_cost (rtx, bool);
+static int arm_register_move_cost (enum machine_mode, reg_class_t, reg_class_t);
+static int arm_memory_move_cost (enum machine_mode, reg_class_t, bool);
  static bool arm_memory_load_p (rtx);
  static bool arm_cirrus_insn_p (rtx);
  static void cirrus_reorg (rtx);
@@ -267,6 +269,9 @@ static unsigned int arm_autovectorize_vector_sizes (void);
  static int arm_default_branch_cost (bool, bool);
  static int arm_cortex_a5_branch_cost (bool, bool);
  
+static bool arm_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+                                            const unsigned char *sel);
+
  \f
  /* Table of machine attributes.  */
  static const struct attribute_spec arm_attribute_table[] =
@@ -363,6 +368,12 @@ static const struct attribute_spec arm_attribute_table[] =
  #undef  TARGET_SCHED_ADJUST_COST
  #define TARGET_SCHED_ADJUST_COST arm_adjust_cost
  
+#undef TARGET_REGISTER_MOVE_COST
+#define TARGET_REGISTER_MOVE_COST arm_register_move_cost
+
+#undef TARGET_MEMORY_MOVE_COST
+#define TARGET_MEMORY_MOVE_COST arm_memory_move_cost
+
  #undef TARGET_ENCODE_SECTION_INFO
  #ifdef ARM_PE
  #define TARGET_ENCODE_SECTION_INFO  arm_pe_encode_section_info
@@ -604,6 +615,10 @@ static const struct attribute_spec arm_attribute_table[] =
  #define TARGET_PREFERRED_RENAME_CLASS \
    arm_preferred_rename_class
  
+#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
+#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
+  arm_vectorize_vec_perm_const_ok
+
  struct gcc_target targetm = TARGET_INITIALIZER;
  \f
  /* Obstack for minipool constant handling.  */
@@ -1981,7 +1996,8 @@ arm_option_override (void)
                            global_options_set.x_param_values);
  
    /* ARM EABI defaults to strict volatile bitfields.  */
-  if (TARGET_AAPCS_BASED && flag_strict_volatile_bitfields < 0)
+  if (TARGET_AAPCS_BASED && flag_strict_volatile_bitfields < 0
+      && abi_version_at_least(2))
      flag_strict_volatile_bitfields = 1;
  
    /* Enable sw prefetching at -O3 for CPUS that have prefetch, and we have deemed
@@ -3653,6 +3669,10 @@ arm_libcall_uses_aapcs_base (const_rtx libcall)
        add_libcall (libcall_htab,
                    convert_optab_libfunc (trunc_optab, HFmode, SFmode));
        add_libcall (libcall_htab,
+                  convert_optab_libfunc (sfix_optab, SImode, DFmode));
+      add_libcall (libcall_htab,
+                  convert_optab_libfunc (ufix_optab, SImode, DFmode));
+      add_libcall (libcall_htab,
                    convert_optab_libfunc (sfix_optab, DImode, DFmode));
        add_libcall (libcall_htab,
                    convert_optab_libfunc (ufix_optab, DImode, DFmode));
@@ -4244,6 +4264,11 @@ use_vfp_abi (enum arm_pcs pcs_variant, bool is_double)
           (TARGET_VFP_DOUBLE || !is_double));
  }
  
+/* Return true if an argument whose type is TYPE, or mode is MODE, is
+   suitable for passing or returning in VFP registers for the PCS
+   variant selected.  If it is, then *BASE_MODE is updated to contain
+   a machine mode describing each element of the argument's type and
+   *COUNT to hold the number of such elements.  */
  static bool
  aapcs_vfp_is_call_or_return_candidate (enum arm_pcs pcs_variant,
                                        enum machine_mode mode, const_tree type,
@@ -4251,9 +4276,20 @@ aapcs_vfp_is_call_or_return_candidate (enum arm_pcs pcs_variant,
  {
    enum machine_mode new_mode = VOIDmode;
  
-  if (GET_MODE_CLASS (mode) == MODE_FLOAT
-      || GET_MODE_CLASS (mode) == MODE_VECTOR_INT
-      || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
+  /* If we have the type information, prefer that to working things
+     out from the mode.  */
+  if (type)
+    {
+      int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
+
+      if (ag_count > 0 && ag_count <= 4)
+       *count = ag_count;
+      else
+       return false;
+    }
+  else if (GET_MODE_CLASS (mode) == MODE_FLOAT
+          || GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+          || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
      {
        *count = 1;
        new_mode = mode;
@@ -4263,15 +4299,6 @@ aapcs_vfp_is_call_or_return_candidate (enum arm_pcs pcs_variant,
        *count = 2;
        new_mode = (mode == DCmode ? DFmode : SFmode);
      }
-  else if (type && (mode == BLKmode || TREE_CODE (type) == VECTOR_TYPE))
-    {
-      int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
-
-      if (ag_count > 0 && ag_count <= 4)
-       *count = ag_count;
-      else
-       return false;
-    }
    else
      return false;
  
@@ -5241,6 +5268,14 @@ arm_function_ok_for_sibcall (tree decl, tree exp)
    if (IS_STACKALIGN (func_type))
      return false;
  
+  /* The AAPCS says that, on bare-metal, calls to unresolved weak
+     references should become a NOP.  Don't convert such calls into
+     sibling calls.  */
+  if (TARGET_AAPCS_BASED
+      && arm_abi == ARM_ABI_AAPCS
+      && DECL_WEAK (decl))
+    return false;
+
    /* Everything else is ok.  */
    return true;
  }
@@ -5543,11 +5578,7 @@ arm_load_pic_register (unsigned long saved_regs ATTRIBUTE_UNUSED)
  
        if (TARGET_32BIT)
         {
-         emit_insn (gen_pic_load_addr_32bit (pic_reg, pic_rtx));
-         if (TARGET_ARM)
-           emit_insn (gen_pic_add_dot_plus_eight (pic_reg, pic_reg, labelno));
-         else
-           emit_insn (gen_pic_add_dot_plus_four (pic_reg, pic_reg, labelno));
+         emit_insn (gen_pic_load_addr_unified (pic_reg, pic_rtx, labelno));
         }
        else /* TARGET_THUMB1 */
         {
@@ -5560,10 +5591,10 @@ arm_load_pic_register (unsigned long saved_regs ATTRIBUTE_UNUSED)
                                      thumb_find_work_register (saved_regs));
               emit_insn (gen_pic_load_addr_thumb1 (pic_tmp, pic_rtx));
               emit_insn (gen_movsi (pic_offset_table_rtx, pic_tmp));
+             emit_insn (gen_pic_add_dot_plus_four (pic_reg, pic_reg, labelno));
             }
           else
-           emit_insn (gen_pic_load_addr_thumb1 (pic_reg, pic_rtx));
-         emit_insn (gen_pic_add_dot_plus_four (pic_reg, pic_reg, labelno));
+           emit_insn (gen_pic_load_addr_unified (pic_reg, pic_rtx, labelno));
         }
      }
  
@@ -5593,20 +5624,7 @@ arm_pic_static_addr (rtx orig, rtx reg)
                                 UNSPEC_SYMBOL_OFFSET);
    offset_rtx = gen_rtx_CONST (Pmode, offset_rtx);
  
-  if (TARGET_32BIT)
-    {
-      emit_insn (gen_pic_load_addr_32bit (reg, offset_rtx));
-      if (TARGET_ARM)
-        insn = emit_insn (gen_pic_add_dot_plus_eight (reg, reg, labelno));
-      else
-        insn = emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno));
-    }
-  else /* TARGET_THUMB1 */
-    {
-      emit_insn (gen_pic_load_addr_thumb1 (reg, offset_rtx));
-      insn = emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno));
-    }
-
+  insn = emit_insn (gen_pic_load_addr_unified (reg, offset_rtx, labelno));
    return insn;
  }
  
@@ -5649,7 +5667,7 @@ static bool
  will_be_in_index_register (const_rtx x)
  {
    /* arm.md: calculate_pic_address will split this into a register.  */
-  return GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_PIC_SYM;
+  return GET_CODE (x) == UNSPEC && (XINT (x, 1) == UNSPEC_PIC_SYM);
  }
  
  /* Return nonzero if X is a valid ARM state address operand.  */
@@ -7613,6 +7631,15 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
  
      case SET:
        return false;
+      
+    case UNSPEC:
+      /* We cost this as high as our memory costs to allow this to
+        be hoisted from loops.  */
+      if (XINT (x, 1) == UNSPEC_PIC_UNIFIED)
+       {
+         *total = COSTS_N_INSNS (2 + ARM_NUM_REGS (mode));
+       }
+      return true;
  
      default:
        *total = COSTS_N_INSNS (4);
@@ -8484,6 +8511,63 @@ fa726te_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost)
    return true;
  }
  
+/* Implement TARGET_REGISTER_MOVE_COST.
+
+   Moves between FPA_REGS and GENERAL_REGS are two memory insns.
+   Moves between VFP_REGS and GENERAL_REGS are a single insn, but
+   it is typically more expensive than a single memory access.  We set
+   the cost to less than two memory accesses so that floating
+   point to integer conversion does not go through memory.  */
+
+int
+arm_register_move_cost (enum machine_mode mode ATTRIBUTE_UNUSED,
+                       reg_class_t from, reg_class_t to)
+{
+  if (TARGET_32BIT)
+    {
+      if ((from == FPA_REGS && to != FPA_REGS)
+         || (from != FPA_REGS && to == FPA_REGS))
+       return 20;
+      else if ((IS_VFP_CLASS (from) && !IS_VFP_CLASS (to))
+              || (!IS_VFP_CLASS (from) && IS_VFP_CLASS (to)))
+       return 15;
+      else if ((from == IWMMXT_REGS && to != IWMMXT_REGS)
+              || (from != IWMMXT_REGS && to == IWMMXT_REGS))
+       return 4;
+      else if (from == IWMMXT_GR_REGS || to == IWMMXT_GR_REGS)
+       return 20;
+      else if ((from == CIRRUS_REGS && to != CIRRUS_REGS)
+              || (from != CIRRUS_REGS && to == CIRRUS_REGS))
+       return 20;
+      else
+       return 2;
+    }
+  else
+    {
+      if (from == HI_REGS || to == HI_REGS)
+       return 4;
+      else
+       return 2;
+    }
+}
+
+/* Implement TARGET_MEMORY_MOVE_COST.  */
+
+int
+arm_memory_move_cost (enum machine_mode mode, reg_class_t rclass,
+                     bool in ATTRIBUTE_UNUSED)
+{
+  if (TARGET_32BIT)
+    return 10;
+  else
+    {
+      if (GET_MODE_SIZE (mode) < 4)
+       return 8;
+      else
+       return ((2 * GET_MODE_SIZE (mode)) * (rclass == LO_REGS ? 1 : 2));
+    }
+}
+
  /* This function implements the target macro TARGET_SCHED_ADJUST_COST.
     It corrects the value of COST based on the relationship between
     INSN and DEP through the dependence LINK.  It returns the new
@@ -9916,7 +10000,8 @@ static int
  arm_note_pic_base (rtx *x, void *date ATTRIBUTE_UNUSED)
  {
    if (GET_CODE (*x) == UNSPEC
-      && XINT (*x, 1) == UNSPEC_PIC_BASE)
+      && (XINT (*x, 1) == UNSPEC_PIC_BASE
+         || XINT (*x, 1) == UNSPEC_PIC_UNIFIED))
      return 1;
    return 0;
  }
@@ -11606,7 +11691,7 @@ arm_select_cc_mode (enum rtx_code op, rtx x, rtx y)
             return CC_Zmode;
  
           /* We can do an equality test in three Thumb instructions.  */
-         if (!TARGET_ARM)
+         if (!TARGET_32BIT)
             return CC_Zmode;
  
           /* FALLTHROUGH */
@@ -11618,7 +11703,7 @@ arm_select_cc_mode (enum rtx_code op, rtx x, rtx y)
           /* DImode unsigned comparisons can be implemented by cmp +
              cmpeq without a scratch register.  Not worth doing in
              Thumb-2.  */
-         if (TARGET_ARM)
+         if (TARGET_32BIT)
             return CC_CZmode;
  
           /* FALLTHROUGH */
@@ -11645,7 +11730,7 @@ arm_select_cc_mode (enum rtx_code op, rtx x, rtx y)
     return the rtx for register 0 in the proper mode.  FP means this is a
     floating point compare: I don't think that it is needed on the arm.  */
  rtx
-arm_gen_compare_reg (enum rtx_code code, rtx x, rtx y)
+arm_gen_compare_reg (enum rtx_code code, rtx x, rtx y, rtx scratch)
  {
    enum machine_mode mode;
    rtx cc_reg;
@@ -11670,11 +11755,18 @@ arm_gen_compare_reg (enum rtx_code code, rtx x, rtx y)
          CC_CZmode is cheaper.  */
        if (mode == CC_Zmode && y != const0_rtx)
         {
+         gcc_assert (!reload_completed);
           x = expand_binop (DImode, xor_optab, x, y, NULL_RTX, 0, OPTAB_WIDEN);
           y = const0_rtx;
         }
+
        /* A scratch register is required.  */
-      clobber = gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (SImode));
+      if (reload_completed)
+       gcc_assert (scratch != NULL && GET_MODE (scratch) == SImode);
+      else
+       scratch = gen_rtx_SCRATCH (SImode);
+
+      clobber = gen_rtx_CLOBBER (VOIDmode, scratch);
        set = gen_rtx_SET (VOIDmode, cc_reg, gen_rtx_COMPARE (mode, x, y));
        emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clobber)));
      }
@@ -14109,6 +14201,9 @@ output_move_double (rtx *operands, bool emit, int *count)
                     output_asm_insn ("sub%?\t%0, %1, %2", otherops);
                 }
  
+             if (count)
+               *count = 2;
+
               if (TARGET_LDRD)
                 return "ldr%(d%)\t%0, [%1]";
  
@@ -17617,11 +17712,11 @@ arm_print_operand (FILE *stream, rtx x, int code)
         memsize = MEM_SIZE (x);
  
         /* Only certain alignment specifiers are supported by the hardware.  */
-       if (memsize == 16 && (align % 32) == 0)
+       if (memsize == 32 && (align % 32) == 0)
           align_bits = 256;
-       else if ((memsize == 8 || memsize == 16) && (align % 16) == 0)
+       else if ((memsize == 16 || memsize == 32) && (align % 16) == 0)
           align_bits = 128;
-       else if ((align % 8) == 0)
+       else if (memsize >= 8 && (align % 8) == 0)
           align_bits = 64;
         else
           align_bits = 0;
@@ -17671,6 +17766,11 @@ arm_print_operand (FILE *stream, rtx x, int code)
        }
        return;
  
+    case 'v':
+       gcc_assert (GET_CODE (x) == CONST_DOUBLE);
+       fprintf (stream, "#%d", vfp3_const_double_for_fract_bits (x));
+       return;
+
      /* Register specifier for vld1.16/vst1.16.  Translate the S register
         number into a D register number and element index.  */
      case 'z':
@@ -19004,7 +19104,9 @@ static neon_builtin_datum neon_builtin_data[] =
    VAR3 (BINOP, vsubhn, v8hi, v4si, v2di),
    VAR8 (BINOP, vceq, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf),
    VAR8 (BINOP, vcge, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf),
+  VAR6 (BINOP, vcgeu, v8qi, v4hi, v2si, v16qi, v8hi, v4si),
    VAR8 (BINOP, vcgt, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf),
+  VAR6 (BINOP, vcgtu, v8qi, v4hi, v2si, v16qi, v8hi, v4si),
    VAR2 (BINOP, vcage, v2sf, v4sf),
    VAR2 (BINOP, vcagt, v2sf, v4sf),
    VAR6 (BINOP, vtst, v8qi, v4hi, v2si, v16qi, v8hi, v4si),
@@ -20818,6 +20920,57 @@ neon_disambiguate_copy (rtx *operands, rtx *dest, rtx *src, unsigned int count)
      }
  }
  
+/* Split operands into moves from op[1] + op[2] into op[0].  */
+
+void
+neon_split_vcombine (rtx operands[3])
+{
+  unsigned int dest = REGNO (operands[0]);
+  unsigned int src1 = REGNO (operands[1]);
+  unsigned int src2 = REGNO (operands[2]);
+  enum machine_mode halfmode = GET_MODE (operands[1]);
+  unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
+  rtx destlo, desthi;
+
+  if (src1 == dest && src2 == dest + halfregs)
+    {
+      /* No-op move.  Can't split to nothing; emit something.  */
+      emit_note (NOTE_INSN_DELETED);
+      return;
+    }
+
+  /* Preserve register attributes for variable tracking.  */
+  destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
+  desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
+                              GET_MODE_SIZE (halfmode));
+
+  /* Special case of reversed high/low parts.  Use VSWP.  */
+  if (src2 == dest && src1 == dest + halfregs)
+    {
+      rtx x = gen_rtx_SET (VOIDmode, destlo, operands[1]);
+      rtx y = gen_rtx_SET (VOIDmode, desthi, operands[2]);
+      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, x, y)));
+      return;
+    }
+
+  if (!reg_overlap_mentioned_p (operands[2], destlo))
+    {
+      /* Try to avoid unnecessary moves if part of the result
+        is in the right place already.  */
+      if (src1 != dest)
+       emit_move_insn (destlo, operands[1]);
+      if (src2 != dest + halfregs)
+       emit_move_insn (desthi, operands[2]);
+    }
+  else
+    {
+      if (src2 != dest + halfregs)
+       emit_move_insn (desthi, operands[2]);
+      if (src1 != dest)
+       emit_move_insn (destlo, operands[1]);
+    }
+}
+
  /* Expand an expression EXP that calls a built-in function,
     with result going to TARGET if that's convenient
     (and in mode MODE if that's convenient).
@@ -24327,715 +24480,1041 @@ arm_have_conditional_execution (void)
    return !TARGET_THUMB1;
  }
  
-/* Legitimize a memory reference for sync primitive implemented using
-   ldrex / strex.  We currently force the form of the reference to be
-   indirect without offset.  We do not yet support the indirect offset
-   addressing supported by some ARM targets for these
-   instructions.  */
-static rtx
-arm_legitimize_sync_memory (rtx memory)
+static unsigned int
+arm_autovectorize_vector_sizes (void)
  {
-  rtx addr = force_reg (Pmode, XEXP (memory, 0));
-  rtx legitimate_memory = gen_rtx_MEM (GET_MODE (memory), addr);
-
-  set_mem_alias_set (legitimate_memory, ALIAS_SET_MEMORY_BARRIER);
-  MEM_VOLATILE_P (legitimate_memory) = MEM_VOLATILE_P (memory);
-  return legitimate_memory;
+  return TARGET_NEON_VECTORIZE_DOUBLE ? 0 : (16 | 8);
  }
  
-/* An instruction emitter. */
-typedef void (* emit_f) (int label, const char *, rtx *);
-
-/* An instruction emitter that emits via the conventional
-   output_asm_insn.  */
-static void
-arm_emit (int label ATTRIBUTE_UNUSED, const char *pattern, rtx *operands)
+static bool
+arm_vector_alignment_reachable (const_tree type, bool is_packed)
  {
-  output_asm_insn (pattern, operands);
-}
-
-/* Count the number of emitted synchronization instructions.  */
-static unsigned arm_insn_count;
+  /* Vectors which aren't in packed structures will not be less aligned than
+     the natural alignment of their element type, so this is safe.  */
+  if (TARGET_NEON && !BYTES_BIG_ENDIAN)
+    return !is_packed;
  
-/* An emitter that counts emitted instructions but does not actually
-   emit instruction into the instruction stream.  */
-static void
-arm_count (int label,
-          const char *pattern ATTRIBUTE_UNUSED,
-          rtx *operands ATTRIBUTE_UNUSED)
-{
-  if (! label)
-    ++ arm_insn_count;
+  return default_builtin_vector_alignment_reachable (type, is_packed);
  }
  
-/* Construct a pattern using conventional output formatting and feed
-   it to output_asm_insn.  Provides a mechanism to construct the
-   output pattern on the fly.  Note the hard limit on the pattern
-   buffer size.  */
-static void ATTRIBUTE_PRINTF_4
-arm_output_asm_insn (emit_f emit, int label, rtx *operands,
-                    const char *pattern, ...)
+static bool
+arm_builtin_support_vector_misalignment (enum machine_mode mode,
+                                        const_tree type, int misalignment,
+                                        bool is_packed)
  {
-  va_list ap;
-  char buffer[256];
+  if (TARGET_NEON && !BYTES_BIG_ENDIAN)
+    {
+      HOST_WIDE_INT align = TYPE_ALIGN_UNIT (type);
  
-  va_start (ap, pattern);
-  vsprintf (buffer, pattern, ap);
-  va_end (ap);
-  emit (label, buffer, operands);
+      if (is_packed)
+        return align == 1;
+
+      /* If the misalignment is unknown, we should be able to handle the access
+        so long as it is not to a member of a packed data structure.  */
+      if (misalignment == -1)
+        return true;
+
+      /* Return true if the misalignment is a multiple of the natural alignment
+         of the vector's element type.  This is probably always going to be
+        true in practice, since we've already established that this isn't a
+        packed access.  */
+      return ((misalignment % align) == 0);
+    }
+
+  return default_builtin_support_vector_misalignment (mode, type, misalignment,
+                                                     is_packed);
  }
  
-/* Emit the memory barrier instruction, if any, provided by this
-   target to a specified emitter.  */
  static void
-arm_process_output_memory_barrier (emit_f emit, rtx *operands)
+arm_conditional_register_usage (void)
  {
-  if (TARGET_HAVE_DMB)
+  int regno;
+
+  if (TARGET_SOFT_FLOAT || TARGET_THUMB1 || !TARGET_FPA)
      {
-      /* Note we issue a system level barrier. We should consider
-         issuing a inner shareabilty zone barrier here instead, ie.
-         "DMB ISH".  */
-      emit (0, "dmb\tsy", operands);
-      return;
+      for (regno = FIRST_FPA_REGNUM;
+          regno <= LAST_FPA_REGNUM; ++regno)
+       fixed_regs[regno] = call_used_regs[regno] = 1;
      }
  
-  if (TARGET_HAVE_DMB_MCR)
+  if (TARGET_THUMB1 && optimize_size)
      {
-      emit (0, "mcr\tp15, 0, r0, c7, c10, 5", operands);
-      return;
+      /* When optimizing for size on Thumb-1, it's better not
+        to use the HI regs, because of the overhead of
+        stacking them.  */
+      for (regno = FIRST_HI_REGNUM;
+          regno <= LAST_HI_REGNUM; ++regno)
+       fixed_regs[regno] = call_used_regs[regno] = 1;
      }
  
-  gcc_unreachable ();
-}
-
-/* Emit the memory barrier instruction, if any, provided by this
-   target.  */
-const char *
-arm_output_memory_barrier (rtx *operands)
-{
-  arm_process_output_memory_barrier (arm_emit, operands);
-  return "";
-}
+  /* The link register can be clobbered by any branch insn,
+     but we have no way to track that at present, so mark
+     it as unavailable.  */
+  if (TARGET_THUMB1)
+    fixed_regs[LR_REGNUM] = call_used_regs[LR_REGNUM] = 1;
  
-/* Helper to figure out the instruction suffix required on ldrex/strex
-   for operations on an object of the specified mode.  */
-static const char *
-arm_ldrex_suffix (enum machine_mode mode)
-{
-  switch (mode)
+  if (TARGET_32BIT && TARGET_HARD_FLOAT)
      {
-    case QImode: return "b";
-    case HImode: return "h";
-    case SImode: return "";
-    case DImode: return "d";
-    default:
-      gcc_unreachable ();
+      if (TARGET_MAVERICK)
+       {
+         for (regno = FIRST_FPA_REGNUM;
+              regno <= LAST_FPA_REGNUM; ++ regno)
+           fixed_regs[regno] = call_used_regs[regno] = 1;
+         for (regno = FIRST_CIRRUS_FP_REGNUM;
+              regno <= LAST_CIRRUS_FP_REGNUM; ++ regno)
+           {
+             fixed_regs[regno] = 0;
+             call_used_regs[regno] = regno < FIRST_CIRRUS_FP_REGNUM + 4;
+           }
+       }
+      if (TARGET_VFP)
+       {
+         /* VFPv3 registers are disabled when earlier VFP
+            versions are selected due to the definition of
+            LAST_VFP_REGNUM.  */
+         for (regno = FIRST_VFP_REGNUM;
+              regno <= LAST_VFP_REGNUM; ++ regno)
+           {
+             fixed_regs[regno] = 0;
+             call_used_regs[regno] = regno < FIRST_VFP_REGNUM + 16
+               || regno >= FIRST_VFP_REGNUM + 32;
+           }
+       }
      }
-  return "";
-}
  
-/* Emit an ldrex{b,h,d, } instruction appropriate for the specified
-   mode.  */
-static void
-arm_output_ldrex (emit_f emit,
-                 enum machine_mode mode,
-                 rtx target,
-                 rtx memory)
-{
-  rtx operands[3];
-
-  operands[0] = target;
-  if (mode != DImode)
+  if (TARGET_REALLY_IWMMXT)
      {
-      const char *suffix = arm_ldrex_suffix (mode);
-      operands[1] = memory;
-      arm_output_asm_insn (emit, 0, operands, "ldrex%s\t%%0, %%C1", suffix);
+      regno = FIRST_IWMMXT_GR_REGNUM;
+      /* The 2002/10/09 revision of the XScale ABI has wCG0
+         and wCG1 as call-preserved registers.  The 2002/11/21
+         revision changed this so that all wCG registers are
+         scratch registers.  */
+      for (regno = FIRST_IWMMXT_GR_REGNUM;
+          regno <= LAST_IWMMXT_GR_REGNUM; ++ regno)
+       fixed_regs[regno] = 0;
+      /* The XScale ABI has wR0 - wR9 as scratch registers,
+        the rest as call-preserved registers.  */
+      for (regno = FIRST_IWMMXT_REGNUM;
+          regno <= LAST_IWMMXT_REGNUM; ++ regno)
+       {
+         fixed_regs[regno] = 0;
+         call_used_regs[regno] = regno < FIRST_IWMMXT_REGNUM + 10;
+       }
      }
-  else
+
+  if ((unsigned) PIC_OFFSET_TABLE_REGNUM != INVALID_REGNUM)
      {
-      /* The restrictions on target registers in ARM mode are that the two
-        registers are consecutive and the first one is even; Thumb is
-        actually more flexible, but DI should give us this anyway.
-        Note that the 1st register always gets the lowest word in memory.  */
-      gcc_assert ((REGNO (target) & 1) == 0);
-      operands[1] = gen_rtx_REG (SImode, REGNO (target) + 1);
-      operands[2] = memory;
-      arm_output_asm_insn (emit, 0, operands, "ldrexd\t%%0, %%1, %%C2");
+      fixed_regs[PIC_OFFSET_TABLE_REGNUM] = 1;
+      call_used_regs[PIC_OFFSET_TABLE_REGNUM] = 1;
      }
-}
-
-/* Emit a strex{b,h,d, } instruction appropriate for the specified
-   mode.  */
-static void
-arm_output_strex (emit_f emit,
-                 enum machine_mode mode,
-                 const char *cc,
-                 rtx result,
-                 rtx value,
-                 rtx memory)
-{
-  rtx operands[4];
-
-  operands[0] = result;
-  operands[1] = value;
-  if (mode != DImode)
+  else if (TARGET_APCS_STACK)
      {
-      const char *suffix = arm_ldrex_suffix (mode);
-      operands[2] = memory;
-      arm_output_asm_insn (emit, 0, operands, "strex%s%s\t%%0, %%1, %%C2",
-                         suffix, cc);
+      fixed_regs[10]     = 1;
+      call_used_regs[10] = 1;
      }
-  else
+  /* -mcaller-super-interworking reserves r11 for calls to
+     _interwork_r11_call_via_rN().  Making the register global
+     is an easy way of ensuring that it remains valid for all
+     calls.  */
+  if (TARGET_APCS_FRAME || TARGET_CALLER_INTERWORKING
+      || TARGET_TPCS_FRAME || TARGET_TPCS_LEAF_FRAME)
      {
-      /* The restrictions on target registers in ARM mode are that the two
-        registers are consecutive and the first one is even; Thumb is
-        actually more flexible, but DI should give us this anyway.
-        Note that the 1st register always gets the lowest word in memory.  */
-      gcc_assert ((REGNO (value) & 1) == 0 || TARGET_THUMB2);
-      operands[2] = gen_rtx_REG (SImode, REGNO (value) + 1);
-      operands[3] = memory;
-      arm_output_asm_insn (emit, 0, operands, "strexd%s\t%%0, %%1, %%2, %%C3",
-                          cc);
+      fixed_regs[ARM_HARD_FRAME_POINTER_REGNUM] = 1;
+      call_used_regs[ARM_HARD_FRAME_POINTER_REGNUM] = 1;
+      if (TARGET_CALLER_INTERWORKING)
+       global_regs[ARM_HARD_FRAME_POINTER_REGNUM] = 1;
      }
+  SUBTARGET_CONDITIONAL_REGISTER_USAGE
  }
  
-/* Helper to emit an it instruction in Thumb2 mode only; although the assembler
-   will ignore it in ARM mode, emitting it will mess up instruction counts we
-   sometimes keep 'flags' are the extra t's and e's if it's more than one
-   instruction that is conditional.  */
-static void
-arm_output_it (emit_f emit, const char *flags, const char *cond)
-{
-  rtx operands[1]; /* Don't actually use the operand.  */
-  if (TARGET_THUMB2)
-    arm_output_asm_insn (emit, 0, operands, "it%s\t%s", flags, cond);
-}
-
-/* Helper to emit a two operand instruction.  */
-static void
-arm_output_op2 (emit_f emit, const char *mnemonic, rtx d, rtx s)
+static reg_class_t
+arm_preferred_rename_class (reg_class_t rclass)
  {
-  rtx operands[2];
-
-  operands[0] = d;
-  operands[1] = s;
-  arm_output_asm_insn (emit, 0, operands, "%s\t%%0, %%1", mnemonic);
-}
+  /* Thumb-2 instructions using LO_REGS may be smaller than instructions
+     using GENERIC_REGS.  During register rename pass, we prefer LO_REGS,
+     and code size can be reduced.  */
+  if (TARGET_THUMB2 && rclass == GENERAL_REGS)
+    return LO_REGS;
+  else
+    return NO_REGS;
+}
  
-/* Helper to emit a three operand instruction.  */
-static void
-arm_output_op3 (emit_f emit, const char *mnemonic, rtx d, rtx a, rtx b)
+/* Compute the atrribute "length" of insn "*push_multi".
+   So this function MUST be kept in sync with that insn pattern.  */
+int
+arm_attr_length_push_multi(rtx parallel_op, rtx first_op)
  {
-  rtx operands[3];
+  int i, regno, hi_reg;
+  int num_saves = XVECLEN (parallel_op, 0);
  
-  operands[0] = d;
-  operands[1] = a;
-  operands[2] = b;
-  arm_output_asm_insn (emit, 0, operands, "%s\t%%0, %%1, %%2", mnemonic);
-}
+  /* ARM mode.  */
+  if (TARGET_ARM)
+    return 4;
+  /* Thumb1 mode.  */
+  if (TARGET_THUMB1)
+    return 2;
  
-/* Emit a load store exclusive synchronization loop.
+  /* Thumb2 mode.  */
+  regno = REGNO (first_op);
+  hi_reg = (REGNO_REG_CLASS (regno) == HI_REGS) && (regno != LR_REGNUM);
+  for (i = 1; i < num_saves && !hi_reg; i++)
+    {
+      regno = REGNO (XEXP (XVECEXP (parallel_op, 0, i), 0));
+      hi_reg |= (REGNO_REG_CLASS (regno) == HI_REGS) && (regno != LR_REGNUM);
+    }
  
-   do
-     old_value = [mem]
-     if old_value != required_value
-       break;
-     t1 = sync_op (old_value, new_value)
-     [mem] = t1, t2 = [0|1]
-   while ! t2
+  if (!hi_reg)
+    return 2;
+  return 4;
+}
  
-   Note:
-     t1 == t2 is not permitted
-     t1 == old_value is permitted
+/* Compute the number of instructions emitted by output_move_double.  */
+int
+arm_count_output_move_double_insns (rtx *operands)
+{
+  int count;
+  rtx ops[2];
+  /* output_move_double may modify the operands array, so call it
+     here on a copy of the array.  */
+  ops[0] = operands[0];
+  ops[1] = operands[1];
+  output_move_double (ops, false, &count);
+  return count;
+}
  
-   required_value:
+int
+vfp3_const_double_for_fract_bits (rtx operand)
+{
+  REAL_VALUE_TYPE r0;
+  
+  if (GET_CODE (operand) != CONST_DOUBLE)
+    return 0;
+  
+  REAL_VALUE_FROM_CONST_DOUBLE (r0, operand);
+  if (exact_real_inverse (DFmode, &r0))
+    {
+      if (exact_real_truncate (DFmode, &r0))
+       {
+         HOST_WIDE_INT value = real_to_integer (&r0);
+         value = value & 0xffffffff;
+         if ((value != 0) && ( (value & (value - 1)) == 0))
+           return int_log2 (value);
+       }
+    }
+  return 0;
+}
+\f
+/* Emit a memory barrier around an atomic sequence according to MODEL.  */
  
-   RTX register representing the required old_value for
-   the modify to continue, if NULL no comparsion is performed.  */
  static void
-arm_output_sync_loop (emit_f emit,
-                     enum machine_mode mode,
-                     rtx old_value,
-                     rtx memory,
-                     rtx required_value,
-                     rtx new_value,
-                     rtx t1,
-                     rtx t2,
-                     enum attr_sync_op sync_op,
-                     int early_barrier_required)
+arm_pre_atomic_barrier (enum memmodel model)
  {
-  rtx operands[2];
-  /* We'll use the lo for the normal rtx in the none-DI case
-     as well as the least-sig word in the DI case.  */
-  rtx old_value_lo, required_value_lo, new_value_lo, t1_lo;
-  rtx old_value_hi, required_value_hi, new_value_hi, t1_hi;
+  switch (model)
+    {
+    case MEMMODEL_RELAXED:
+    case MEMMODEL_CONSUME:
+    case MEMMODEL_ACQUIRE:
+      break;
+    case MEMMODEL_RELEASE:
+    case MEMMODEL_ACQ_REL:
+    case MEMMODEL_SEQ_CST:
+      emit_insn (gen_memory_barrier ());
+      break;
+    default:
+      gcc_unreachable ();
+    }
+}
  
-  bool is_di = mode == DImode;
+static void
+arm_post_atomic_barrier (enum memmodel model)
+{
+  switch (model)
+    {
+    case MEMMODEL_RELAXED:
+    case MEMMODEL_CONSUME:
+    case MEMMODEL_RELEASE:
+      break;
+    case MEMMODEL_ACQUIRE:
+    case MEMMODEL_ACQ_REL:
+    case MEMMODEL_SEQ_CST:
+      emit_insn (gen_memory_barrier ());
+      break;
+    default:
+      gcc_unreachable ();
+    }
+}
  
-  gcc_assert (t1 != t2);
+/* Emit the load-exclusive and store-exclusive instructions.  */
  
-  if (early_barrier_required)
-    arm_process_output_memory_barrier (emit, NULL);
+static void
+arm_emit_load_exclusive (enum machine_mode mode, rtx rval, rtx mem)
+{
+  rtx (*gen) (rtx, rtx);
  
-  arm_output_asm_insn (emit, 1, operands, "%sLSYT%%=:", LOCAL_LABEL_PREFIX);
+  switch (mode)
+    {
+    case QImode: gen = gen_arm_load_exclusiveqi; break;
+    case HImode: gen = gen_arm_load_exclusivehi; break;
+    case SImode: gen = gen_arm_load_exclusivesi; break;
+    case DImode: gen = gen_arm_load_exclusivedi; break;
+    default:
+      gcc_unreachable ();
+    }
+
+  emit_insn (gen (rval, mem));
+}
  
-  arm_output_ldrex (emit, mode, old_value, memory);
+static void
+arm_emit_store_exclusive (enum machine_mode mode, rtx bval, rtx rval, rtx mem)
+{
+  rtx (*gen) (rtx, rtx, rtx);
  
-  if (is_di)
+  switch (mode)
      {
-      old_value_lo = gen_lowpart (SImode, old_value);
-      old_value_hi = gen_highpart (SImode, old_value);
-      if (required_value)
-       {
-         required_value_lo = gen_lowpart (SImode, required_value);
-         required_value_hi = gen_highpart (SImode, required_value);
-       }
-      else
-       {
-         /* Silence false potentially unused warning.  */
-         required_value_lo = NULL_RTX;
-         required_value_hi = NULL_RTX;
-       }
-      new_value_lo = gen_lowpart (SImode, new_value);
-      new_value_hi = gen_highpart (SImode, new_value);
-      t1_lo = gen_lowpart (SImode, t1);
-      t1_hi = gen_highpart (SImode, t1);
+    case QImode: gen = gen_arm_store_exclusiveqi; break;
+    case HImode: gen = gen_arm_store_exclusivehi; break;
+    case SImode: gen = gen_arm_store_exclusivesi; break;
+    case DImode: gen = gen_arm_store_exclusivedi; break;
+    default:
+      gcc_unreachable ();
      }
-  else
-    {
-      old_value_lo = old_value;
-      new_value_lo = new_value;
-      required_value_lo = required_value;
-      t1_lo = t1;
  
-      /* Silence false potentially unused warning.  */
-      t1_hi = NULL_RTX;
-      new_value_hi = NULL_RTX;
-      required_value_hi = NULL_RTX;
-      old_value_hi = NULL_RTX;
-    }
+  emit_insn (gen (bval, rval, mem));
+}
  
-  if (required_value)
-    {
-      operands[0] = old_value_lo;
-      operands[1] = required_value_lo;
+/* Mark the previous jump instruction as unlikely.  */
  
-      arm_output_asm_insn (emit, 0, operands, "cmp\t%%0, %%1");
-      if (is_di)
-        {
-          arm_output_it (emit, "", "eq");
-          arm_output_op2 (emit, "cmpeq", old_value_hi, required_value_hi);
-        }
-      arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYB%%=", LOCAL_LABEL_PREFIX);
-    }
+static void
+emit_unlikely_jump (rtx insn)
+{
+  rtx very_unlikely = GEN_INT (REG_BR_PROB_BASE / 100 - 1);
  
-  switch (sync_op)
-    {
-    case SYNC_OP_ADD:
-      arm_output_op3 (emit, is_di ? "adds" : "add",
-                     t1_lo, old_value_lo, new_value_lo);
-      if (is_di)
-       arm_output_op3 (emit, "adc", t1_hi, old_value_hi, new_value_hi);
-      break;
+  insn = emit_jump_insn (insn);
+  add_reg_note (insn, REG_BR_PROB, very_unlikely);
+}
  
-    case SYNC_OP_SUB:
-      arm_output_op3 (emit, is_di ? "subs" : "sub",
-                     t1_lo, old_value_lo, new_value_lo);
-      if (is_di)
-       arm_output_op3 (emit, "sbc", t1_hi, old_value_hi, new_value_hi);
-      break;
+/* Expand a compare and swap pattern.  */
  
-    case SYNC_OP_IOR:
-      arm_output_op3 (emit, "orr", t1_lo, old_value_lo, new_value_lo);
-      if (is_di)
-       arm_output_op3 (emit, "orr", t1_hi, old_value_hi, new_value_hi);
-      break;
+void
+arm_expand_compare_and_swap (rtx operands[])
+{
+  rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
+  enum machine_mode mode;
+  rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
+
+  bval = operands[0];
+  rval = operands[1];
+  mem = operands[2];
+  oldval = operands[3];
+  newval = operands[4];
+  is_weak = operands[5];
+  mod_s = operands[6];
+  mod_f = operands[7];
+  mode = GET_MODE (mem);
  
-    case SYNC_OP_XOR:
-      arm_output_op3 (emit, "eor", t1_lo, old_value_lo, new_value_lo);
-      if (is_di)
-       arm_output_op3 (emit, "eor", t1_hi, old_value_hi, new_value_hi);
-      break;
+  switch (mode)
+    {
+    case QImode:
+    case HImode:
+      /* For narrow modes, we're going to perform the comparison in SImode,
+        so do the zero-extension now.  */
+      rval = gen_reg_rtx (SImode);
+      oldval = convert_modes (SImode, mode, oldval, true);
+      /* FALLTHRU */
  
-    case SYNC_OP_AND:
-      arm_output_op3 (emit,"and", t1_lo, old_value_lo, new_value_lo);
-      if (is_di)
-       arm_output_op3 (emit, "and", t1_hi, old_value_hi, new_value_hi);
+    case SImode:
+      /* Force the value into a register if needed.  We waited until after
+        the zero-extension above to do this properly.  */
+      if (!arm_add_operand (oldval, mode))
+       oldval = force_reg (mode, oldval);
        break;
  
-    case SYNC_OP_NAND:
-      arm_output_op3 (emit, "and", t1_lo, old_value_lo, new_value_lo);
-      if (is_di)
-       arm_output_op3 (emit, "and", t1_hi, old_value_hi, new_value_hi);
-      arm_output_op2 (emit, "mvn", t1_lo, t1_lo);
-      if (is_di)
-       arm_output_op2 (emit, "mvn", t1_hi, t1_hi);
+    case DImode:
+      if (!cmpdi_operand (oldval, mode))
+       oldval = force_reg (mode, oldval);
        break;
  
-    case SYNC_OP_NONE:
-      t1 = new_value;
-      t1_lo = new_value_lo;
-      if (is_di)
-       t1_hi = new_value_hi;
-      break;
+    default:
+      gcc_unreachable ();
      }
  
-  /* Note that the result of strex is a 0/1 flag that's always 1 register.  */
-  if (t2)
+  switch (mode)
      {
-      arm_output_strex (emit, mode, "", t2, t1, memory);
-      operands[0] = t2;
-      arm_output_asm_insn (emit, 0, operands, "teq\t%%0, #0");
-      arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYT%%=",
-                          LOCAL_LABEL_PREFIX);
+    case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
+    case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
+    case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
+    case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
+    default:
+      gcc_unreachable ();
      }
-  else
-    {
-      /* Use old_value for the return value because for some operations
-        the old_value can easily be restored.  This saves one register.  */
-      arm_output_strex (emit, mode, "", old_value_lo, t1, memory);
-      operands[0] = old_value_lo;
-      arm_output_asm_insn (emit, 0, operands, "teq\t%%0, #0");
-      arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYT%%=",
-                          LOCAL_LABEL_PREFIX);
  
-      /* Note that we only used the _lo half of old_value as a temporary
-        so in DI we don't have to restore the _hi part.  */
-      switch (sync_op)
-       {
-       case SYNC_OP_ADD:
-         arm_output_op3 (emit, "sub", old_value_lo, t1_lo, new_value_lo);
-         break;
+  emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
  
-       case SYNC_OP_SUB:
-         arm_output_op3 (emit, "add", old_value_lo, t1_lo, new_value_lo);
-         break;
+  if (mode == QImode || mode == HImode)
+    emit_move_insn (operands[1], gen_lowpart (mode, rval));
  
-       case SYNC_OP_XOR:
-         arm_output_op3 (emit, "eor", old_value_lo, t1_lo, new_value_lo);
-         break;
+  /* In all cases, we arrange for success to be signaled by Z set.
+     This arrangement allows for the boolean result to be used directly
+     in a subsequent branch, post optimization.  */
+  x = gen_rtx_REG (CCmode, CC_REGNUM);
+  x = gen_rtx_EQ (SImode, x, const0_rtx);
+  emit_insn (gen_rtx_SET (VOIDmode, bval, x));
+}
  
-       case SYNC_OP_NONE:
-         arm_output_op2 (emit, "mov", old_value_lo, required_value_lo);
-         break;
+/* Split a compare and swap pattern.  It is IMPLEMENTATION DEFINED whether
+   another memory store between the load-exclusive and store-exclusive can
+   reset the monitor from Exclusive to Open state.  This means we must wait
+   until after reload to split the pattern, lest we get a register spill in
+   the middle of the atomic sequence.  */
  
-       default:
-         gcc_unreachable ();
-       }
+void
+arm_split_compare_and_swap (rtx operands[])
+{
+  rtx rval, mem, oldval, newval, scratch;
+  enum machine_mode mode;
+  enum memmodel mod_s, mod_f;
+  bool is_weak;
+  rtx label1, label2, x, cond;
+
+  rval = operands[0];
+  mem = operands[1];
+  oldval = operands[2];
+  newval = operands[3];
+  is_weak = (operands[4] != const0_rtx);
+  mod_s = (enum memmodel) INTVAL (operands[5]);
+  mod_f = (enum memmodel) INTVAL (operands[6]);
+  scratch = operands[7];
+  mode = GET_MODE (mem);
+
+  arm_pre_atomic_barrier (mod_s);
+
+  label1 = NULL_RTX;
+  if (!is_weak)
+    {
+      label1 = gen_label_rtx ();
+      emit_label (label1);
      }
+  label2 = gen_label_rtx ();
  
-  /* Note: label is before barrier so that in cmp failure case we still get
-     a barrier to stop subsequent loads floating upwards past the ldrex
-     PR target/48126.  */
-  arm_output_asm_insn (emit, 1, operands, "%sLSYB%%=:", LOCAL_LABEL_PREFIX);
-  arm_process_output_memory_barrier (emit, NULL);
-}
+  arm_emit_load_exclusive (mode, rval, mem);
  
-static rtx
-arm_get_sync_operand (rtx *operands, int index, rtx default_value)
-{
-  if (index > 0)
-    default_value = operands[index - 1];
+  cond = arm_gen_compare_reg (NE, rval, oldval, scratch);
+  x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+                           gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
+  emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
  
-  return default_value;
-}
+  arm_emit_store_exclusive (mode, scratch, mem, newval);
  
-#define FETCH_SYNC_OPERAND(NAME, DEFAULT) \
-  arm_get_sync_operand (operands, (int) get_attr_sync_##NAME (insn), DEFAULT);
+  /* Weak or strong, we want EQ to be true for success, so that we
+     match the flags that we got from the compare above.  */
+  cond = gen_rtx_REG (CCmode, CC_REGNUM);
+  x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
+  emit_insn (gen_rtx_SET (VOIDmode, cond, x));
  
-/* Extract the operands for a synchroniztion instruction from the
-   instructions attributes and emit the instruction.  */
-static void
-arm_process_output_sync_insn (emit_f emit, rtx insn, rtx *operands)
-{
-  rtx result, memory, required_value, new_value, t1, t2;
-  int early_barrier;
-  enum machine_mode mode;
-  enum attr_sync_op sync_op;
+  if (!is_weak)
+    {
+      x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+      x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+                               gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
+      emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
+    }
  
-  result = FETCH_SYNC_OPERAND(result, 0);
-  memory = FETCH_SYNC_OPERAND(memory, 0);
-  required_value = FETCH_SYNC_OPERAND(required_value, 0);
-  new_value = FETCH_SYNC_OPERAND(new_value, 0);
-  t1 = FETCH_SYNC_OPERAND(t1, 0);
-  t2 = FETCH_SYNC_OPERAND(t2, 0);
-  early_barrier =
-    get_attr_sync_release_barrier (insn) == SYNC_RELEASE_BARRIER_YES;
-  sync_op = get_attr_sync_op (insn);
-  mode = GET_MODE (memory);
+  if (mod_f != MEMMODEL_RELAXED)
+    emit_label (label2);
  
-  arm_output_sync_loop (emit, mode, result, memory, required_value,
-                       new_value, t1, t2, sync_op, early_barrier);
-}
+  arm_post_atomic_barrier (mod_s);
  
-/* Emit a synchronization instruction loop.  */
-const char *
-arm_output_sync_insn (rtx insn, rtx *operands)
-{
-  arm_process_output_sync_insn (arm_emit, insn, operands);
-  return "";
+  if (mod_f == MEMMODEL_RELAXED)
+    emit_label (label2);
  }
  
-/* Count the number of machine instruction that will be emitted for a
-   synchronization instruction.  Note that the emitter used does not
-   emit instructions, it just counts instructions being carefull not
-   to count labels.  */
-unsigned int
-arm_sync_loop_insns (rtx insn, rtx *operands)
+void
+arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
+                    rtx value, rtx model_rtx, rtx cond)
  {
-  arm_insn_count = 0;
-  arm_process_output_sync_insn (arm_count, insn, operands);
-  return arm_insn_count;
-}
+  enum memmodel model = (enum memmodel) INTVAL (model_rtx);
+  enum machine_mode mode = GET_MODE (mem);
+  enum machine_mode wmode = (mode == DImode ? DImode : SImode);
+  rtx label, x;
  
-/* Helper to call a target sync instruction generator, dealing with
-   the variation in operands required by the different generators.  */
-static rtx
-arm_call_generator (struct arm_sync_generator *generator, rtx old_value,
-                   rtx memory, rtx required_value, rtx new_value)
-{
-  switch (generator->op)
+  arm_pre_atomic_barrier (model);
+
+  label = gen_label_rtx ();
+  emit_label (label);
+
+  if (new_out)
+    new_out = gen_lowpart (wmode, new_out);
+  if (old_out)
+    old_out = gen_lowpart (wmode, old_out);
+  else
+    old_out = new_out;
+  value = simplify_gen_subreg (wmode, value, mode, 0);
+
+  arm_emit_load_exclusive (mode, old_out, mem);
+
+  switch (code)
      {
-    case arm_sync_generator_omn:
-      gcc_assert (! required_value);
-      return generator->u.omn (old_value, memory, new_value);
+    case SET:
+      new_out = value;
+      break;
  
-    case arm_sync_generator_omrn:
-      gcc_assert (required_value);
-      return generator->u.omrn (old_value, memory, required_value, new_value);
+    case NOT:
+      x = gen_rtx_AND (wmode, old_out, value);
+      emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
+      x = gen_rtx_NOT (wmode, new_out);
+      emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
+      break;
+
+    case MINUS:
+      if (CONST_INT_P (value))
+       {
+         value = GEN_INT (-INTVAL (value));
+         code = PLUS;
+       }
+      /* FALLTHRU */
+
+    case PLUS:
+      if (mode == DImode)
+       {
+         /* DImode plus/minus need to clobber flags.  */
+         /* The adddi3 and subdi3 patterns are incorrectly written so that
+            they require matching operands, even when we could easily support
+            three operands.  Thankfully, this can be fixed up post-splitting,
+            as the individual add+adc patterns do accept three operands and
+            post-reload cprop can make these moves go away.  */
+         emit_move_insn (new_out, old_out);
+         if (code == PLUS)
+           x = gen_adddi3 (new_out, new_out, value);
+         else
+           x = gen_subdi3 (new_out, new_out, value);
+         emit_insn (x);
+         break;
+       }
+      /* FALLTHRU */
+
+    default:
+      x = gen_rtx_fmt_ee (code, wmode, old_out, value);
+      emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
+      break;
      }
  
-  return NULL;
+  arm_emit_store_exclusive (mode, cond, mem, gen_lowpart (mode, new_out));
+
+  x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+  emit_unlikely_jump (gen_cbranchsi4 (x, cond, const0_rtx, label));
+
+  arm_post_atomic_barrier (model);
  }
+\f
+#define MAX_VECT_LEN 16
  
-/* Expand a synchronization loop. The synchronization loop is expanded
-   as an opaque block of instructions in order to ensure that we do
-   not subsequently get extraneous memory accesses inserted within the
-   critical region. The exclusive access property of ldrex/strex is
-   only guaranteed in there are no intervening memory accesses. */
-void
-arm_expand_sync (enum machine_mode mode,
-                struct arm_sync_generator *generator,
-                rtx target, rtx memory, rtx required_value, rtx new_value)
+struct expand_vec_perm_d
  {
-  if (target == NULL)
-    target = gen_reg_rtx (mode);
+  rtx target, op0, op1;
+  unsigned char perm[MAX_VECT_LEN];
+  enum machine_mode vmode;
+  unsigned char nelt;
+  bool one_vector_p;
+  bool testing_p;
+};
  
-  memory = arm_legitimize_sync_memory (memory);
-  if (mode != SImode && mode != DImode)
-    {
-      rtx load_temp = gen_reg_rtx (SImode);
+/* Generate a variable permutation.  */
+
+static void
+arm_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
+{
+  enum machine_mode vmode = GET_MODE (target);
+  bool one_vector_p = rtx_equal_p (op0, op1);
  
-      if (required_value)
-       required_value = convert_modes (SImode, mode, required_value, true);
+  gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
+  gcc_checking_assert (GET_MODE (op0) == vmode);
+  gcc_checking_assert (GET_MODE (op1) == vmode);
+  gcc_checking_assert (GET_MODE (sel) == vmode);
+  gcc_checking_assert (TARGET_NEON);
  
-      new_value = convert_modes (SImode, mode, new_value, true);
-      emit_insn (arm_call_generator (generator, load_temp, memory,
-                                    required_value, new_value));
-      emit_move_insn (target, gen_lowpart (mode, load_temp));
+  if (one_vector_p)
+    {
+      if (vmode == V8QImode)
+       emit_insn (gen_neon_vtbl1v8qi (target, op0, sel));
+      else
+       emit_insn (gen_neon_vtbl1v16qi (target, op0, sel));
      }
    else
      {
-      emit_insn (arm_call_generator (generator, target, memory, required_value,
-                                    new_value));
+      rtx pair;
+
+      if (vmode == V8QImode)
+       {
+         pair = gen_reg_rtx (V16QImode);
+         emit_insn (gen_neon_vcombinev8qi (pair, op0, op1));
+         pair = gen_lowpart (TImode, pair);
+         emit_insn (gen_neon_vtbl2v8qi (target, pair, sel));
+       }
+      else
+       {
+         pair = gen_reg_rtx (OImode);
+         emit_insn (gen_neon_vcombinev16qi (pair, op0, op1));
+         emit_insn (gen_neon_vtbl2v16qi (target, pair, sel));
+       }
      }
  }
  
-static unsigned int
-arm_autovectorize_vector_sizes (void)
+void
+arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
  {
-  return TARGET_NEON_VECTORIZE_DOUBLE ? 0 : (16 | 8);
-}
+  enum machine_mode vmode = GET_MODE (target);
+  unsigned int i, nelt = GET_MODE_NUNITS (vmode);
+  bool one_vector_p = rtx_equal_p (op0, op1);
+  rtx rmask[MAX_VECT_LEN], mask;
  
-static bool
-arm_vector_alignment_reachable (const_tree type, bool is_packed)
-{
-  /* Vectors which aren't in packed structures will not be less aligned than
-     the natural alignment of their element type, so this is safe.  */
-  if (TARGET_NEON && !BYTES_BIG_ENDIAN)
-    return !is_packed;
+  /* TODO: ARM's VTBL indexing is little-endian.  In order to handle GCC's
+     numbering of elements for big-endian, we must reverse the order.  */
+  gcc_checking_assert (!BYTES_BIG_ENDIAN);
  
-  return default_builtin_vector_alignment_reachable (type, is_packed);
+  /* The VTBL instruction does not use a modulo index, so we must take care
+     of that ourselves.  */
+  mask = GEN_INT (one_vector_p ? nelt - 1 : 2 * nelt - 1);
+  for (i = 0; i < nelt; ++i)
+    rmask[i] = mask;
+  mask = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rmask));
+  sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
+
+  arm_expand_vec_perm_1 (target, op0, op1, sel);
  }
  
+/* Generate or test for an insn that supports a constant permutation.  */
+
+/* Recognize patterns for the VUZP insns.  */
+
  static bool
-arm_builtin_support_vector_misalignment (enum machine_mode mode,
-                                        const_tree type, int misalignment,
-                                        bool is_packed)
+arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
  {
-  if (TARGET_NEON && !BYTES_BIG_ENDIAN)
+  unsigned int i, odd, mask, nelt = d->nelt;
+  rtx out0, out1, in0, in1, x;
+  rtx (*gen)(rtx, rtx, rtx, rtx);
+
+  if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
+    return false;
+
+  /* Note that these are little-endian tests.  Adjust for big-endian later.  */
+  if (d->perm[0] == 0)
+    odd = 0;
+  else if (d->perm[0] == 1)
+    odd = 1;
+  else
+    return false;
+  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
+
+  for (i = 0; i < nelt; i++)
      {
-      HOST_WIDE_INT align = TYPE_ALIGN_UNIT (type);
+      unsigned elt = (i * 2 + odd) & mask;
+      if (d->perm[i] != elt)
+       return false;
+    }
  
-      if (is_packed)
-        return align == 1;
+  /* Success!  */
+  if (d->testing_p)
+    return true;
  
-      /* If the misalignment is unknown, we should be able to handle the access
-        so long as it is not to a member of a packed data structure.  */
-      if (misalignment == -1)
-        return true;
+  switch (d->vmode)
+    {
+    case V16QImode: gen = gen_neon_vuzpv16qi_internal; break;
+    case V8QImode:  gen = gen_neon_vuzpv8qi_internal;  break;
+    case V8HImode:  gen = gen_neon_vuzpv8hi_internal;  break;
+    case V4HImode:  gen = gen_neon_vuzpv4hi_internal;  break;
+    case V4SImode:  gen = gen_neon_vuzpv4si_internal;  break;
+    case V2SImode:  gen = gen_neon_vuzpv2si_internal;  break;
+    case V2SFmode:  gen = gen_neon_vuzpv2sf_internal;  break;
+    case V4SFmode:  gen = gen_neon_vuzpv4sf_internal;  break;
+    default:
+      gcc_unreachable ();
+    }
  
-      /* Return true if the misalignment is a multiple of the natural alignment
-         of the vector's element type.  This is probably always going to be
-        true in practice, since we've already established that this isn't a
-        packed access.  */
-      return ((misalignment % align) == 0);
+  in0 = d->op0;
+  in1 = d->op1;
+  if (BYTES_BIG_ENDIAN)
+    {
+      x = in0, in0 = in1, in1 = x;
+      odd = !odd;
      }
  
-  return default_builtin_support_vector_misalignment (mode, type, misalignment,
-                                                     is_packed);
+  out0 = d->target;
+  out1 = gen_reg_rtx (d->vmode);
+  if (odd)
+    x = out0, out0 = out1, out1 = x;
+
+  emit_insn (gen (out0, in0, in1, out1));
+  return true;
  }
  
-static void
-arm_conditional_register_usage (void)
+/* Recognize patterns for the VZIP insns.  */
+
+static bool
+arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
  {
-  int regno;
+  unsigned int i, high, mask, nelt = d->nelt;
+  rtx out0, out1, in0, in1, x;
+  rtx (*gen)(rtx, rtx, rtx, rtx);
  
-  if (TARGET_SOFT_FLOAT || TARGET_THUMB1 || !TARGET_FPA)
+  if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
+    return false;
+
+  /* Note that these are little-endian tests.  Adjust for big-endian later.  */
+  high = nelt / 2;
+  if (d->perm[0] == high)
+    ;
+  else if (d->perm[0] == 0)
+    high = 0;
+  else
+    return false;
+  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
+
+  for (i = 0; i < nelt / 2; i++)
      {
-      for (regno = FIRST_FPA_REGNUM;
-          regno <= LAST_FPA_REGNUM; ++regno)
-       fixed_regs[regno] = call_used_regs[regno] = 1;
+      unsigned elt = (i + high) & mask;
+      if (d->perm[i * 2] != elt)
+       return false;
+      elt = (elt + nelt) & mask;
+      if (d->perm[i * 2 + 1] != elt)
+       return false;
      }
  
-  if (TARGET_THUMB1 && optimize_size)
+  /* Success!  */
+  if (d->testing_p)
+    return true;
+
+  switch (d->vmode)
      {
-      /* When optimizing for size on Thumb-1, it's better not
-        to use the HI regs, because of the overhead of
-        stacking them.  */
-      for (regno = FIRST_HI_REGNUM;
-          regno <= LAST_HI_REGNUM; ++regno)
-       fixed_regs[regno] = call_used_regs[regno] = 1;
+    case V16QImode: gen = gen_neon_vzipv16qi_internal; break;
+    case V8QImode:  gen = gen_neon_vzipv8qi_internal;  break;
+    case V8HImode:  gen = gen_neon_vzipv8hi_internal;  break;
+    case V4HImode:  gen = gen_neon_vzipv4hi_internal;  break;
+    case V4SImode:  gen = gen_neon_vzipv4si_internal;  break;
+    case V2SImode:  gen = gen_neon_vzipv2si_internal;  break;
+    case V2SFmode:  gen = gen_neon_vzipv2sf_internal;  break;
+    case V4SFmode:  gen = gen_neon_vzipv4sf_internal;  break;
+    default:
+      gcc_unreachable ();
      }
  
-  /* The link register can be clobbered by any branch insn,
-     but we have no way to track that at present, so mark
-     it as unavailable.  */
-  if (TARGET_THUMB1)
-    fixed_regs[LR_REGNUM] = call_used_regs[LR_REGNUM] = 1;
+  in0 = d->op0;
+  in1 = d->op1;
+  if (BYTES_BIG_ENDIAN)
+    {
+      x = in0, in0 = in1, in1 = x;
+      high = !high;
+    }
  
-  if (TARGET_32BIT && TARGET_HARD_FLOAT)
+  out0 = d->target;
+  out1 = gen_reg_rtx (d->vmode);
+  if (high)
+    x = out0, out0 = out1, out1 = x;
+
+  emit_insn (gen (out0, in0, in1, out1));
+  return true;
+}
+
+/* Recognize patterns for the VREV insns.  */
+
+static bool
+arm_evpc_neon_vrev (struct expand_vec_perm_d *d)
+{
+  unsigned int i, j, diff, nelt = d->nelt;
+  rtx (*gen)(rtx, rtx, rtx);
+
+  if (!d->one_vector_p)
+    return false;
+
+  diff = d->perm[0];
+  switch (diff)
      {
-      if (TARGET_MAVERICK)
+    case 7:
+      switch (d->vmode)
         {
-         for (regno = FIRST_FPA_REGNUM;
-              regno <= LAST_FPA_REGNUM; ++ regno)
-           fixed_regs[regno] = call_used_regs[regno] = 1;
-         for (regno = FIRST_CIRRUS_FP_REGNUM;
-              regno <= LAST_CIRRUS_FP_REGNUM; ++ regno)
-           {
-             fixed_regs[regno] = 0;
-             call_used_regs[regno] = regno < FIRST_CIRRUS_FP_REGNUM + 4;
-           }
+       case V16QImode: gen = gen_neon_vrev64v16qi; break;
+       case V8QImode:  gen = gen_neon_vrev64v8qi;  break;
+       default:
+         return false;
         }
-      if (TARGET_VFP)
+      break;
+    case 3:
+      switch (d->vmode)
         {
-         /* VFPv3 registers are disabled when earlier VFP
-            versions are selected due to the definition of
-            LAST_VFP_REGNUM.  */
-         for (regno = FIRST_VFP_REGNUM;
-              regno <= LAST_VFP_REGNUM; ++ regno)
-           {
-             fixed_regs[regno] = 0;
-             call_used_regs[regno] = regno < FIRST_VFP_REGNUM + 16
-               || regno >= FIRST_VFP_REGNUM + 32;
-           }
+       case V16QImode: gen = gen_neon_vrev32v16qi; break;
+       case V8QImode:  gen = gen_neon_vrev32v8qi;  break;
+       case V8HImode:  gen = gen_neon_vrev64v8hi;  break;
+       case V4HImode:  gen = gen_neon_vrev64v4hi;  break;
+       default:
+         return false;
         }
-    }
-
-  if (TARGET_REALLY_IWMMXT)
-    {
-      regno = FIRST_IWMMXT_GR_REGNUM;
-      /* The 2002/10/09 revision of the XScale ABI has wCG0
-         and wCG1 as call-preserved registers.  The 2002/11/21
-         revision changed this so that all wCG registers are
-         scratch registers.  */
-      for (regno = FIRST_IWMMXT_GR_REGNUM;
-          regno <= LAST_IWMMXT_GR_REGNUM; ++ regno)
-       fixed_regs[regno] = 0;
-      /* The XScale ABI has wR0 - wR9 as scratch registers,
-        the rest as call-preserved registers.  */
-      for (regno = FIRST_IWMMXT_REGNUM;
-          regno <= LAST_IWMMXT_REGNUM; ++ regno)
-       {
-         fixed_regs[regno] = 0;
-         call_used_regs[regno] = regno < FIRST_IWMMXT_REGNUM + 10;
+      break;
+    case 1:
+      switch (d->vmode)
+       {
+       case V16QImode: gen = gen_neon_vrev16v16qi; break;
+       case V8QImode:  gen = gen_neon_vrev16v8qi;  break;
+       case V8HImode:  gen = gen_neon_vrev32v8hi;  break;
+       case V4HImode:  gen = gen_neon_vrev32v4hi;  break;
+       case V4SImode:  gen = gen_neon_vrev64v4si;  break;
+       case V2SImode:  gen = gen_neon_vrev64v2si;  break;
+       case V4SFmode:  gen = gen_neon_vrev64v4sf;  break;
+       case V2SFmode:  gen = gen_neon_vrev64v2sf;  break;
+       default:
+         return false;
         }
+      break;
+    default:
+      return false;
      }
  
-  if ((unsigned) PIC_OFFSET_TABLE_REGNUM != INVALID_REGNUM)
+  for (i = 0; i < nelt; i += diff)
+    for (j = 0; j <= diff; j += 1)
+      if (d->perm[i + j] != i + diff - j)
+       return false;
+
+  /* Success! */
+  if (d->testing_p)
+    return true;
+
+  /* ??? The third operand is an artifact of the builtin infrastructure
+     and is ignored by the actual instruction.  */
+  emit_insn (gen (d->target, d->op0, const0_rtx));
+  return true;
+}
+
+/* Recognize patterns for the VTRN insns.  */
+
+static bool
+arm_evpc_neon_vtrn (struct expand_vec_perm_d *d)
+{
+  unsigned int i, odd, mask, nelt = d->nelt;
+  rtx out0, out1, in0, in1, x;
+  rtx (*gen)(rtx, rtx, rtx, rtx);
+
+  if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
+    return false;
+
+  /* Note that these are little-endian tests.  Adjust for big-endian later.  */
+  if (d->perm[0] == 0)
+    odd = 0;
+  else if (d->perm[0] == 1)
+    odd = 1;
+  else
+    return false;
+  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
+
+  for (i = 0; i < nelt; i += 2)
      {
-      fixed_regs[PIC_OFFSET_TABLE_REGNUM] = 1;
-      call_used_regs[PIC_OFFSET_TABLE_REGNUM] = 1;
+      if (d->perm[i] != i + odd)
+       return false;
+      if (d->perm[i + 1] != ((i + nelt + odd) & mask))
+       return false;
      }
-  else if (TARGET_APCS_STACK)
+
+  /* Success!  */
+  if (d->testing_p)
+    return true;
+
+  switch (d->vmode)
      {
-      fixed_regs[10]     = 1;
-      call_used_regs[10] = 1;
+    case V16QImode: gen = gen_neon_vtrnv16qi_internal; break;
+    case V8QImode:  gen = gen_neon_vtrnv8qi_internal;  break;
+    case V8HImode:  gen = gen_neon_vtrnv8hi_internal;  break;
+    case V4HImode:  gen = gen_neon_vtrnv4hi_internal;  break;
+    case V4SImode:  gen = gen_neon_vtrnv4si_internal;  break;
+    case V2SImode:  gen = gen_neon_vtrnv2si_internal;  break;
+    case V2SFmode:  gen = gen_neon_vtrnv2sf_internal;  break;
+    case V4SFmode:  gen = gen_neon_vtrnv4sf_internal;  break;
+    default:
+      gcc_unreachable ();
      }
-  /* -mcaller-super-interworking reserves r11 for calls to
-     _interwork_r11_call_via_rN().  Making the register global
-     is an easy way of ensuring that it remains valid for all
-     calls.  */
-  if (TARGET_APCS_FRAME || TARGET_CALLER_INTERWORKING
-      || TARGET_TPCS_FRAME || TARGET_TPCS_LEAF_FRAME)
+
+  in0 = d->op0;
+  in1 = d->op1;
+  if (BYTES_BIG_ENDIAN)
      {
-      fixed_regs[ARM_HARD_FRAME_POINTER_REGNUM] = 1;
-      call_used_regs[ARM_HARD_FRAME_POINTER_REGNUM] = 1;
-      if (TARGET_CALLER_INTERWORKING)
-       global_regs[ARM_HARD_FRAME_POINTER_REGNUM] = 1;
+      x = in0, in0 = in1, in1 = x;
+      odd = !odd;
      }
-  SUBTARGET_CONDITIONAL_REGISTER_USAGE
+
+  out0 = d->target;
+  out1 = gen_reg_rtx (d->vmode);
+  if (odd)
+    x = out0, out0 = out1, out1 = x;
+
+  emit_insn (gen (out0, in0, in1, out1));
+  return true;
  }
  
-static reg_class_t
-arm_preferred_rename_class (reg_class_t rclass)
+/* The NEON VTBL instruction is a fully variable permuation that's even
+   stronger than what we expose via VEC_PERM_EXPR.  What it doesn't do
+   is mask the index operand as VEC_PERM_EXPR requires.  Therefore we
+   can do slightly better by expanding this as a constant where we don't
+   have to apply a mask.  */
+
+static bool
+arm_evpc_neon_vtbl (struct expand_vec_perm_d *d)
  {
-  /* Thumb-2 instructions using LO_REGS may be smaller than instructions
-     using GENERIC_REGS.  During register rename pass, we prefer LO_REGS,
-     and code size can be reduced.  */
-  if (TARGET_THUMB2 && rclass == GENERAL_REGS)
-    return LO_REGS;
-  else
-    return NO_REGS;
+  rtx rperm[MAX_VECT_LEN], sel;
+  enum machine_mode vmode = d->vmode;
+  unsigned int i, nelt = d->nelt;
+
+  /* TODO: ARM's VTBL indexing is little-endian.  In order to handle GCC's
+     numbering of elements for big-endian, we must reverse the order.  */
+  if (BYTES_BIG_ENDIAN)
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  /* Generic code will try constant permutation twice.  Once with the
+     original mode and again with the elements lowered to QImode.
+     So wait and don't do the selector expansion ourselves.  */
+  if (vmode != V8QImode && vmode != V16QImode)
+    return false;
+
+  for (i = 0; i < nelt; ++i)
+    rperm[i] = GEN_INT (d->perm[i]);
+  sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
+  sel = force_reg (vmode, sel);
+
+  arm_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
+  return true;
  }
  
-/* Compute the atrribute "length" of insn "*push_multi".
-   So this function MUST be kept in sync with that insn pattern.  */
-int
-arm_attr_length_push_multi(rtx parallel_op, rtx first_op)
+static bool
+arm_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
  {
-  int i, regno, hi_reg;
-  int num_saves = XVECLEN (parallel_op, 0);
+  /* The pattern matching functions above are written to look for a small
+     number to begin the sequence (0, 1, N/2).  If we begin with an index
+     from the second operand, we can swap the operands.  */
+  if (d->perm[0] >= d->nelt)
+    {
+      unsigned i, nelt = d->nelt;
+      rtx x;
  
-  /* ARM mode.  */
-  if (TARGET_ARM)
-    return 4;
-  /* Thumb1 mode.  */
-  if (TARGET_THUMB1)
-    return 2;
+      for (i = 0; i < nelt; ++i)
+       d->perm[i] = (d->perm[i] + nelt) & (2 * nelt - 1);
  
-  /* Thumb2 mode.  */
-  regno = REGNO (first_op);
-  hi_reg = (REGNO_REG_CLASS (regno) == HI_REGS) && (regno != LR_REGNUM);
-  for (i = 1; i < num_saves && !hi_reg; i++)
+      x = d->op0;
+      d->op0 = d->op1;
+      d->op1 = x;
+    }
+
+  if (TARGET_NEON)
      {
-      regno = REGNO (XEXP (XVECEXP (parallel_op, 0, i), 0));
-      hi_reg |= (REGNO_REG_CLASS (regno) == HI_REGS) && (regno != LR_REGNUM);
+      if (arm_evpc_neon_vuzp (d))
+       return true;
+      if (arm_evpc_neon_vzip (d))
+       return true;
+      if (arm_evpc_neon_vrev (d))
+       return true;
+      if (arm_evpc_neon_vtrn (d))
+       return true;
+      return arm_evpc_neon_vtbl (d);
      }
+  return false;
+}
  
-  if (!hi_reg)
-    return 2;
-  return 4;
+/* Expand a vec_perm_const pattern.  */
+
+bool
+arm_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
+{
+  struct expand_vec_perm_d d;
+  int i, nelt, which;
+
+  d.target = target;
+  d.op0 = op0;
+  d.op1 = op1;
+
+  d.vmode = GET_MODE (target);
+  gcc_assert (VECTOR_MODE_P (d.vmode));
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = false;
+
+  for (i = which = 0; i < nelt; ++i)
+    {
+      rtx e = XVECEXP (sel, 0, i);
+      int ei = INTVAL (e) & (2 * nelt - 1);
+      which |= (ei < nelt ? 1 : 2);
+      d.perm[i] = ei;
+    }
+
+  switch (which)
+    {
+    default:
+      gcc_unreachable();
+
+    case 3:
+      d.one_vector_p = false;
+      if (!rtx_equal_p (op0, op1))
+       break;
+
+      /* The elements of PERM do not suggest that only the first operand
+        is used, but both operands are identical.  Allow easier matching
+        of the permutation by folding the permutation into the single
+        input vector.  */
+      /* FALLTHRU */
+    case 2:
+      for (i = 0; i < nelt; ++i)
+        d.perm[i] &= nelt - 1;
+      d.op0 = op1;
+      d.one_vector_p = true;
+      break;
+
+    case 1:
+      d.op1 = op0;
+      d.one_vector_p = true;
+      break;
+    }
+
+  return arm_expand_vec_perm_const_1 (&d);
  }
  
-/* Compute the number of instructions emitted by output_move_double.  */
-int
-arm_count_output_move_double_insns (rtx *operands)
+/* Implement TARGET_VECTORIZE_VEC_PERM_CONST_OK.  */
+
+static bool
+arm_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+                                const unsigned char *sel)
  {
-  int count;
-  output_move_double (operands, false, &count);
-  return count;
+  struct expand_vec_perm_d d;
+  unsigned int i, nelt, which;
+  bool ret;
+
+  d.vmode = vmode;
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = true;
+  memcpy (d.perm, sel, nelt);
+
+  /* Categorize the set of elements in the selector.  */
+  for (i = which = 0; i < nelt; ++i)
+    {
+      unsigned char e = d.perm[i];
+      gcc_assert (e < 2 * nelt);
+      which |= (e < nelt ? 1 : 2);
+    }
+
+  /* For all elements from second vector, fold the elements to first.  */
+  if (which == 2)
+    for (i = 0; i < nelt; ++i)
+      d.perm[i] -= nelt;
+
+  /* Check whether the mask can be applied to the vector type.  */
+  d.one_vector_p = (which != 3);
+
+  d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
+  d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
+  if (!d.one_vector_p)
+    d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
+
+  start_sequence ();
+  ret = arm_expand_vec_perm_const_1 (&d);
+  end_sequence ();
+
+  return ret;
  }
  
+\f
  #include "gt-arm.h"