/* Output routines for GCC for ARM.
Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
- 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+ 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
Free Software Foundation, Inc.
Contributed by Pieter `Tiggr' Schoenmakers (rcpieter@win.tue.nl)
and Martin Simmons (@harleqn.co.uk).
#include "obstack.h"
#include "regs.h"
#include "hard-reg-set.h"
-#include "real.h"
#include "insn-config.h"
#include "conditions.h"
#include "output.h"
#include "langhooks.h"
#include "df.h"
#include "intl.h"
+#include "libfuncs.h"
/* Forward definitions of types. */
typedef struct minipool_node Mnode;
const_tree, int);
static bool arm_return_in_memory (const_tree, const_tree);
static rtx arm_function_value (const_tree, const_tree, bool);
-static rtx arm_libcall_value (enum machine_mode, rtx);
+static rtx arm_libcall_value (enum machine_mode, const_rtx);
static void arm_internal_label (FILE *, const char *, unsigned long);
static void arm_output_mi_thunk (FILE *, tree, HOST_WIDE_INT, HOST_WIDE_INT,
tree);
+static bool arm_have_conditional_execution (void);
static bool arm_rtx_costs_1 (rtx, enum rtx_code, int*, bool);
static bool arm_size_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *);
static bool arm_slowmul_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
static bool arm_cirrus_insn_p (rtx);
static void cirrus_reorg (rtx);
static void arm_init_builtins (void);
-static rtx arm_expand_builtin (tree, rtx, rtx, enum machine_mode, int);
static void arm_init_iwmmxt_builtins (void);
static rtx safe_vector_operand (rtx, enum machine_mode);
static rtx arm_expand_binop_builtin (enum insn_code, tree, rtx);
static bool arm_output_ttype (rtx);
#endif
static void arm_dwarf_handle_frame_unspec (const char *, rtx, int);
+static rtx arm_dwarf_register_span (rtx);
static tree arm_cxx_guard_type (void);
static bool arm_cxx_guard_mask_bit (void);
static tree arm_convert_to_type (tree type, tree expr);
static bool arm_scalar_mode_supported_p (enum machine_mode);
static bool arm_frame_pointer_required (void);
+static bool arm_can_eliminate (const int, const int);
+static void arm_asm_trampoline_template (FILE *);
+static void arm_trampoline_init (rtx, tree, rtx);
+static rtx arm_trampoline_adjust_address (rtx);
+static rtx arm_pic_static_addr (rtx orig, rtx reg);
\f
/* Table of machine attributes. */
#undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS
#define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS arm_allocate_stack_slots_for_args
+#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
+#define TARGET_ASM_TRAMPOLINE_TEMPLATE arm_asm_trampoline_template
+#undef TARGET_TRAMPOLINE_INIT
+#define TARGET_TRAMPOLINE_INIT arm_trampoline_init
+#undef TARGET_TRAMPOLINE_ADJUST_ADDRESS
+#define TARGET_TRAMPOLINE_ADJUST_ADDRESS arm_trampoline_adjust_address
+
#undef TARGET_DEFAULT_SHORT_ENUMS
#define TARGET_DEFAULT_SHORT_ENUMS arm_default_short_enums
#undef TARGET_DWARF_HANDLE_FRAME_UNSPEC
#define TARGET_DWARF_HANDLE_FRAME_UNSPEC arm_dwarf_handle_frame_unspec
+#undef TARGET_DWARF_REGISTER_SPAN
+#define TARGET_DWARF_REGISTER_SPAN arm_dwarf_register_span
+
#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P arm_cannot_copy_insn_p
#define TARGET_HAVE_TLS true
#endif
+#undef TARGET_HAVE_CONDITIONAL_EXECUTION
+#define TARGET_HAVE_CONDITIONAL_EXECUTION arm_have_conditional_execution
+
#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM arm_cannot_force_const_mem
#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED arm_frame_pointer_required
+#undef TARGET_CAN_ELIMINATE
+#define TARGET_CAN_ELIMINATE arm_can_eliminate
+
struct gcc_target targetm = TARGET_INITIALIZER;
\f
/* Obstack for minipool constant handling. */
/* The processor for which instructions should be scheduled. */
enum processor_type arm_tune = arm_none;
+/* The current tuning set. */
+const struct tune_params *current_tune;
+
/* The default processor used if not overridden by commandline. */
static enum processor_type arm_default_cpu = arm_none;
-/* Which floating point model to use. */
-enum arm_fp_model arm_fp_model;
-
-/* Which floating point hardware is available. */
-enum fputype arm_fpu_arch;
-
/* Which floating point hardware to schedule for. */
-enum fputype arm_fpu_tune;
+int arm_fpu_attr;
+
+/* Which floating point hardware to use.  */
+const struct arm_fpu_desc *arm_fpu_desc;
/* Whether to use floating point hardware. */
enum float_abi_type arm_float_abi;
#define FL_DIV (1 << 18) /* Hardware divide. */
#define FL_VFPV3 (1 << 19) /* Vector Floating Point V3. */
#define FL_NEON (1 << 20) /* Neon instructions. */
+#define FL_ARCH7EM (1 << 21) /* Instructions present in the ARMv7E-M
+ architecture. */
#define FL_IWMMXT (1 << 29) /* XScale v2 or "Intel Wireless MMX technology". */
#define FL_FOR_ARCH6T2 (FL_FOR_ARCH6 | FL_THUMB2)
#define FL_FOR_ARCH6M (FL_FOR_ARCH6 & ~FL_NOTM)
#define FL_FOR_ARCH7 (FL_FOR_ARCH6T2 &~ FL_NOTM)
-#define FL_FOR_ARCH7A (FL_FOR_ARCH7 | FL_NOTM)
+#define FL_FOR_ARCH7A (FL_FOR_ARCH7 | FL_NOTM | FL_ARCH6K)
#define FL_FOR_ARCH7R (FL_FOR_ARCH7A | FL_DIV)
#define FL_FOR_ARCH7M (FL_FOR_ARCH7 | FL_DIV)
+#define FL_FOR_ARCH7EM (FL_FOR_ARCH7M | FL_ARCH7EM)
/* The bits in this mask specify which
instructions we are allowed to generate. */
/* Nonzero if instructions not present in the 'M' profile can be used. */
int arm_arch_notm = 0;
+/* Nonzero if instructions present in ARMv7E-M can be used. */
+int arm_arch7em = 0;
+
/* Nonzero if this chip can benefit from load scheduling. */
int arm_ld_sched = 0;
the next function. */
static int after_arm_reorg = 0;
-/* The maximum number of insns to be used when loading a constant. */
-static int arm_constant_limit = 3;
-
static enum arm_pcs arm_pcs_default;
/* For an explanation of these variables, see final_prescan_insn below. */
enum processor_type core;
const char *arch;
const unsigned long flags;
- bool (* rtx_costs) (rtx, enum rtx_code, enum rtx_code, int *, bool);
+ const struct tune_params *const tune;
+};
+
+const struct tune_params arm_slowmul_tune =
+{
+ arm_slowmul_rtx_costs,
+ 3
+};
+
+const struct tune_params arm_fastmul_tune =
+{
+ arm_fastmul_rtx_costs,
+ 1
+};
+
+const struct tune_params arm_xscale_tune =
+{
+ arm_xscale_rtx_costs,
+ 2
+};
+
+const struct tune_params arm_9e_tune =
+{
+ arm_9e_rtx_costs,
+ 1
};
/* Not all of these give usefully different compilation alternatives,
{
/* ARM Cores */
#define ARM_CORE(NAME, IDENT, ARCH, FLAGS, COSTS) \
- {NAME, arm_none, #ARCH, FLAGS | FL_FOR_ARCH##ARCH, arm_##COSTS##_rtx_costs},
+ {NAME, arm_none, #ARCH, FLAGS | FL_FOR_ARCH##ARCH, &arm_##COSTS##_tune},
#include "arm-cores.def"
#undef ARM_CORE
{NULL, arm_none, NULL, 0, NULL}
static const struct processors all_architectures[] =
{
/* ARM Architectures */
- /* We don't specify rtx_costs here as it will be figured out
+ /* We don't specify tuning costs here as it will be figured out
from the core. */
{"armv2", arm2, "2", FL_CO_PROC | FL_MODE26 | FL_FOR_ARCH2, NULL},
{"armv7-a", cortexa8, "7A", FL_CO_PROC | FL_FOR_ARCH7A, NULL},
{"armv7-r", cortexr4, "7R", FL_CO_PROC | FL_FOR_ARCH7R, NULL},
{"armv7-m", cortexm3, "7M", FL_CO_PROC | FL_FOR_ARCH7M, NULL},
+ {"armv7e-m", cortexm3, "7EM", FL_CO_PROC | FL_FOR_ARCH7EM, NULL},
{"ep9312", ep9312, "4T", FL_LDSCHED | FL_CIRRUS | FL_FOR_ARCH4, NULL},
{"iwmmxt", iwmmxt, "5TE", FL_LDSCHED | FL_STRONG | FL_FOR_ARCH5TE | FL_XSCALE | FL_IWMMXT , NULL},
{"iwmmxt2", iwmmxt2, "5TE", FL_LDSCHED | FL_STRONG | FL_FOR_ARCH5TE | FL_XSCALE | FL_IWMMXT , NULL},
char arm_arch_name[] = "__ARM_ARCH_0UNK__";
-struct fpu_desc
-{
- const char * name;
- enum fputype fpu;
-};
-
-
/* Available values for -mfpu=. */
-static const struct fpu_desc all_fpus[] =
-{
- {"fpa", FPUTYPE_FPA},
- {"fpe2", FPUTYPE_FPA_EMU2},
- {"fpe3", FPUTYPE_FPA_EMU2},
- {"maverick", FPUTYPE_MAVERICK},
- {"vfp", FPUTYPE_VFP},
- {"vfp3", FPUTYPE_VFP3},
- {"vfpv3", FPUTYPE_VFP3},
- {"vfpv3-d16", FPUTYPE_VFP3D16},
- {"neon", FPUTYPE_NEON},
- {"neon-fp16", FPUTYPE_NEON_FP16}
-};
-
-
-/* Floating point models used by the different hardware.
- See fputype in arm.h. */
-
-static const enum arm_fp_model fp_model_for_fpu[] =
-{
- /* No FP hardware. */
- ARM_FP_MODEL_UNKNOWN, /* FPUTYPE_NONE */
- ARM_FP_MODEL_FPA, /* FPUTYPE_FPA */
- ARM_FP_MODEL_FPA, /* FPUTYPE_FPA_EMU2 */
- ARM_FP_MODEL_FPA, /* FPUTYPE_FPA_EMU3 */
- ARM_FP_MODEL_MAVERICK, /* FPUTYPE_MAVERICK */
- ARM_FP_MODEL_VFP, /* FPUTYPE_VFP */
- ARM_FP_MODEL_VFP, /* FPUTYPE_VFP3D16 */
- ARM_FP_MODEL_VFP, /* FPUTYPE_VFP3 */
- ARM_FP_MODEL_VFP, /* FPUTYPE_NEON */
- ARM_FP_MODEL_VFP /* FPUTYPE_NEON_FP16 */
+static const struct arm_fpu_desc all_fpus[] =
+{
+ {"fpa", ARM_FP_MODEL_FPA, 0, VFP_NONE, false, false},
+ {"fpe2", ARM_FP_MODEL_FPA, 2, VFP_NONE, false, false},
+ {"fpe3", ARM_FP_MODEL_FPA, 3, VFP_NONE, false, false},
+ {"maverick", ARM_FP_MODEL_MAVERICK, 0, VFP_NONE, false, false},
+ {"vfp", ARM_FP_MODEL_VFP, 2, VFP_REG_D16, false, false},
+ {"vfpv3", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, false, false},
+ {"vfpv3-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, false, true},
+ {"vfpv3-d16", ARM_FP_MODEL_VFP, 3, VFP_REG_D16, false, false},
+ {"vfpv3-d16-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_D16, false, true},
+ {"vfpv3xd", ARM_FP_MODEL_VFP, 3, VFP_REG_SINGLE, false, false},
+ {"vfpv3xd-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_SINGLE, false, true},
+ {"neon", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, true , false},
+ {"neon-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, true , true },
+ {"vfpv4", ARM_FP_MODEL_VFP, 4, VFP_REG_D32, false, true},
+ {"vfpv4-d16", ARM_FP_MODEL_VFP, 4, VFP_REG_D16, false, true},
+ {"fpv4-sp-d16", ARM_FP_MODEL_VFP, 4, VFP_REG_SINGLE, false, true},
+ {"neon-vfpv4", ARM_FP_MODEL_VFP, 4, VFP_REG_D32, true, true},
+ /* Compatibility aliases. */
+ {"vfp3", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, false, false},
};
TLS_LE32
};
+/* The maximum number of insns to be used when loading a constant. */
+inline static int
+arm_constant_limit (bool size_p)
+{
+ return size_p ? 1 : current_tune->constant_limit;
+}
+
/* Emit an insn that's a simple single-set. Both the operands must be known
to be valid. */
inline static rtx
default:
break;
}
+
+ if (TARGET_AAPCS_BASED)
+ synchronize_libfunc = init_one_libfunc ("__sync_synchronize");
}
/* On AAPCS systems, this is the "struct __va_list". */
gcc_assert (arm_tune != arm_none);
tune_flags = all_cores[(int)arm_tune].flags;
+ current_tune = all_cores[(int)arm_tune].tune;
if (target_fp16_format_name)
{
arm_arch6 = (insn_flags & FL_ARCH6) != 0;
arm_arch6k = (insn_flags & FL_ARCH6K) != 0;
arm_arch_notm = (insn_flags & FL_NOTM) != 0;
+ arm_arch7em = (insn_flags & FL_ARCH7EM) != 0;
arm_arch_thumb2 = (insn_flags & FL_THUMB2) != 0;
arm_arch_xscale = (insn_flags & FL_XSCALE) != 0;
arm_arch_cirrus = (insn_flags & FL_CIRRUS) != 0;
if (TARGET_IWMMXT_ABI && !TARGET_IWMMXT)
error ("iwmmxt abi requires an iwmmxt capable cpu");
- arm_fp_model = ARM_FP_MODEL_UNKNOWN;
if (target_fpu_name == NULL && target_fpe_name != NULL)
{
if (streq (target_fpe_name, "2"))
error ("invalid floating point emulation option: -mfpe=%s",
target_fpe_name);
}
- if (target_fpu_name != NULL)
- {
- /* The user specified a FPU. */
- for (i = 0; i < ARRAY_SIZE (all_fpus); i++)
- {
- if (streq (all_fpus[i].name, target_fpu_name))
- {
- arm_fpu_arch = all_fpus[i].fpu;
- arm_fpu_tune = arm_fpu_arch;
- arm_fp_model = fp_model_for_fpu[arm_fpu_arch];
- break;
- }
- }
- if (arm_fp_model == ARM_FP_MODEL_UNKNOWN)
- error ("invalid floating point option: -mfpu=%s", target_fpu_name);
- }
- else
+
+ if (target_fpu_name == NULL)
{
#ifdef FPUTYPE_DEFAULT
- /* Use the default if it is specified for this platform. */
- arm_fpu_arch = FPUTYPE_DEFAULT;
- arm_fpu_tune = FPUTYPE_DEFAULT;
+ target_fpu_name = FPUTYPE_DEFAULT;
#else
- /* Pick one based on CPU type. */
- /* ??? Some targets assume FPA is the default.
- if ((insn_flags & FL_VFP) != 0)
- arm_fpu_arch = FPUTYPE_VFP;
- else
- */
if (arm_arch_cirrus)
- arm_fpu_arch = FPUTYPE_MAVERICK;
+ target_fpu_name = "maverick";
else
- arm_fpu_arch = FPUTYPE_FPA_EMU2;
+ target_fpu_name = "fpe2";
#endif
- if (tune_flags & FL_CO_PROC && arm_fpu_arch == FPUTYPE_FPA_EMU2)
- arm_fpu_tune = FPUTYPE_FPA;
+ }
+
+ arm_fpu_desc = NULL;
+ for (i = 0; i < ARRAY_SIZE (all_fpus); i++)
+ {
+ if (streq (all_fpus[i].name, target_fpu_name))
+ {
+ arm_fpu_desc = &all_fpus[i];
+ break;
+ }
+ }
+
+ if (!arm_fpu_desc)
+ {
+ error ("invalid floating point option: -mfpu=%s", target_fpu_name);
+ return;
+ }
+
+ switch (arm_fpu_desc->model)
+ {
+ case ARM_FP_MODEL_FPA:
+ if (arm_fpu_desc->rev == 2)
+ arm_fpu_attr = FPU_FPE2;
+ else if (arm_fpu_desc->rev == 3)
+ arm_fpu_attr = FPU_FPE3;
else
- arm_fpu_tune = arm_fpu_arch;
- arm_fp_model = fp_model_for_fpu[arm_fpu_arch];
- gcc_assert (arm_fp_model != ARM_FP_MODEL_UNKNOWN);
+ arm_fpu_attr = FPU_FPA;
+ break;
+
+ case ARM_FP_MODEL_MAVERICK:
+ arm_fpu_attr = FPU_MAVERICK;
+ break;
+
+ case ARM_FP_MODEL_VFP:
+ arm_fpu_attr = FPU_VFP;
+ break;
+
+ default:
+      gcc_unreachable ();
}
if (target_float_abi_name != NULL)
arm_float_abi = TARGET_DEFAULT_FLOAT_ABI;
if (TARGET_AAPCS_BASED
- && (arm_fp_model == ARM_FP_MODEL_FPA))
+ && (arm_fpu_desc->model == ARM_FP_MODEL_FPA))
error ("FPA is unsupported in the AAPCS");
if (TARGET_AAPCS_BASED)
/* If soft-float is specified then don't use FPU. */
if (TARGET_SOFT_FLOAT)
- arm_fpu_arch = FPUTYPE_NONE;
+ arm_fpu_attr = FPU_NONE;
if (TARGET_AAPCS_BASED)
{
/* For arm2/3 there is no need to do any scheduling if there is only
a floating point emulator, or we are doing software floating-point. */
if ((TARGET_SOFT_FLOAT
- || arm_fpu_tune == FPUTYPE_FPA_EMU2
- || arm_fpu_tune == FPUTYPE_FPA_EMU3)
+ || (TARGET_FPA && arm_fpu_desc->rev))
&& (tune_flags & FL_MODE32) == 0)
flag_schedule_insns = flag_schedule_insns_after_reload = 0;
/* Use the cp15 method if it is available. */
if (target_thread_pointer == TP_AUTO)
{
- if (arm_arch6k && !TARGET_THUMB)
+ if (arm_arch6k && !TARGET_THUMB1)
target_thread_pointer = TP_CP15;
else
target_thread_pointer = TP_SOFT;
fix_cm3_ldrd = 0;
}
- /* ??? We might want scheduling for thumb2. */
- if (TARGET_THUMB && flag_schedule_insns)
+ if (TARGET_THUMB1 && flag_schedule_insns)
{
/* Don't warn since it's on by default in -O2. */
flag_schedule_insns = 0;
if (optimize_size)
{
- arm_constant_limit = 1;
-
/* If optimizing for size, bump the number of instructions that we
are prepared to conditionally execute (even on a StrongARM). */
max_insns_skipped = 6;
}
else
{
- /* For processors with load scheduling, it never costs more than
- 2 cycles to load a constant, and the load scheduler may well
- reduce that to 1. */
- if (arm_ld_sched)
- arm_constant_limit = 1;
-
- /* On XScale the longer latency of a load makes it more difficult
- to achieve a good schedule, so it's faster to synthesize
- constants that can be done in two insns. */
- if (arm_tune_xscale)
- arm_constant_limit = 2;
-
/* StrongARM has early execution of branches, so a sequence
that is worth skipping is shorter. */
if (arm_tune_strongarm)
max_insns_skipped = 3;
}
+ /* Hot/Cold partitioning is not currently supported, since we can't
+ handle literal pool placement in that case. */
+ if (flag_reorder_blocks_and_partition)
+ {
+ inform (input_location,
+ "-freorder-blocks-and-partition not supported on this architecture");
+ flag_reorder_blocks_and_partition = 0;
+ flag_reorder_blocks = 1;
+ }
+
/* Register global variables with the garbage collector. */
arm_add_gc_roots ();
}
}
\f
+/* Output assembler code for a block containing the constant parts
+ of a trampoline, leaving space for the variable parts.
+
+   On the ARM (assuming r8 is the static chain regnum, and remembering
+   that referencing pc adds an offset of 8), the trampoline looks like:
+ ldr r8, [pc, #0]
+ ldr pc, [pc]
+ .word static chain value
+ .word function's address
+ XXX FIXME: When the trampoline returns, r8 will be clobbered. */
+
+static void
+arm_asm_trampoline_template (FILE *f)
+{
+ if (TARGET_ARM)
+ {
+ asm_fprintf (f, "\tldr\t%r, [%r, #0]\n", STATIC_CHAIN_REGNUM, PC_REGNUM);
+ asm_fprintf (f, "\tldr\t%r, [%r, #0]\n", PC_REGNUM, PC_REGNUM);
+ }
+ else if (TARGET_THUMB2)
+ {
+      /* The Thumb-2 trampoline is similar to the ARM implementation.
+	 Unlike 16-bit Thumb, we enter the stub in Thumb mode.  */
+ asm_fprintf (f, "\tldr.w\t%r, [%r, #4]\n",
+ STATIC_CHAIN_REGNUM, PC_REGNUM);
+ asm_fprintf (f, "\tldr.w\t%r, [%r, #4]\n", PC_REGNUM, PC_REGNUM);
+ }
+ else
+ {
+ ASM_OUTPUT_ALIGN (f, 2);
+ fprintf (f, "\t.code\t16\n");
+ fprintf (f, ".Ltrampoline_start:\n");
+ asm_fprintf (f, "\tpush\t{r0, r1}\n");
+ asm_fprintf (f, "\tldr\tr0, [%r, #8]\n", PC_REGNUM);
+ asm_fprintf (f, "\tmov\t%r, r0\n", STATIC_CHAIN_REGNUM);
+ asm_fprintf (f, "\tldr\tr0, [%r, #8]\n", PC_REGNUM);
+ asm_fprintf (f, "\tstr\tr0, [%r, #4]\n", SP_REGNUM);
+ asm_fprintf (f, "\tpop\t{r0, %r}\n", PC_REGNUM);
+ }
+ assemble_aligned_integer (UNITS_PER_WORD, const0_rtx);
+ assemble_aligned_integer (UNITS_PER_WORD, const0_rtx);
+}
+
+/* Emit RTL insns to initialize the variable parts of a trampoline. */
+
+static void
+arm_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
+{
+ rtx fnaddr, mem, a_tramp;
+
+ emit_block_move (m_tramp, assemble_trampoline_template (),
+ GEN_INT (TRAMPOLINE_SIZE), BLOCK_OP_NORMAL);
+
+ mem = adjust_address (m_tramp, SImode, TARGET_32BIT ? 8 : 12);
+ emit_move_insn (mem, chain_value);
+
+ mem = adjust_address (m_tramp, SImode, TARGET_32BIT ? 12 : 16);
+ fnaddr = XEXP (DECL_RTL (fndecl), 0);
+ emit_move_insn (mem, fnaddr);
+
+ a_tramp = XEXP (m_tramp, 0);
+ emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
+ LCT_NORMAL, VOIDmode, 2, a_tramp, Pmode,
+ plus_constant (a_tramp, TRAMPOLINE_SIZE), Pmode);
+}
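+
+/* For illustration only (a sketch of the result, with r8 as the static
+   chain register as in the template comment above): after
+   arm_trampoline_init, the ARM-mode trampoline block contains
+
+	ldr	r8, [pc, #0]	@ pc reads as dot + 8, i.e. offset 8
+	ldr	pc, [pc]	@ likewise, offset 12
+	.word	<chain_value>	@ written at offset 8
+	.word	<fnaddr>	@ written at offset 12
+
+   and __clear_cache is invoked over the whole block so that the newly
+   written words are visible to instruction fetch.  */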
+
+/* Thumb trampolines should be entered in thumb mode, so set
+ the bottom bit of the address. */
+
+static rtx
+arm_trampoline_adjust_address (rtx addr)
+{
+ if (TARGET_THUMB)
+ addr = expand_simple_binop (Pmode, IOR, addr, const1_rtx,
+ NULL, 0, OPTAB_LIB_WIDEN);
+ return addr;
+}
+\f
/* Return 1 if it is possible to return using a single instruction.
If SIBLING is non-null, this is a test for a return before a sibling
call. SIBLING is the call insn, so we can examine its register usage. */
&& !cond
&& (arm_gen_constant (code, mode, NULL_RTX, val, target, source,
1, 0)
- > arm_constant_limit + (code != SET)))
+ > (arm_constant_limit (optimize_function_for_size_p (cfun))
+ + (code != SET))))
{
if (code == SET)
{
1);
}
-/* Return the number of ARM instructions required to synthesize the given
- constant. */
+/* Return the number of instructions required to synthesize the given
+ constant, if we start emitting them from bit-position I. */
static int
count_insns_for_constant (HOST_WIDE_INT remainder, int i)
{
HOST_WIDE_INT temp1;
+ int step_size = TARGET_ARM ? 2 : 1;
int num_insns = 0;
+
+ gcc_assert (TARGET_ARM || i == 0);
+
do
{
int end;
if (i <= 0)
i += 32;
- if (remainder & (3 << (i - 2)))
+ if (remainder & (((1 << step_size) - 1) << (i - step_size)))
{
end = i - 8;
if (end < 0)
| ((i < end) ? (0xff >> (32 - end)) : 0));
remainder &= ~temp1;
num_insns++;
- i -= 6;
+ i -= 8 - step_size;
}
- i -= 2;
+ i -= step_size;
} while (remainder);
return num_insns;
}
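+
+/* A worked example (sketch): on ARM, step_size is 2 because valid
+   immediates are 8-bit values rotated right by an even amount, so each
+   chunk starts at an even bit position.  Synthesizing 0x00ff00ff thus
+   takes two insns, one per 8-bit chunk:
+
+	mov	rD, #0x000000ff
+	orr	rD, rD, #0x00ff0000
+
+   On Thumb-2 (step_size 1), a chunk may start at any bit position.  */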
+static int
+find_best_start (unsigned HOST_WIDE_INT remainder)
+{
+ int best_consecutive_zeros = 0;
+ int i;
+ int best_start = 0;
+
+  /* If we aren't targeting ARM, the best place to start is always at
+     the bottom.  */
+ if (! TARGET_ARM)
+ return 0;
+
+ for (i = 0; i < 32; i += 2)
+ {
+ int consecutive_zeros = 0;
+
+ if (!(remainder & (3 << i)))
+ {
+ while ((i < 32) && !(remainder & (3 << i)))
+ {
+ consecutive_zeros += 2;
+ i += 2;
+ }
+ if (consecutive_zeros > best_consecutive_zeros)
+ {
+ best_consecutive_zeros = consecutive_zeros;
+ best_start = i - consecutive_zeros;
+ }
+ i -= 2;
+ }
+ }
+
+ /* So long as it won't require any more insns to do so, it's
+ desirable to emit a small constant (in bits 0...9) in the last
+ insn. This way there is more chance that it can be combined with
+ a later addressing insn to form a pre-indexed load or store
+ operation. Consider:
+
+ *((volatile int *)0xe0000100) = 1;
+ *((volatile int *)0xe0000110) = 2;
+
+ We want this to wind up as:
+
+ mov rA, #0xe0000000
+ mov rB, #1
+ str rB, [rA, #0x100]
+ mov rB, #2
+ str rB, [rA, #0x110]
+
+ rather than having to synthesize both large constants from scratch.
+
+ Therefore, we calculate how many insns would be required to emit
+ the constant starting from `best_start', and also starting from
+ zero (i.e. with bit 31 first to be output). If `best_start' doesn't
+ yield a shorter sequence, we may as well use zero. */
+ if (best_start != 0
+ && ((((unsigned HOST_WIDE_INT) 1) << best_start) < remainder)
+ && (count_insns_for_constant (remainder, 0) <=
+ count_insns_for_constant (remainder, best_start)))
+ best_start = 0;
+
+ return best_start;
+}
+
/* Emit an instruction with the indicated PATTERN. If COND is
non-NULL, conditionalize the execution of the instruction on COND
being true. */
{
int can_invert = 0;
int can_negate = 0;
+ int final_invert = 0;
int can_negate_initial = 0;
- int can_shift = 0;
int i;
int num_bits_set = 0;
int set_sign_bit_copies = 0;
int insns = 0;
unsigned HOST_WIDE_INT temp1, temp2;
unsigned HOST_WIDE_INT remainder = val & 0xffffffff;
+ int step_size = TARGET_ARM ? 2 : 1;
/* Find out which operations are safe for a given CODE. Also do a quick
check for degenerate cases; these can occur when DImode operations
{
case SET:
can_invert = 1;
- can_shift = 1;
can_negate = 1;
break;
return 1;
}
- /* We don't know how to handle other cases yet. */
- gcc_assert (remainder == 0xffffffff);
-
- if (generate)
- emit_constant_insn (cond,
- gen_rtx_SET (VOIDmode, target,
- gen_rtx_NOT (mode, source)));
- return 1;
+ if (remainder == 0xffffffff)
+ {
+ if (generate)
+ emit_constant_insn (cond,
+ gen_rtx_SET (VOIDmode, target,
+ gen_rtx_NOT (mode, source)));
+ return 1;
+ }
+ break;
case MINUS:
/* We treat MINUS as (val - source), since (source - val) is always
if ((code == AND)
|| (code != IOR && can_invert && num_bits_set > 16))
- remainder = (~remainder) & 0xffffffff;
+ remainder ^= 0xffffffff;
else if (code == PLUS && num_bits_set > 16)
remainder = (-remainder) & 0xffffffff;
+
+ /* For XOR, if more than half the bits are set and there's a sequence
+ of more than 8 consecutive ones in the pattern then we can XOR by the
+ inverted constant and then invert the final result; this may save an
+ instruction and might also lead to the final mvn being merged with
+ some other operation. */
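+  /* Example (sketch): val = 0xfffffff5 for XOR would need four eor
+     insns when synthesized eight bits at a time, but XORing with the
+     inverse 0x0000000a needs only
+
+	eor	rD, rS, #10
+	mvn	rD, rD
+
+     i.e. one insn plus the final mvn.  */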
+ else if (code == XOR && num_bits_set > 16
+ && (count_insns_for_constant (remainder ^ 0xffffffff,
+ find_best_start
+ (remainder ^ 0xffffffff))
+ < count_insns_for_constant (remainder,
+ find_best_start (remainder))))
+ {
+ remainder ^= 0xffffffff;
+ final_invert = 1;
+ }
else
{
can_invert = 0;
/* ??? Use thumb2 replicated constants when the high and low halfwords are
the same. */
{
- int best_start = 0;
- if (!TARGET_THUMB2)
- {
- int best_consecutive_zeros = 0;
-
- for (i = 0; i < 32; i += 2)
- {
- int consecutive_zeros = 0;
-
- if (!(remainder & (3 << i)))
- {
- while ((i < 32) && !(remainder & (3 << i)))
- {
- consecutive_zeros += 2;
- i += 2;
- }
- if (consecutive_zeros > best_consecutive_zeros)
- {
- best_consecutive_zeros = consecutive_zeros;
- best_start = i - consecutive_zeros;
- }
- i -= 2;
- }
- }
-
- /* So long as it won't require any more insns to do so, it's
- desirable to emit a small constant (in bits 0...9) in the last
- insn. This way there is more chance that it can be combined with
- a later addressing insn to form a pre-indexed load or store
- operation. Consider:
-
- *((volatile int *)0xe0000100) = 1;
- *((volatile int *)0xe0000110) = 2;
-
- We want this to wind up as:
-
- mov rA, #0xe0000000
- mov rB, #1
- str rB, [rA, #0x100]
- mov rB, #2
- str rB, [rA, #0x110]
-
- rather than having to synthesize both large constants from scratch.
-
- Therefore, we calculate how many insns would be required to emit
- the constant starting from `best_start', and also starting from
- zero (i.e. with bit 31 first to be output). If `best_start' doesn't
- yield a shorter sequence, we may as well use zero. */
- if (best_start != 0
- && ((((unsigned HOST_WIDE_INT) 1) << best_start) < remainder)
- && (count_insns_for_constant (remainder, 0) <=
- count_insns_for_constant (remainder, best_start)))
- best_start = 0;
- }
-
/* Now start emitting the insns. */
- i = best_start;
+ i = find_best_start (remainder);
do
{
int end;
}
else
{
- if (remainder && subtargets)
+ if ((final_invert || remainder) && subtargets)
new_src = gen_reg_rtx (mode);
else
new_src = target;
code = PLUS;
insns++;
- if (TARGET_ARM)
- i -= 6;
- else
- i -= 7;
+ i -= 8 - step_size;
}
/* Arm allows rotates by a multiple of two. Thumb-2 allows arbitrary
shifts. */
- if (TARGET_ARM)
- i -= 2;
- else
- i--;
+ i -= step_size;
}
while (remainder);
}
+ if (final_invert)
+ {
+ if (generate)
+ emit_constant_insn (cond, gen_rtx_SET (VOIDmode, target,
+ gen_rtx_NOT (mode, source)));
+ insns++;
+ }
+
return insns;
}
}
static bool
-arm_libcall_uses_aapcs_base (rtx libcall)
+arm_libcall_uses_aapcs_base (const_rtx libcall)
{
static bool init_done = false;
static htab_t libcall_htab;
}
rtx
-arm_libcall_value (enum machine_mode mode, rtx libcall)
+arm_libcall_value (enum machine_mode mode, const_rtx libcall)
{
if (TARGET_AAPCS_BASED && arm_pcs_default != ARM_PCS_AAPCS
&& GET_MODE_CLASS (mode) == MODE_FLOAT)
{
{"aapcs", ARM_PCS_AAPCS},
{"aapcs-vfp", ARM_PCS_AAPCS_VFP},
+#if 0
+ /* We could recognize these, but changes would be needed elsewhere
+ * to implement them. */
{"aapcs-iwmmxt", ARM_PCS_AAPCS_IWMMXT},
{"atpcs", ARM_PCS_ATPCS},
{"apcs", ARM_PCS_APCS},
+#endif
{NULL, ARM_PCS_UNKNOWN}
};
return -1;
}
+/* Return true if PCS_VARIANT should use VFP registers. */
static bool
-aapcs_vfp_is_call_or_return_candidate (enum machine_mode mode, const_tree type,
- int *base_mode,
- int *count)
+use_vfp_abi (enum arm_pcs pcs_variant, bool is_double)
{
+ if (pcs_variant == ARM_PCS_AAPCS_VFP)
+ return true;
+
+ if (pcs_variant != ARM_PCS_AAPCS_LOCAL)
+ return false;
+
+  return (TARGET_32BIT && TARGET_VFP && TARGET_HARD_FLOAT
+	  && (TARGET_VFP_DOUBLE || !is_double));
+}
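+
+/* For instance (informal): under ARM_PCS_AAPCS_LOCAL on a hard-float
+   VFP target, a double argument is a candidate for the VFP registers
+   (e.g. d0); on a single-precision-only FPU (!TARGET_VFP_DOUBLE) the
+   same argument is rejected here and is passed under the base AAPCS
+   rules instead.  */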
+
+static bool
+aapcs_vfp_is_call_or_return_candidate (enum arm_pcs pcs_variant,
+ enum machine_mode mode, const_tree type,
+ enum machine_mode *base_mode, int *count)
+{
+ enum machine_mode new_mode = VOIDmode;
+
if (GET_MODE_CLASS (mode) == MODE_FLOAT
|| GET_MODE_CLASS (mode) == MODE_VECTOR_INT
|| GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
{
*count = 1;
- *base_mode = mode;
- return true;
+ new_mode = mode;
}
else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
{
*count = 2;
- *base_mode = (mode == DCmode ? DFmode : SFmode);
- return true;
+ new_mode = (mode == DCmode ? DFmode : SFmode);
}
else if (type && (mode == BLKmode || TREE_CODE (type) == VECTOR_TYPE))
{
- enum machine_mode aggregate_mode = VOIDmode;
- int ag_count = aapcs_vfp_sub_candidate (type, &aggregate_mode);
+ int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
if (ag_count > 0 && ag_count <= 4)
- {
- *count = ag_count;
- *base_mode = aggregate_mode;
- return true;
- }
+ *count = ag_count;
+ else
+ return false;
}
- return false;
+ else
+ return false;
+
+ if (!use_vfp_abi (pcs_variant, ARM_NUM_REGS (new_mode) > 1))
+ return false;
+
+ *base_mode = new_mode;
+ return true;
}
static bool
enum machine_mode mode, const_tree type)
{
int count ATTRIBUTE_UNUSED;
- int ag_mode ATTRIBUTE_UNUSED;
+ enum machine_mode ag_mode ATTRIBUTE_UNUSED;
- if (!(pcs_variant == ARM_PCS_AAPCS_VFP
- || (pcs_variant == ARM_PCS_AAPCS_LOCAL
- && TARGET_32BIT && TARGET_VFP && TARGET_HARD_FLOAT)))
+ if (!use_vfp_abi (pcs_variant, false))
return false;
- return aapcs_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count);
+ return aapcs_vfp_is_call_or_return_candidate (pcs_variant, mode, type,
+ &ag_mode, &count);
}
static bool
aapcs_vfp_is_call_candidate (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
const_tree type)
{
- if (!(pcum->pcs_variant == ARM_PCS_AAPCS_VFP
- || (pcum->pcs_variant == ARM_PCS_AAPCS_LOCAL
- && TARGET_32BIT && TARGET_VFP && TARGET_HARD_FLOAT)))
+ if (!use_vfp_abi (pcum->pcs_variant, false))
return false;
- return aapcs_vfp_is_call_or_return_candidate (mode, type,
+
+ return aapcs_vfp_is_call_or_return_candidate (pcum->pcs_variant, mode, type,
&pcum->aapcs_vfp_rmode,
&pcum->aapcs_vfp_rcount);
}
enum machine_mode mode,
const_tree type ATTRIBUTE_UNUSED)
{
- if (!(pcs_variant == ARM_PCS_AAPCS_VFP
- || (pcs_variant == ARM_PCS_AAPCS_LOCAL
- && TARGET_32BIT && TARGET_VFP && TARGET_HARD_FLOAT)))
+ if (!use_vfp_abi (pcs_variant, false))
return false;
+
if (mode == BLKmode || (mode == TImode && !TARGET_NEON))
{
int count;
- int ag_mode;
+ enum machine_mode ag_mode;
int i;
rtx par;
int shift;
- aapcs_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count);
+ aapcs_vfp_is_call_or_return_candidate (pcs_variant, mode, type,
+ &ag_mode, &count);
if (!TARGET_NEON)
{
{
if (arm_pcs_from_attribute (args) == ARM_PCS_UNKNOWN)
{
- warning (OPT_Wattributes, "%qs attribute ignored",
- IDENTIFIER_POINTER (name));
+ warning (OPT_Wattributes, "%qE attribute ignored", name);
*no_add_attrs = true;
}
return NULL_TREE;
return false;
/* Never tailcall something for which we have no decl, or if we
- are in Thumb mode. */
- if (decl == NULL || TARGET_THUMB)
+ are generating code for Thumb-1. */
+ if (decl == NULL || TARGET_THUMB1)
return false;
/* The PIC register is live on entry to VxWorks PLT entries, so we
{
rtx pic_ref, address;
rtx insn;
- int subregs = 0;
-
- /* If this function doesn't have a pic register, create one now. */
- require_pic_register ();
if (reg == 0)
{
gcc_assert (can_create_pseudo_p ());
reg = gen_reg_rtx (Pmode);
-
- subregs = 1;
+ address = gen_reg_rtx (Pmode);
}
-
- if (subregs)
- address = gen_reg_rtx (Pmode);
else
address = reg;
- if (TARGET_ARM)
- emit_insn (gen_pic_load_addr_arm (address, orig));
- else if (TARGET_THUMB2)
- emit_insn (gen_pic_load_addr_thumb2 (address, orig));
- else /* TARGET_THUMB1 */
- emit_insn (gen_pic_load_addr_thumb1 (address, orig));
-
/* VxWorks does not impose a fixed gap between segments; the run-time
gap can be different from the object-file gap. We therefore can't
use GOTOFF unless we are absolutely sure that the symbol is in the
SYMBOL_REF_LOCAL_P (orig)))
&& NEED_GOT_RELOC
&& !TARGET_VXWORKS_RTP)
- pic_ref = gen_rtx_PLUS (Pmode, cfun->machine->pic_reg, address);
+ insn = arm_pic_static_addr (orig, reg);
else
{
+ /* If this function doesn't have a pic register, create one now. */
+ require_pic_register ();
+
+ if (TARGET_32BIT)
+ emit_insn (gen_pic_load_addr_32bit (address, orig));
+ else /* TARGET_THUMB1 */
+ emit_insn (gen_pic_load_addr_thumb1 (address, orig));
+
pic_ref = gen_const_mem (Pmode,
gen_rtx_PLUS (Pmode, cfun->machine->pic_reg,
address));
+ insn = emit_move_insn (reg, pic_ref);
}
- insn = emit_move_insn (reg, pic_ref);
-
/* Put a REG_EQUAL note on this insn, so that it can be optimized
by loop. */
set_unique_reg_note (insn, REG_EQUAL, orig);
{
pic_rtx = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE);
pic_rtx = gen_rtx_CONST (Pmode, pic_rtx);
- emit_insn (gen_pic_load_addr_arm (pic_reg, pic_rtx));
+ emit_insn (gen_pic_load_addr_32bit (pic_reg, pic_rtx));
emit_insn (gen_rtx_SET (Pmode, pic_reg, gen_rtx_MEM (Pmode, pic_reg)));
UNSPEC_GOTSYM_OFF);
pic_rtx = gen_rtx_CONST (Pmode, pic_rtx);
- if (TARGET_ARM)
- {
- emit_insn (gen_pic_load_addr_arm (pic_reg, pic_rtx));
- emit_insn (gen_pic_add_dot_plus_eight (pic_reg, pic_reg, labelno));
- }
- else if (TARGET_THUMB2)
+ if (TARGET_32BIT)
{
- /* Thumb-2 only allows very limited access to the PC. Calculate the
- address in a temporary register. */
- if (arm_pic_register != INVALID_REGNUM)
- {
- pic_tmp = gen_rtx_REG (SImode,
- thumb_find_work_register (saved_regs));
- }
+ emit_insn (gen_pic_load_addr_32bit (pic_reg, pic_rtx));
+ if (TARGET_ARM)
+ emit_insn (gen_pic_add_dot_plus_eight (pic_reg, pic_reg, labelno));
else
- {
- gcc_assert (can_create_pseudo_p ());
- pic_tmp = gen_reg_rtx (Pmode);
- }
-
- emit_insn (gen_pic_load_addr_thumb2 (pic_reg, pic_rtx));
- emit_insn (gen_pic_load_dot_plus_four (pic_tmp, labelno));
- emit_insn (gen_addsi3 (pic_reg, pic_reg, pic_tmp));
+ emit_insn (gen_pic_add_dot_plus_four (pic_reg, pic_reg, labelno));
}
else /* TARGET_THUMB1 */
{
emit_use (pic_reg);
}
+/* Generate code to load the address of a static var when flag_pic is set. */
+static rtx
+arm_pic_static_addr (rtx orig, rtx reg)
+{
+ rtx l1, labelno, offset_rtx, insn;
+
+ gcc_assert (flag_pic);
+
+ /* We use an UNSPEC rather than a LABEL_REF because this label
+ never appears in the code stream. */
+ labelno = GEN_INT (pic_labelno++);
+ l1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, labelno), UNSPEC_PIC_LABEL);
+ l1 = gen_rtx_CONST (VOIDmode, l1);
+
+ /* On the ARM the PC register contains 'dot + 8' at the time of the
+ addition, on the Thumb it is 'dot + 4'. */
+ offset_rtx = plus_constant (l1, TARGET_ARM ? 8 : 4);
+ offset_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (2, orig, offset_rtx),
+ UNSPEC_SYMBOL_OFFSET);
+ offset_rtx = gen_rtx_CONST (Pmode, offset_rtx);
+
+ if (TARGET_32BIT)
+ {
+ emit_insn (gen_pic_load_addr_32bit (reg, offset_rtx));
+ if (TARGET_ARM)
+ insn = emit_insn (gen_pic_add_dot_plus_eight (reg, reg, labelno));
+ else
+ insn = emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno));
+ }
+ else /* TARGET_THUMB1 */
+ {
+ emit_insn (gen_pic_load_addr_thumb1 (reg, offset_rtx));
+ insn = emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno));
+ }
+
+ return insn;
+}
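+
+/* E.g. (a sketch of the ARM-state output): for a local symbol SYM the
+   sequence above produces
+
+	ldr	rD, .LCn	@ .LCn: .word SYM - (.LPICm + 8)
+   .LPICm:
+	add	rD, pc, rD
+
+   forming the address pc-relatively, with no GOT entry required.  */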
/* Return nonzero if X is valid as an ARM state addressing register. */
static int
if (TARGET_ARM)
emit_insn (gen_pic_add_dot_plus_eight (reg, reg, labelno));
else if (TARGET_THUMB2)
- {
- rtx tmp;
- /* Thumb-2 only allows very limited access to the PC. Calculate
- the address in a temporary register. */
- tmp = gen_reg_rtx (SImode);
- emit_insn (gen_pic_load_dot_plus_four (tmp, labelno));
- emit_insn (gen_addsi3(reg, reg, tmp));
- }
+ emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno));
else /* TARGET_THUMB1 */
emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno));
if (TARGET_ARM)
emit_insn (gen_tls_load_dot_plus_eight (reg, reg, labelno));
else if (TARGET_THUMB2)
- {
- rtx tmp;
- /* Thumb-2 only allows very limited access to the PC. Calculate
- the address in a temporary register. */
- tmp = gen_reg_rtx (SImode);
- emit_insn (gen_pic_load_dot_plus_four (tmp, labelno));
- emit_insn (gen_addsi3(reg, reg, tmp));
- emit_move_insn (reg, gen_const_mem (SImode, reg));
- }
+ emit_insn (gen_tls_load_dot_plus_four (reg, NULL, reg, labelno));
else
{
emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno));
else if ((outer == PLUS || outer == COMPARE)
&& INTVAL (x) < 256 && INTVAL (x) > -256)
return 0;
- else if (outer == AND
+ else if ((outer == IOR || outer == XOR || outer == AND)
&& INTVAL (x) < 256 && INTVAL (x) >= -256)
return COSTS_N_INSNS (1);
+ else if (outer == AND)
+ {
+ int i;
+ /* This duplicates the tests in the andsi3 expander. */
+ for (i = 9; i <= 31; i++)
+ if ((((HOST_WIDE_INT) 1) << i) - 1 == INTVAL (x)
+ || (((HOST_WIDE_INT) 1) << i) - 1 == ~INTVAL (x))
+ return COSTS_N_INSNS (2);
+ }
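+      /* E.g. (a sketch): the masks 0x1ff ((1 << 9) - 1) and ~0x1ff are
+	 expanded by andsi3 as a pair of shift insns, hence the
+	 COSTS_N_INSNS (2) above.  */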
else if (outer == ASHIFT || outer == ASHIFTRT
|| outer == LSHIFTRT)
return 0;
enum rtx_code subcode;
rtx operand;
enum rtx_code code = GET_CODE (x);
- int extra_cost;
*total = 0;
switch (code)
case UMOD:
if (TARGET_HARD_FLOAT && mode == SFmode)
*total = COSTS_N_INSNS (2);
- else if (TARGET_HARD_FLOAT && mode == DFmode)
+ else if (TARGET_HARD_FLOAT && mode == DFmode && !TARGET_VFP_SINGLE)
*total = COSTS_N_INSNS (4);
else
*total = COSTS_N_INSNS (20);
if (GET_MODE_CLASS (mode) == MODE_FLOAT)
{
- if (TARGET_HARD_FLOAT && (mode == SFmode || mode == DFmode))
+ if (TARGET_HARD_FLOAT
+ && (mode == SFmode
+ || (mode == DFmode && !TARGET_VFP_SINGLE)))
{
*total = COSTS_N_INSNS (1);
if (GET_CODE (XEXP (x, 0)) == CONST_DOUBLE
if (GET_MODE_CLASS (mode) == MODE_FLOAT)
{
- if (TARGET_HARD_FLOAT && (mode == SFmode || mode == DFmode))
+ if (TARGET_HARD_FLOAT
+ && (mode == SFmode
+ || (mode == DFmode && !TARGET_VFP_SINGLE)))
{
*total = COSTS_N_INSNS (1);
if (GET_CODE (XEXP (x, 1)) == CONST_DOUBLE
/* Fall through */
case AND: case XOR: case IOR:
- extra_cost = 0;
/* Normally the frame registers will be spilt into reg+const during
reload, so it is a bad idea to combine them with other instructions,
case NEG:
if (GET_MODE_CLASS (mode) == MODE_FLOAT)
{
- if (TARGET_HARD_FLOAT && (mode == SFmode || mode == DFmode))
+ if (TARGET_HARD_FLOAT
+ && (mode == SFmode
+ || (mode == DFmode && !TARGET_VFP_SINGLE)))
{
*total = COSTS_N_INSNS (1);
return false;
case ABS:
if (GET_MODE_CLASS (mode) == MODE_FLOAT)
{
- if (TARGET_HARD_FLOAT && (mode == SFmode || mode == DFmode))
+ if (TARGET_HARD_FLOAT
+ && (mode == SFmode
+ || (mode == DFmode && !TARGET_VFP_SINGLE)))
{
*total = COSTS_N_INSNS (1);
return false;
return true;
case CONST_DOUBLE:
- if (TARGET_HARD_FLOAT && vfp3_const_double_rtx (x))
+ if (TARGET_HARD_FLOAT && vfp3_const_double_rtx (x)
+ && (mode == SFmode || !TARGET_VFP_SINGLE))
*total = COSTS_N_INSNS (1);
else
*total = COSTS_N_INSNS (4);
}
}
+/* Estimate the size cost of Thumb-1 instructions.
+   For now most of the code is copied from thumb1_rtx_costs; finer-grained
+   tuning will be needed once we have more relevant test cases.  */
+static inline int
+thumb1_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
+{
+ enum machine_mode mode = GET_MODE (x);
+
+ switch (code)
+ {
+ case ASHIFT:
+ case ASHIFTRT:
+ case LSHIFTRT:
+ case ROTATERT:
+ case PLUS:
+ case MINUS:
+ case COMPARE:
+ case NEG:
+ case NOT:
+ return COSTS_N_INSNS (1);
+
+ case MULT:
+ if (GET_CODE (XEXP (x, 1)) == CONST_INT)
+ {
+	  /* The Thumb-1 mul instruction cannot operate on a constant;
+	     it must first be loaded into a register.  */
+ int const_size = thumb1_size_rtx_costs (XEXP (x, 1), CONST_INT, SET);
+ return COSTS_N_INSNS (1) + const_size;
+ }
+ return COSTS_N_INSNS (1);
+
+ case SET:
+ return (COSTS_N_INSNS (1)
+ + 4 * ((GET_CODE (SET_SRC (x)) == MEM)
+		     + (GET_CODE (SET_DEST (x)) == MEM)));
+
+ case CONST_INT:
+ if (outer == SET)
+ {
+ if ((unsigned HOST_WIDE_INT) INTVAL (x) < 256)
+ return 0;
+ if (thumb_shiftable_const (INTVAL (x)))
+ return COSTS_N_INSNS (2);
+ return COSTS_N_INSNS (3);
+ }
+ else if ((outer == PLUS || outer == COMPARE)
+ && INTVAL (x) < 256 && INTVAL (x) > -256)
+ return 0;
+ else if ((outer == IOR || outer == XOR || outer == AND)
+ && INTVAL (x) < 256 && INTVAL (x) >= -256)
+ return COSTS_N_INSNS (1);
+ else if (outer == AND)
+ {
+ int i;
+ /* This duplicates the tests in the andsi3 expander. */
+ for (i = 9; i <= 31; i++)
+ if ((((HOST_WIDE_INT) 1) << i) - 1 == INTVAL (x)
+ || (((HOST_WIDE_INT) 1) << i) - 1 == ~INTVAL (x))
+ return COSTS_N_INSNS (2);
+ }
+ else if (outer == ASHIFT || outer == ASHIFTRT
+ || outer == LSHIFTRT)
+ return 0;
+ return COSTS_N_INSNS (2);
+
+ case CONST:
+ case CONST_DOUBLE:
+ case LABEL_REF:
+ case SYMBOL_REF:
+ return COSTS_N_INSNS (3);
+
+ case UDIV:
+ case UMOD:
+ case DIV:
+ case MOD:
+ return 100;
+
+ case TRUNCATE:
+ return 99;
+
+ case AND:
+ case XOR:
+ case IOR:
+ /* XXX guess. */
+ return 8;
+
+ case MEM:
+ /* XXX another guess. */
+ /* Memory costs quite a lot for the first word, but subsequent words
+ load at the equivalent of a single insn each. */
+ return (10 + 4 * ((GET_MODE_SIZE (mode) - 1) / UNITS_PER_WORD)
+ + ((GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
+ ? 4 : 0));
+
+ case IF_THEN_ELSE:
+ /* XXX a guess. */
+ if (GET_CODE (XEXP (x, 1)) == PC || GET_CODE (XEXP (x, 2)) == PC)
+ return 14;
+ return 2;
+
+ case ZERO_EXTEND:
+ /* XXX still guessing. */
+ switch (GET_MODE (XEXP (x, 0)))
+ {
+ case QImode:
+ return (1 + (mode == DImode ? 4 : 0)
+ + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0));
+
+ case HImode:
+ return (4 + (mode == DImode ? 4 : 0)
+ + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0));
+
+ case SImode:
+ return (1 + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0));
+
+ default:
+ return 99;
+ }
+
+ default:
+ return 99;
+ }
+}
+
/* RTX costs when optimizing for size. */
static bool
arm_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
enum machine_mode mode = GET_MODE (x);
if (TARGET_THUMB1)
{
- /* XXX TBD. For now, use the standard costs. */
- *total = thumb1_rtx_costs (x, code, outer_code);
+ *total = thumb1_size_rtx_costs (x, code, outer_code);
return true;
}
return false;
case MINUS:
- if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT)
+ if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
+ && (mode == SFmode || !TARGET_VFP_SINGLE))
{
*total = COSTS_N_INSNS (1);
return false;
return false;
case PLUS:
- if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT)
+ if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
+ && (mode == SFmode || !TARGET_VFP_SINGLE))
{
*total = COSTS_N_INSNS (1);
return false;
return false;
case NEG:
- if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT)
+ if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
+ && (mode == SFmode || !TARGET_VFP_SINGLE))
{
*total = COSTS_N_INSNS (1);
return false;
return false;
case ABS:
- if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT)
+ if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT
+ && (mode == SFmode || !TARGET_VFP_SINGLE))
*total = COSTS_N_INSNS (1);
else
*total = COSTS_N_INSNS (1 + ARM_NUM_REGS (mode));
return arm_size_rtx_costs (x, (enum rtx_code) code,
(enum rtx_code) outer_code, total);
else
- return all_cores[(int)arm_tune].rtx_costs (x, (enum rtx_code) code,
- (enum rtx_code) outer_code,
- total, speed);
+ return current_tune->rtx_costs (x, (enum rtx_code) code,
+ (enum rtx_code) outer_code,
+ total, speed);
}
/* RTX costs for cores with a slow MUL implementation. Thumb-2 is not
if (GET_MODE_CLASS (mode) == MODE_FLOAT)
{
- if (TARGET_HARD_FLOAT && (mode == SFmode || mode == DFmode))
+ if (TARGET_HARD_FLOAT
+ && (mode == SFmode
+ || (mode == DFmode && !TARGET_VFP_SINGLE)))
{
*total = COSTS_N_INSNS (1);
return false;
so it can be ignored. */
static bool
-arm_xscale_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, int *total, bool speed)
+arm_xscale_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
+ int *total, bool speed)
{
enum machine_mode mode = GET_MODE (x);
if (GET_MODE_CLASS (mode) == MODE_FLOAT)
{
- if (TARGET_HARD_FLOAT && (mode == SFmode || mode == DFmode))
+ if (TARGET_HARD_FLOAT
+ && (mode == SFmode
+ || (mode == DFmode && !TARGET_VFP_SINGLE)))
{
*total = COSTS_N_INSNS (1);
return false;
init_fp_table ();
REAL_VALUE_FROM_CONST_DOUBLE (r, x);
- r = REAL_VALUE_NEGATE (r);
+ r = real_value_negate (&r);
if (REAL_VALUE_MINUS_ZERO (r))
return 0;
/* Extract sign, exponent and mantissa. */
sign = REAL_VALUE_NEGATIVE (r) ? 1 : 0;
- r = REAL_VALUE_ABS (r);
+ r = real_value_abs (&r);
exponent = REAL_EXP (&r);
/* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
highest (sign) bit, with a fixed binary point at bit point_pos.
}
}
-/* Initialize a vector with non-constant elements. FIXME: We can do better
- than the current implementation (building a vector on the stack and then
- loading it) in many cases. See rs6000.c. */
+/* If VALS is a vector constant that can be loaded into a register
+ using VDUP, generate instructions to do so and return an RTX to
+ assign to the register. Otherwise return NULL_RTX. */
+
+static rtx
+neon_vdup_constant (rtx vals)
+{
+ enum machine_mode mode = GET_MODE (vals);
+ enum machine_mode inner_mode = GET_MODE_INNER (mode);
+ int n_elts = GET_MODE_NUNITS (mode);
+ bool all_same = true;
+ rtx x;
+ int i;
+
+ if (GET_CODE (vals) != CONST_VECTOR || GET_MODE_SIZE (inner_mode) > 4)
+ return NULL_RTX;
+
+ for (i = 0; i < n_elts; ++i)
+ {
+ x = XVECEXP (vals, 0, i);
+ if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
+ all_same = false;
+ }
+
+ if (!all_same)
+ /* The elements are not all the same. We could handle repeating
+ patterns of a mode larger than INNER_MODE here (e.g. int8x8_t
+ {0, C, 0, C, 0, C, 0, C} which can be loaded using
+ vdup.i16). */
+ return NULL_RTX;
+
+ /* We can load this constant by using VDUP and a constant in a
+ single ARM register. This will be cheaper than a vector
+ load. */
+
+ x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
+ return gen_rtx_UNSPEC (mode, gen_rtvec (1, x),
+ UNSPEC_VDUP_N);
+}
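+
+/* For example (a sketch): a V4SImode vector with all four elements
+   equal to 0x12345678 has no valid VMOV immediate encoding, but can
+   still avoid a literal-pool vector load:
+
+	movw	rT, #0x5678	@ or an ldr of the scalar from the pool
+	movt	rT, #0x1234
+	vdup.32	qD, rT
+
+   i.e. build the scalar once and duplicate it across the lanes.  */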
+
+/* Generate code to load VALS, which is a PARALLEL containing only
+ constants (for vec_init) or CONST_VECTOR, efficiently into a
+ register. Returns an RTX to copy into the register, or NULL_RTX
+   for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
+
+rtx
+neon_make_constant (rtx vals)
+{
+ enum machine_mode mode = GET_MODE (vals);
+ rtx target;
+ rtx const_vec = NULL_RTX;
+ int n_elts = GET_MODE_NUNITS (mode);
+ int n_const = 0;
+ int i;
+
+ if (GET_CODE (vals) == CONST_VECTOR)
+ const_vec = vals;
+ else if (GET_CODE (vals) == PARALLEL)
+ {
+ /* A CONST_VECTOR must contain only CONST_INTs and
+ CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
+ Only store valid constants in a CONST_VECTOR. */
+ for (i = 0; i < n_elts; ++i)
+ {
+ rtx x = XVECEXP (vals, 0, i);
+ if (GET_CODE (x) == CONST_INT || GET_CODE (x) == CONST_DOUBLE)
+ n_const++;
+ }
+ if (n_const == n_elts)
+ const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
+ }
+ else
+ gcc_unreachable ();
+
+ if (const_vec != NULL
+ && neon_immediate_valid_for_move (const_vec, mode, NULL, NULL))
+ /* Load using VMOV. On Cortex-A8 this takes one cycle. */
+ return const_vec;
+ else if ((target = neon_vdup_constant (vals)) != NULL_RTX)
+ /* Loaded using VDUP. On Cortex-A8 the VDUP takes one NEON
+ pipeline cycle; creating the constant takes one or two ARM
+ pipeline cycles. */
+ return target;
+ else if (const_vec != NULL_RTX)
+ /* Load from constant pool. On Cortex-A8 this takes two cycles
+ (for either double or quad vectors). We can not take advantage
+ of single-cycle VLD1 because we need a PC-relative addressing
+ mode. */
+ return const_vec;
+ else
+ /* A PARALLEL containing something not valid inside CONST_VECTOR.
+       We cannot construct an initializer.  */
+ return NULL_RTX;
+}
+
+/* Initialize vector TARGET to VALS. */
void
neon_expand_vector_init (rtx target, rtx vals)
{
enum machine_mode mode = GET_MODE (target);
- enum machine_mode inner = GET_MODE_INNER (mode);
- unsigned int i, n_elts = GET_MODE_NUNITS (mode);
- rtx mem;
+ enum machine_mode inner_mode = GET_MODE_INNER (mode);
+ int n_elts = GET_MODE_NUNITS (mode);
+ int n_var = 0, one_var = -1;
+ bool all_same = true;
+ rtx x, mem;
+ int i;
+
+ for (i = 0; i < n_elts; ++i)
+ {
+ x = XVECEXP (vals, 0, i);
+ if (!CONSTANT_P (x))
+ ++n_var, one_var = i;
+
+ if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
+ all_same = false;
+ }
+
+ if (n_var == 0)
+ {
+ rtx constant = neon_make_constant (vals);
+ if (constant != NULL_RTX)
+ {
+ emit_move_insn (target, constant);
+ return;
+ }
+ }
- gcc_assert (VECTOR_MODE_P (mode));
+ /* Splat a single non-constant element if we can. */
+ if (all_same && GET_MODE_SIZE (inner_mode) <= 4)
+ {
+ x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
+ emit_insn (gen_rtx_SET (VOIDmode, target,
+ gen_rtx_UNSPEC (mode, gen_rtvec (1, x),
+ UNSPEC_VDUP_N)));
+ return;
+ }
+ /* One field is non-constant. Load constant then overwrite varying
+ field. This is more efficient than using the stack. */
+ if (n_var == 1)
+ {
+ rtx copy = copy_rtx (vals);
+ rtvec ops;
+
+ /* Load constant part of vector, substitute neighboring value for
+ varying element. */
+ XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, (one_var + 1) % n_elts);
+ neon_expand_vector_init (target, copy);
+
+ /* Insert variable. */
+ x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var));
+ ops = gen_rtvec (3, x, target, GEN_INT (one_var));
+ emit_insn (gen_rtx_SET (VOIDmode, target,
+ gen_rtx_UNSPEC (mode, ops, UNSPEC_VSET_LANE)));
+ return;
+ }
+
+ /* Construct the vector in memory one field at a time
+ and load the whole vector. */
mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), 0);
for (i = 0; i < n_elts; i++)
- emit_move_insn (adjust_address_nv (mem, inner, i * GET_MODE_SIZE (inner)),
- XVECEXP (vals, 0, i));
-
+ emit_move_insn (adjust_address_nv (mem, inner_mode,
+ i * GET_MODE_SIZE (inner_mode)),
+ XVECEXP (vals, 0, i));
emit_move_insn (target, mem);
}
{
if (mode == HFmode)
{
+ if (!TARGET_NEON_FP16)
+ return GENERAL_REGS;
if (s_register_operand (x, mode) || neon_vector_mem_operand (x, 2))
return NO_REGS;
return GENERAL_REGS;
}
}
-/* Must not copy a SET whose source operand is PC-relative. */
+/* Must not copy any rtx that uses a PC-relative address.  */
+
+static int
+arm_note_pic_base (rtx *x, void *data ATTRIBUTE_UNUSED)
+{
+ if (GET_CODE (*x) == UNSPEC
+ && XINT (*x, 1) == UNSPEC_PIC_BASE)
+ return 1;
+ return 0;
+}
static bool
arm_cannot_copy_insn_p (rtx insn)
{
- rtx pat = PATTERN (insn);
-
- if (GET_CODE (pat) == SET)
- {
- rtx rhs = SET_SRC (pat);
-
- if (GET_CODE (rhs) == UNSPEC
- && XINT (rhs, 1) == UNSPEC_PIC_BASE)
- return TRUE;
-
- if (GET_CODE (rhs) == MEM
- && GET_CODE (XEXP (rhs, 0)) == UNSPEC
- && XINT (XEXP (rhs, 0), 1) == UNSPEC_PIC_BASE)
- return TRUE;
- }
-
- return FALSE;
+ return for_each_rtx (&PATTERN (insn), arm_note_pic_base, NULL);
}
enum rtx_code
if (arm_eliminable_register (reg0))
return 0;
- val_diff = val1 - val0;
+ val_diff = val1 - val0;
+
+ if (arm_ld_sched)
+ {
+ /* If the target has load delay slots, then there's no benefit
+ to using an ldm instruction unless the offset is zero and
+ we are optimizing for size. */
+ return (optimize_size && (REGNO (reg0) == REGNO (reg1))
+ && (val0 == 0 || val1 == 0 || val0 == 4 || val1 == 4)
+ && (val_diff == 4 || val_diff == -4));
+ }
+
+ return ((REGNO (reg0) == REGNO (reg1))
+ && (val_diff == 4 || val_diff == -4));
+ }
+
+ return 0;
+}
+
+/* Return true iff it would be profitable to turn a sequence of NOPS loads
+ or stores (depending on IS_STORE) into a load-multiple or store-multiple
+ instruction. ADD_OFFSET is nonzero if the base address register needs
+ to be modified with an add instruction before we can use it. */
+
+static bool
+multiple_operation_profitable_p (bool is_store ATTRIBUTE_UNUSED,
+ int nops, HOST_WIDE_INT add_offset)
+{
+ /* For ARM8,9 & StrongARM, 2 ldr instructions are faster than an ldm
+ if the offset isn't small enough. The reason 2 ldrs are faster
+ is because these ARMs are able to do more than one cache access
+ in a single cycle. The ARM9 and StrongARM have Harvard caches,
+ whilst the ARM8 has a double bandwidth cache. This means that
+ these cores can do both an instruction fetch and a data fetch in
+ a single cycle, so the trick of calculating the address into a
+ scratch register (one of the result regs) and then doing a load
+ multiple actually becomes slower (and no smaller in code size).
+ That is the transformation
+
+ ldr rd1, [rbase + offset]
+ ldr rd2, [rbase + offset + 4]
+
+ to
+
+ add rd1, rbase, offset
+ ldmia rd1, {rd1, rd2}
+
+ produces worse code -- '3 cycles + any stalls on rd2' instead of
+ '2 cycles + any stalls on rd2'. On ARMs with only one cache
+ access per cycle, the first sequence could never complete in less
+ than 6 cycles, whereas the ldm sequence would only take 5 and
+ would make better use of sequential accesses if not hitting the
+ cache.
+
+ We cheat here and test 'arm_ld_sched' which we currently know to
+ only be true for the ARM8, ARM9 and StrongARM. If this ever
+ changes, then the test below needs to be reworked. */
+ if (nops == 2 && arm_ld_sched && add_offset != 0)
+ return false;
+
+ return true;
+}
+
+/* Subroutine of load_multiple_sequence and store_multiple_sequence.
+   Given an array of UNSORTED_OFFSETS, of which there are NOPS, compute
+   an array ORDER that describes the sequence in which to access the
+   offsets so that they end up in ascending order.  In this sequence,
+   each offset must be larger by exactly 4 than the previous one.
+   ORDER[0] must have been filled in with the lowest offset by the caller.
+ If UNSORTED_REGS is nonnull, it is an array of register numbers that
+ we use to verify that ORDER produces an ascending order of registers.
+ Return true if it was possible to construct such an order, false if
+ not. */
- if (arm_ld_sched)
- {
- /* If the target has load delay slots, then there's no benefit
- to using an ldm instruction unless the offset is zero and
- we are optimizing for size. */
- return (optimize_size && (REGNO (reg0) == REGNO (reg1))
- && (val0 == 0 || val1 == 0 || val0 == 4 || val1 == 4)
- && (val_diff == 4 || val_diff == -4));
- }
+static bool
+compute_offset_order (int nops, HOST_WIDE_INT *unsorted_offsets, int *order,
+ int *unsorted_regs)
+{
+ int i;
+ for (i = 1; i < nops; i++)
+ {
+ int j;
- return ((REGNO (reg0) == REGNO (reg1))
- && (val_diff == 4 || val_diff == -4));
+ order[i] = order[i - 1];
+ for (j = 0; j < nops; j++)
+ if (unsorted_offsets[j] == unsorted_offsets[order[i - 1]] + 4)
+ {
+ /* We must find exactly one offset that is higher than the
+ previous one by 4. */
+ if (order[i] != order[i - 1])
+ return false;
+ order[i] = j;
+ }
+ if (order[i] == order[i - 1])
+ return false;
+ /* The register numbers must be ascending. */
+ if (unsorted_regs != NULL
+ && unsorted_regs[order[i]] <= unsorted_regs[order[i - 1]])
+ return false;
}
-
- return 0;
+ return true;
}
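+
+/* Example (sketch): given unsorted_offsets = {4, 12, 0, 8} and order[0]
+   pre-set to 2 (the index of offset 0), compute_offset_order fills in
+   order = {2, 0, 3, 1}, visiting the offsets as 0, 4, 8, 12.  */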
int
load_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
HOST_WIDE_INT *load_offset)
{
- int unsorted_regs[4];
- HOST_WIDE_INT unsorted_offsets[4];
- int order[4];
+ int unsorted_regs[MAX_LDM_STM_OPS];
+ HOST_WIDE_INT unsorted_offsets[MAX_LDM_STM_OPS];
+ int order[MAX_LDM_STM_OPS];
int base_reg = -1;
- int i;
+ int i, ldm_case;
- /* Can only handle 2, 3, or 4 insns at present,
- though could be easily extended if required. */
- gcc_assert (nops >= 2 && nops <= 4);
+ /* Can only handle up to MAX_LDM_STM_OPS insns at present, though could be
+ easily extended if required. */
+ gcc_assert (nops >= 2 && nops <= MAX_LDM_STM_OPS);
- memset (order, 0, 4 * sizeof (int));
+ memset (order, 0, MAX_LDM_STM_OPS * sizeof (int));
/* Loop over the operands and check that the memory references are
suitable (i.e. immediate offsets from the same base register). At
== CONST_INT)))
{
if (i == 0)
- {
- base_reg = REGNO (reg);
- unsorted_regs[0] = (GET_CODE (operands[i]) == REG
- ? REGNO (operands[i])
- : REGNO (SUBREG_REG (operands[i])));
- order[0] = 0;
- }
+ base_reg = REGNO (reg);
else
{
if (base_reg != (int) REGNO (reg))
/* Not addressed from the same base register. */
return 0;
-
- unsorted_regs[i] = (GET_CODE (operands[i]) == REG
- ? REGNO (operands[i])
- : REGNO (SUBREG_REG (operands[i])));
- if (unsorted_regs[i] < unsorted_regs[order[0]])
- order[0] = i;
}
+ unsorted_regs[i] = (GET_CODE (operands[i]) == REG
+ ? REGNO (operands[i])
+ : REGNO (SUBREG_REG (operands[i])));
/* If it isn't an integer register, or if it overwrites the
base register but isn't the last insn in the list, then
return 0;
unsorted_offsets[i] = INTVAL (offset);
+ if (i == 0 || unsorted_offsets[i] < unsorted_offsets[order[0]])
+ order[0] = i;
}
else
/* Not a suitable memory address. */
/* All the useful information has now been extracted from the
operands into unsorted_regs and unsorted_offsets; additionally,
- order[0] has been set to the lowest numbered register in the
- list. Sort the registers into order, and check that the memory
- offsets are ascending and adjacent. */
-
- for (i = 1; i < nops; i++)
- {
- int j;
-
- order[i] = order[i - 1];
- for (j = 0; j < nops; j++)
- if (unsorted_regs[j] > unsorted_regs[order[i - 1]]
- && (order[i] == order[i - 1]
- || unsorted_regs[j] < unsorted_regs[order[i]]))
- order[i] = j;
-
- /* Have we found a suitable register? if not, one must be used more
- than once. */
- if (order[i] == order[i - 1])
- return 0;
-
- /* Is the memory address adjacent and ascending? */
- if (unsorted_offsets[order[i]] != unsorted_offsets[order[i - 1]] + 4)
- return 0;
- }
+ order[0] has been set to the lowest offset in the list. Sort
+ the offsets into order, verifying that they are adjacent, and
+ check that the register numbers are ascending. */
+ if (!compute_offset_order (nops, unsorted_offsets, order, unsorted_regs))
+ return 0;
if (base)
{
}
if (unsorted_offsets[order[0]] == 0)
- return 1; /* ldmia */
-
- if (TARGET_ARM && unsorted_offsets[order[0]] == 4)
- return 2; /* ldmib */
-
- if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0)
- return 3; /* ldmda */
-
- if (unsorted_offsets[order[nops - 1]] == -4)
- return 4; /* ldmdb */
-
- /* For ARM8,9 & StrongARM, 2 ldr instructions are faster than an ldm
- if the offset isn't small enough. The reason 2 ldrs are faster
- is because these ARMs are able to do more than one cache access
- in a single cycle. The ARM9 and StrongARM have Harvard caches,
- whilst the ARM8 has a double bandwidth cache. This means that
- these cores can do both an instruction fetch and a data fetch in
- a single cycle, so the trick of calculating the address into a
- scratch register (one of the result regs) and then doing a load
- multiple actually becomes slower (and no smaller in code size).
- That is the transformation
-
- ldr rd1, [rbase + offset]
- ldr rd2, [rbase + offset + 4]
-
- to
-
- add rd1, rbase, offset
- ldmia rd1, {rd1, rd2}
-
- produces worse code -- '3 cycles + any stalls on rd2' instead of
- '2 cycles + any stalls on rd2'. On ARMs with only one cache
- access per cycle, the first sequence could never complete in less
- than 6 cycles, whereas the ldm sequence would only take 5 and
- would make better use of sequential accesses if not hitting the
- cache.
+ ldm_case = 1; /* ldmia */
+ else if (TARGET_ARM && unsorted_offsets[order[0]] == 4)
+ ldm_case = 2; /* ldmib */
+ else if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0)
+ ldm_case = 3; /* ldmda */
+ else if (unsorted_offsets[order[nops - 1]] == -4)
+ ldm_case = 4; /* ldmdb */
+ else if (const_ok_for_arm (unsorted_offsets[order[0]])
+ || const_ok_for_arm (-unsorted_offsets[order[0]]))
+ ldm_case = 5;
+ else
+ return 0;
- We cheat here and test 'arm_ld_sched' which we currently know to
- only be true for the ARM8, ARM9 and StrongARM. If this ever
- changes, then the test below needs to be reworked. */
- if (nops == 2 && arm_ld_sched)
+ if (!multiple_operation_profitable_p (false, nops,
+ ldm_case == 5
+ ? unsorted_offsets[order[0]] : 0))
return 0;
- /* Can't do it without setting up the offset, only do this if it takes
- no more than one insn. */
- return (const_ok_for_arm (unsorted_offsets[order[0]])
- || const_ok_for_arm (-unsorted_offsets[order[0]])) ? 5 : 0;
+ return ldm_case;
}
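/* Neither compute_offset_order nor multiple_operation_profitable_p
   appears in this hunk.  What follows are minimal sketches of the
   behaviour the callers above assume, for illustration only, not
   necessarily the actual implementations.  */

static bool
compute_offset_order (int nops, HOST_WIDE_INT *unsorted_offsets,
		      int *order, int *unsorted_regs)
{
  int i;

  /* ORDER[0] already indexes the lowest offset; extend the chain by
     finding, at each step, the offset one word above the previous.  */
  for (i = 1; i < nops; i++)
    {
      int j;

      order[i] = -1;
      for (j = 0; j < nops; j++)
	if (unsorted_offsets[j] == unsorted_offsets[order[i - 1]] + 4)
	  order[i] = j;

      /* The offsets are not adjacent and ascending.  */
      if (order[i] < 0)
	return false;

      /* Check that the register numbers ascend with the offsets.  */
      if (unsorted_regs != NULL
	  && unsorted_regs[order[i]] <= unsorted_regs[order[i - 1]])
	return false;
    }

  return true;
}

static bool
multiple_operation_profitable_p (bool is_store ATTRIBUTE_UNUSED,
				 int nops, HOST_WIDE_INT add_offset)
{
  /* On ARM8, ARM9 and StrongARM (the cores for which arm_ld_sched
     holds), two ldr/str insns are faster than an ldm/stm that also
     needs an add to set up the base offset; see the cost analysis in
     the comment deleted above.  */
  if (nops == 2 && arm_ld_sched && add_offset != 0)
    return false;

  return true;
}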
const char *
emit_ldm_seq (rtx *operands, int nops)
{
- int regs[4];
+ int regs[MAX_LDM_STM_OPS];
int base_reg;
HOST_WIDE_INT offset;
char buf[100];
store_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
HOST_WIDE_INT * load_offset)
{
- int unsorted_regs[4];
- HOST_WIDE_INT unsorted_offsets[4];
- int order[4];
+ int unsorted_regs[MAX_LDM_STM_OPS];
+ HOST_WIDE_INT unsorted_offsets[MAX_LDM_STM_OPS];
+ int order[MAX_LDM_STM_OPS];
int base_reg = -1;
- int i;
+ int i, stm_case;
- /* Can only handle 2, 3, or 4 insns at present, though could be easily
- extended if required. */
- gcc_assert (nops >= 2 && nops <= 4);
+ /* Can only handle up to MAX_LDM_STM_OPS insns at present, though could be
+ easily extended if required. */
+ gcc_assert (nops >= 2 && nops <= MAX_LDM_STM_OPS);
- memset (order, 0, 4 * sizeof (int));
+ memset (order, 0, MAX_LDM_STM_OPS * sizeof (int));
/* Loop over the operands and check that the memory references are
suitable (i.e. immediate offsets from the same base register). At
&& (GET_CODE (offset = XEXP (XEXP (operands[nops + i], 0), 1))
== CONST_INT)))
{
+ unsorted_regs[i] = (GET_CODE (operands[i]) == REG
+ ? REGNO (operands[i])
+ : REGNO (SUBREG_REG (operands[i])));
if (i == 0)
- {
- base_reg = REGNO (reg);
- unsorted_regs[0] = (GET_CODE (operands[i]) == REG
- ? REGNO (operands[i])
- : REGNO (SUBREG_REG (operands[i])));
- order[0] = 0;
- }
- else
- {
- if (base_reg != (int) REGNO (reg))
- /* Not addressed from the same base register. */
- return 0;
-
- unsorted_regs[i] = (GET_CODE (operands[i]) == REG
- ? REGNO (operands[i])
- : REGNO (SUBREG_REG (operands[i])));
- if (unsorted_regs[i] < unsorted_regs[order[0]])
- order[0] = i;
- }
+ base_reg = REGNO (reg);
+ else if (base_reg != (int) REGNO (reg))
+ /* Not addressed from the same base register. */
+ return 0;
/* If it isn't an integer register, then we can't do this. */
if (unsorted_regs[i] < 0 || unsorted_regs[i] > 14)
return 0;
unsorted_offsets[i] = INTVAL (offset);
+ if (i == 0 || unsorted_offsets[i] < unsorted_offsets[order[0]])
+ order[0] = i;
}
else
/* Not a suitable memory address. */
/* All the useful information has now been extracted from the
operands into unsorted_regs and unsorted_offsets; additionally,
- order[0] has been set to the lowest numbered register in the
- list. Sort the registers into order, and check that the memory
- offsets are ascending and adjacent. */
-
- for (i = 1; i < nops; i++)
- {
- int j;
-
- order[i] = order[i - 1];
- for (j = 0; j < nops; j++)
- if (unsorted_regs[j] > unsorted_regs[order[i - 1]]
- && (order[i] == order[i - 1]
- || unsorted_regs[j] < unsorted_regs[order[i]]))
- order[i] = j;
-
- /* Have we found a suitable register? if not, one must be used more
- than once. */
- if (order[i] == order[i - 1])
- return 0;
-
- /* Is the memory address adjacent and ascending? */
- if (unsorted_offsets[order[i]] != unsorted_offsets[order[i - 1]] + 4)
- return 0;
- }
+     order[0] has been set to the index of the operand with the lowest
+     offset.  Sort the offsets into order, verifying that they are
+     adjacent, and check that the register numbers are ascending.  */
+ if (!compute_offset_order (nops, unsorted_offsets, order, unsorted_regs))
+ return 0;
if (base)
{
}
if (unsorted_offsets[order[0]] == 0)
- return 1; /* stmia */
-
- if (unsorted_offsets[order[0]] == 4)
- return 2; /* stmib */
-
- if (unsorted_offsets[order[nops - 1]] == 0)
- return 3; /* stmda */
+ stm_case = 1; /* stmia */
+ else if (TARGET_ARM && unsorted_offsets[order[0]] == 4)
+ stm_case = 2; /* stmib */
+ else if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0)
+ stm_case = 3; /* stmda */
+ else if (unsorted_offsets[order[nops - 1]] == -4)
+ stm_case = 4; /* stmdb */
+ else
+ return 0;
- if (unsorted_offsets[order[nops - 1]] == -4)
- return 4; /* stmdb */
+ if (!multiple_operation_profitable_p (false, nops, 0))
+ return 0;
- return 0;
+ return stm_case;
}
const char *
emit_stm_seq (rtx *operands, int nops)
{
- int regs[4];
+ int regs[MAX_LDM_STM_OPS];
int base_reg;
HOST_WIDE_INT offset;
char buf[100];
/* A compare with a shifted operand. Because of canonicalization, the
comparison will have to be swapped when we emit the assembler. */
- if (GET_MODE (y) == SImode && GET_CODE (y) == REG
+ if (GET_MODE (y) == SImode
+      && (REG_P (y) || GET_CODE (y) == SUBREG)
&& (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
|| GET_CODE (x) == LSHIFTRT || GET_CODE (x) == ROTATE
|| GET_CODE (x) == ROTATERT))
/* This operation is performed swapped, but since we only rely on the Z
flag we don't need an additional mode. */
- if (GET_MODE (y) == SImode && REG_P (y)
+ if (GET_MODE (y) == SImode
+      && (REG_P (y) || GET_CODE (y) == SUBREG)
&& GET_CODE (x) == NEG
&& (op == EQ || op == NE))
return CC_Zmode;
XVECEXP (par, 0, 0)
= gen_rtx_SET (VOIDmode,
- gen_frame_mem (BLKmode,
- gen_rtx_PRE_DEC (BLKmode,
- stack_pointer_rtx)),
+		   gen_frame_mem (BLKmode,
+				  gen_rtx_PRE_MODIFY (Pmode,
+						      stack_pointer_rtx,
+						      plus_constant
+						      (stack_pointer_rtx,
+						       -(count * 8)))),
gen_rtx_UNSPEC (BLKmode,
gen_rtvec (1, reg),
UNSPEC_PUSH_MULT));
return "";
}
-/* Output a 'call' insn that is a reference in memory. */
+/* Output a 'call' insn that is a reference in memory.  This is never
+   used on ARMv5 or later; there we load the address into a register
+   and use blx instead, because the memory-indirect sequence carries a
+   significant performance overhead.  */
const char *
output_call_mem (rtx *operands)
{
- if (TARGET_INTERWORK && !arm_arch5)
+ gcc_assert (!arm_arch5);
+ if (TARGET_INTERWORK)
{
output_asm_insn ("ldr%?\t%|ip, %0", operands);
output_asm_insn ("mov%?\t%|lr, %|pc", operands);
first instruction. It's safe to use IP as the target of the
load since the call will kill it anyway. */
output_asm_insn ("ldr%?\t%|ip, %0", operands);
- if (arm_arch5)
- output_asm_insn ("blx%?\t%|ip", operands);
+ output_asm_insn ("mov%?\t%|lr, %|pc", operands);
+ if (arm_arch4t)
+ output_asm_insn ("bx%?\t%|ip", operands);
else
- {
- output_asm_insn ("mov%?\t%|lr, %|pc", operands);
- if (arm_arch4t)
- output_asm_insn ("bx%?\t%|ip", operands);
- else
- output_asm_insn ("mov%?\t%|pc, %|ip", operands);
- }
+ output_asm_insn ("mov%?\t%|pc, %|ip", operands);
}
else
{
return "";
}
-
-/* Emit a MOVW/MOVT pair. */
-void arm_emit_movpair (rtx dest, rtx src)
-{
- emit_set_insn (dest, gen_rtx_HIGH (SImode, src));
- emit_set_insn (dest, gen_rtx_LO_SUM (SImode, dest, src));
-}
-
+/* Emit a MOVW/MOVT pair.  */
+void
+arm_emit_movpair (rtx dest, rtx src)
+{
+  /* If SRC is an immediate, split it into a move of the low half
+     followed by a MOVT-style insertion of the high half.  */
+  if (CONST_INT_P (src))
+    {
+      HOST_WIDE_INT val = INTVAL (src);
+      emit_set_insn (dest, GEN_INT (val & 0x0000ffff));
+      if ((val >> 16) & 0x0000ffff)
+	emit_set_insn (gen_rtx_ZERO_EXTRACT (SImode, dest, GEN_INT (16),
+					     GEN_INT (16)),
+		       GEN_INT ((val >> 16) & 0x0000ffff));
+      return;
+    }
+  emit_set_insn (dest, gen_rtx_HIGH (SImode, src));
+  emit_set_insn (dest, gen_rtx_LO_SUM (SImode, dest, src));
+}
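+/* For example, arm_emit_movpair (reg, GEN_INT (0x12345678)) emits a
+   set of the low half followed by a zero_extract set of bits 16..31,
+   which the mov/movt patterns in arm.md output roughly as
+   (illustrative syntax):
+
+	movw	r0, #0x5678
+	movt	r0, #0x1234  */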
/* Output a move from arm registers to an fpa register.
OPERANDS[0] is an fpa register.
{
/* We're only using DImode here because it's a convenient size. */
ops[0] = gen_rtx_REG (DImode, REGNO (reg) + 2 * i);
- ops[1] = adjust_address (mem, SImode, 8 * i);
+ ops[1] = adjust_address (mem, DImode, 8 * i);
if (reg_overlap_mentioned_p (ops[0], mem))
{
gcc_assert (overlap == -1);
&& crtl->uses_pic_offset_table)
save_reg_mask |= 1 << PIC_OFFSET_TABLE_REGNUM;
}
+  else if (IS_VOLATILE (func_type))
+ {
+ /* For noreturn functions we historically omitted register saves
+ altogether. However this really messes up debugging. As a
+ compromise save just the frame pointers. Combined with the link
+ register saved elsewhere this should be sufficient to get
+ a backtrace. */
+ if (frame_pointer_needed)
+ save_reg_mask |= 1 << HARD_FRAME_POINTER_REGNUM;
+ if (df_regs_ever_live_p (ARM_HARD_FRAME_POINTER_REGNUM))
+ save_reg_mask |= 1 << ARM_HARD_FRAME_POINTER_REGNUM;
+ if (df_regs_ever_live_p (THUMB_HARD_FRAME_POINTER_REGNUM))
+ save_reg_mask |= 1 << THUMB_HARD_FRAME_POINTER_REGNUM;
+ }
else
{
/* In the normal case we only need to save those registers
| (1 << LR_REGNUM)
| (1 << PC_REGNUM);
- /* Volatile functions do not return, so there
- is no need to save any other registers. */
- if (IS_VOLATILE (func_type))
- return save_reg_mask;
-
save_reg_mask |= arm_compute_save_reg0_reg12_mask ();
/* Decide if we need to save the link register.
gcc_assert (stack_adjust == 0 || stack_adjust == 4);
if (stack_adjust && arm_arch5 && TARGET_ARM)
- sprintf (instr, "ldm%sib\t%%|sp, {", conditional);
+	{
+	  if (TARGET_UNIFIED_ASM)
+	    sprintf (instr, "ldmib%s\t%%|sp, {", conditional);
+	  else
+	    sprintf (instr, "ldm%sib\t%%|sp, {", conditional);
+	}
else
{
/* If we can't use ldmib (SA110 bug),
then try to pop r3 instead. */
if (stack_adjust)
live_regs_mask |= 1 << 3;
- sprintf (instr, "ldm%sfd\t%%|sp, {", conditional);
+
+ if (TARGET_UNIFIED_ASM)
+ sprintf (instr, "ldmfd%s\t%%|sp, {", conditional);
+ else
+ sprintf (instr, "ldm%sfd\t%%|sp, {", conditional);
}
}
else
- sprintf (instr, "ldm%sfd\t%%|sp!, {", conditional);
+ if (TARGET_UNIFIED_ASM)
+ sprintf (instr, "pop%s\t{", conditional);
+ else
+ sprintf (instr, "ldm%sfd\t%%|sp!, {", conditional);
p = instr + strlen (instr);
/* This variable is for the Virtual Frame Pointer, not VFP regs. */
int vfp_offset = offsets->frame;
- if (arm_fpu_arch == FPUTYPE_FPA_EMU2)
+ if (TARGET_FPA_EMU2)
{
for (reg = LAST_FPA_REGNUM; reg >= FIRST_FPA_REGNUM; reg--)
if (df_regs_ever_live_p (reg) && !call_used_regs[reg])
SP_REGNUM, HARD_FRAME_POINTER_REGNUM);
}
- if (arm_fpu_arch == FPUTYPE_FPA_EMU2)
+ if (TARGET_FPA_EMU2)
{
for (reg = FIRST_FPA_REGNUM; reg <= LAST_FPA_REGNUM; reg++)
if (df_regs_ever_live_p (reg) && !call_used_regs[reg])
if (TARGET_HARD_FLOAT && TARGET_VFP)
{
- start_reg = FIRST_VFP_REGNUM;
- for (reg = FIRST_VFP_REGNUM; reg < LAST_VFP_REGNUM; reg += 2)
+ int end_reg = LAST_VFP_REGNUM + 1;
+
+ /* Scan the registers in reverse order. We need to match
+ any groupings made in the prologue and generate matching
+ pop operations. */
+ for (reg = LAST_VFP_REGNUM - 1; reg >= FIRST_VFP_REGNUM; reg -= 2)
{
if ((!df_regs_ever_live_p (reg) || call_used_regs[reg])
- && (!df_regs_ever_live_p (reg + 1) || call_used_regs[reg + 1]))
+ && (!df_regs_ever_live_p (reg + 1)
+ || call_used_regs[reg + 1]))
{
- if (start_reg != reg)
+ if (end_reg > reg + 2)
vfp_output_fldmd (f, SP_REGNUM,
- (start_reg - FIRST_VFP_REGNUM) / 2,
- (reg - start_reg) / 2);
- start_reg = reg + 2;
+ (reg + 2 - FIRST_VFP_REGNUM) / 2,
+ (end_reg - (reg + 2)) / 2);
+ end_reg = reg;
}
}
- if (start_reg != reg)
- vfp_output_fldmd (f, SP_REGNUM,
- (start_reg - FIRST_VFP_REGNUM) / 2,
- (reg - start_reg) / 2);
+ if (end_reg > reg + 2)
+ vfp_output_fldmd (f, SP_REGNUM, 0,
+ (end_reg - (reg + 2)) / 2);
}
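/* Worked example: if only d8, d9 and d11 must be restored, the
   reverse scan emits a load-multiple for {d11} first and then one for
   {d8-d9}, the reverse of the order in which the prologue pushed the
   two groups.  */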
+
if (TARGET_IWMMXT)
for (reg = FIRST_IWMMXT_REGNUM; reg <= LAST_IWMMXT_REGNUM; reg++)
if (df_regs_ever_live_p (reg) && !call_used_regs[reg])
/* For the body of the insn we are going to generate an UNSPEC in
parallel with several USEs. This allows the insn to be recognized
- by the push_multi pattern in the arm.md file. The insn looks
- something like this:
+ by the push_multi pattern in the arm.md file.
+
+ The body of the insn looks something like this:
(parallel [
-	 (set (mem:BLK (pre_dec:BLK (reg:SI sp)))
+	 (set (mem:BLK (pre_modify:SI (reg:SI sp)
+				      (plus:SI (reg:SI sp)
+					       (const_int:SI <num>))))
(unspec:BLK [(reg:SI r4)] UNSPEC_PUSH_MULT))
- (use (reg:SI 11 fp))
- (use (reg:SI 12 ip))
- (use (reg:SI 14 lr))
- (use (reg:SI 15 pc))
+ (use (reg:SI XX))
+ (use (reg:SI YY))
+ ...
])
For the frame note however, we try to be more explicit and actually
(sequence [
(set (reg:SI sp) (plus:SI (reg:SI sp) (const_int -20)))
(set (mem:SI (reg:SI sp)) (reg:SI r4))
- (set (mem:SI (plus:SI (reg:SI sp) (const_int 4))) (reg:SI fp))
- (set (mem:SI (plus:SI (reg:SI sp) (const_int 8))) (reg:SI ip))
- (set (mem:SI (plus:SI (reg:SI sp) (const_int 12))) (reg:SI lr))
+ (set (mem:SI (plus:SI (reg:SI sp) (const_int 4))) (reg:SI XX))
+ (set (mem:SI (plus:SI (reg:SI sp) (const_int 8))) (reg:SI YY))
+ ...
])
- This sequence is used both by the code to support stack unwinding for
- exceptions handlers and the code to generate dwarf2 frame debugging. */
+     FIXME: In an ideal world the PRE_MODIFY would not exist and
+     instead we'd have a parallel expression detailing all
+     the stores to the various memory addresses so that debug
+     information is more up-to-date.  Whoever rewrites this should
+     remember the operand constraints on the push instruction.
+
+ Note also that this has to be taken care of for the VFP registers.
+
+ For more see PR43399. */
par = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (num_regs));
dwarf = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (num_dwarf_regs + 1));
XVECEXP (par, 0, 0)
= gen_rtx_SET (VOIDmode,
- gen_frame_mem (BLKmode,
- gen_rtx_PRE_DEC (BLKmode,
- stack_pointer_rtx)),
+		    gen_frame_mem (BLKmode,
+				   gen_rtx_PRE_MODIFY (Pmode,
+						       stack_pointer_rtx,
+						       plus_constant
+						       (stack_pointer_rtx,
+							-4 * num_regs))),
gen_rtx_UNSPEC (BLKmode,
gen_rtvec (1, reg),
UNSPEC_PUSH_MULT));
{
tmp
= gen_rtx_SET (VOIDmode,
			   gen_frame_mem (SImode,
					  plus_constant (stack_pointer_rtx,
							 4 * j)),
reg);
RTX_FRAME_RELATED_P (tmp) = 1;
XVECEXP (dwarf, 0, dwarf_par_index++) = tmp;
XVECEXP (par, 0, 0)
= gen_rtx_SET (VOIDmode,
- gen_frame_mem (BLKmode,
- gen_rtx_PRE_DEC (BLKmode,
- stack_pointer_rtx)),
+		   gen_frame_mem (BLKmode,
+				  gen_rtx_PRE_MODIFY (Pmode,
+						      stack_pointer_rtx,
+						      plus_constant
+						      (stack_pointer_rtx,
+						       -12 * count))),
gen_rtx_UNSPEC (BLKmode,
gen_rtvec (1, reg),
UNSPEC_PUSH_MULT));
}
}
+/* Given FROM and TO register numbers, say whether this elimination is
+   allowed.  Frame pointer elimination is automatically handled.
+
+   Note that ARG_POINTER_REGNUM and HARD_FRAME_POINTER_REGNUM are in
+   fact the same thing.  If we need a frame pointer, we must eliminate
+   FRAME_POINTER_REGNUM into HARD_FRAME_POINTER_REGNUM and not into
+   STACK_POINTER_REGNUM or ARG_POINTER_REGNUM.  */
+
+bool
+arm_can_eliminate (const int from, const int to)
+{
+ return ((to == FRAME_POINTER_REGNUM && from == ARG_POINTER_REGNUM) ? false :
+ (to == STACK_POINTER_REGNUM && frame_pointer_needed) ? false :
+ (to == ARM_HARD_FRAME_POINTER_REGNUM && TARGET_THUMB) ? false :
+ (to == THUMB_HARD_FRAME_POINTER_REGNUM && TARGET_ARM) ? false :
+ true);
+}
/* Emit RTL to save coprocessor registers on function entry. Returns the
number of bytes pushed. */
for (reg = LAST_IWMMXT_REGNUM; reg >= FIRST_IWMMXT_REGNUM; reg--)
if (df_regs_ever_live_p (reg) && ! call_used_regs[reg])
{
- insn = gen_rtx_PRE_DEC (V2SImode, stack_pointer_rtx);
+ insn = gen_rtx_PRE_DEC (Pmode, stack_pointer_rtx);
insn = gen_rtx_MEM (V2SImode, insn);
insn = emit_set_insn (insn, gen_rtx_REG (V2SImode, reg));
RTX_FRAME_RELATED_P (insn) = 1;
/* Save any floating point call-saved registers used by this
function. */
- if (arm_fpu_arch == FPUTYPE_FPA_EMU2)
+ if (TARGET_FPA_EMU2)
{
for (reg = LAST_FPA_REGNUM; reg >= FIRST_FPA_REGNUM; reg--)
if (df_regs_ever_live_p (reg) && !call_used_regs[reg])
{
- insn = gen_rtx_PRE_DEC (XFmode, stack_pointer_rtx);
+ insn = gen_rtx_PRE_DEC (Pmode, stack_pointer_rtx);
insn = gen_rtx_MEM (XFmode, insn);
insn = emit_set_insn (insn, gen_rtx_REG (XFmode, reg));
RTX_FRAME_RELATED_P (insn) = 1;
{
REAL_VALUE_TYPE r;
REAL_VALUE_FROM_CONST_DOUBLE (r, x);
- r = REAL_VALUE_NEGATE (r);
+ r = real_value_negate (&r);
fprintf (stream, "%s", fp_const_from_val (&r));
}
return;
}
return;
+ /* Print the high single-precision register of a VFP double-precision
+ register. */
+ case 'p':
+ {
+ int mode = GET_MODE (x);
+ int regno;
+
+ if (GET_MODE_SIZE (mode) != 8 || GET_CODE (x) != REG)
+ {
+ output_operand_lossage ("invalid operand for code '%c'", code);
+ return;
+ }
+
+ regno = REGNO (x);
+ if (!VFP_REGNO_OK_FOR_DOUBLE (regno))
+ {
+ output_operand_lossage ("invalid operand for code '%c'", code);
+ return;
+ }
+
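+      /* E.g. for d5, whose single-precision aliases are s10 and s11,
+	 this prints "s11".  */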
+ fprintf (stream, "s%d", regno - FIRST_VFP_REGNUM + 1);
+ }
+ return;
+
/* Print a VFP/Neon double precision or quad precision register name. */
case 'P':
case 'q':
}
return;
+    /* Translate an S register number into a D register number and
+       element index.  */
+ case 'y':
+ {
+ int mode = GET_MODE (x);
+ int regno;
+
+ if (GET_MODE_SIZE (mode) != 4 || GET_CODE (x) != REG)
+ {
+ output_operand_lossage ("invalid operand for code '%c'", code);
+ return;
+ }
+
+ regno = REGNO (x);
+ if (!VFP_REGNO_OK_FOR_SINGLE (regno))
+ {
+ output_operand_lossage ("invalid operand for code '%c'", code);
+ return;
+ }
+
+ regno = regno - FIRST_VFP_REGNUM;
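+	/* E.g. for s5 this prints "d2[1]".  */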
+ fprintf (stream, "d%d[%d]", regno / 2, regno % 2);
+ }
+ return;
+
/* Register specifier for vld1.16/vst1.16. Translate the S register
number into a D register number and element index. */
case 'z':
return VFP_REGNO_OK_FOR_DOUBLE (regno);
/* VFP registers can hold HFmode values, but there is no point in
- putting them there unless we have the NEON extensions for
- loading/storing them, too. */
+ putting them there unless we have hardware conversion insns. */
if (mode == HFmode)
- return TARGET_NEON_FP16 && VFP_REGNO_OK_FOR_SINGLE (regno);
+ return TARGET_FP16 && VFP_REGNO_OK_FOR_SINGLE (regno);
if (TARGET_NEON)
return (VALID_NEON_DREG_MODE (mode) && VFP_REGNO_OK_FOR_DOUBLE (regno))
}
}
+/* Given the stack offsets and register mask in OFFSETS, decide
+ how many additional registers to push instead of subtracting
+ a constant from SP. */
+static int
+thumb1_extra_regs_pushed (arm_stack_offsets *offsets)
+{
+ HOST_WIDE_INT amount = offsets->outgoing_args - offsets->saved_regs;
+ unsigned long live_regs_mask = offsets->saved_regs_mask;
+ /* Extract a mask of the ones we can give to the Thumb's push instruction. */
+ unsigned long l_mask = live_regs_mask & 0x40ff;
+ /* Then count how many other high registers will need to be pushed. */
+ unsigned long high_regs_pushed = bit_count (live_regs_mask & 0x0f00);
+ int n_free;
+
+ /* If the stack frame size is 512 exactly, we can save one load
+ instruction, which should make this a win even when optimizing
+ for speed. */
+ if (!optimize_size && amount != 512)
+ return 0;
+
+ /* Can't do this if there are high registers to push, or if we
+ are not going to do a push at all. */
+ if (high_regs_pushed != 0 || l_mask == 0)
+ return 0;
+
+ /* Don't do this if thumb1_expand_prologue wants to emit instructions
+ between the push and the stack frame allocation. */
+ if ((flag_pic && arm_pic_register != INVALID_REGNUM)
+ || (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0))
+ return 0;
+
+ for (n_free = 0; n_free < 8 && !(live_regs_mask & 1); live_regs_mask >>= 1)
+ n_free++;
+
+ if (n_free == 0)
+ return 0;
+ gcc_assert (amount / 4 * 4 == amount);
+
+ if (amount >= 512 && (amount - n_free * 4) < 512)
+ return (amount - 508) / 4;
+ if (amount <= n_free * 4)
+ return amount / 4;
+ return 0;
+}
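/* Worked example: with amount == 512 and r0 not live (n_free >= 1),
   (amount - n_free * 4) < 512 holds, so the function returns
   (512 - 508) / 4 == 1.  Pushing r0 as an extra register leaves a
   508-byte adjustment, the largest immediate the Thumb-1
   "sub sp, #imm" encoding accepts, so no scratch-register load of
   the constant is needed.  */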
+
/* Generate the rest of a function's prologue. */
void
thumb1_expand_prologue (void)
stack_pointer_rtx);
amount = offsets->outgoing_args - offsets->saved_regs;
+ amount -= 4 * thumb1_extra_regs_pushed (offsets);
if (amount)
{
if (amount < 512)
register. */
else if ((l_mask & 0xff) != 0
|| (high_regs_pushed == 0 && l_mask))
- thumb_pushpop (f, l_mask, 1, &cfa_offset, l_mask);
+ {
+ unsigned long mask = l_mask;
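+	  /* Also push the lowest registers as scratch to absorb part
+	     of the stack adjustment; see thumb1_extra_regs_pushed.  */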
+ mask |= (1 << thumb1_extra_regs_pushed (offsets)) - 1;
+ thumb_pushpop (f, mask, 1, &cfa_offset, mask);
+ }
if (high_regs_pushed)
{
}
else
{
- int set_float_abi_attributes = 0;
- switch (arm_fpu_arch)
- {
- case FPUTYPE_FPA:
- fpu_name = "fpa";
- break;
- case FPUTYPE_FPA_EMU2:
- fpu_name = "fpe2";
- break;
- case FPUTYPE_FPA_EMU3:
- fpu_name = "fpe3";
- break;
- case FPUTYPE_MAVERICK:
- fpu_name = "maverick";
- break;
- case FPUTYPE_VFP:
- fpu_name = "vfp";
- set_float_abi_attributes = 1;
- break;
- case FPUTYPE_VFP3D16:
- fpu_name = "vfpv3-d16";
- set_float_abi_attributes = 1;
- break;
- case FPUTYPE_VFP3:
- fpu_name = "vfpv3";
- set_float_abi_attributes = 1;
- break;
- case FPUTYPE_NEON:
- fpu_name = "neon";
- set_float_abi_attributes = 1;
- break;
- case FPUTYPE_NEON_FP16:
- fpu_name = "neon-fp16";
- set_float_abi_attributes = 1;
- break;
- default:
- abort();
- }
- if (set_float_abi_attributes)
+ fpu_name = arm_fpu_desc->name;
+ if (arm_fpu_desc->model == ARM_FP_MODEL_VFP)
{
if (TARGET_HARD_FLOAT)
asm_fprintf (asm_out_file, "\t.eabi_attribute 27, 3\n");
return false;
}
+/* Implement TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P.  */
+bool
+arm_small_register_classes_for_mode_p (enum machine_mode mode ATTRIBUTE_UNUSED)
+{
+ return TARGET_THUMB1;
+}
+
/* Implement TARGET_SHIFT_TRUNCATION_MASK. SImode shifts use normal
ARM insns and therefore guarantee that the shift count is modulo 256.
DImode shifts (those implemented by lib1funcs.asm or by optabs.c)
if (IS_FPA_REGNUM (regno))
return (TARGET_AAPCS_BASED ? 96 : 16) + regno - FIRST_FPA_REGNUM;
- /* FIXME: VFPv3 register numbering. */
if (IS_VFP_REGNUM (regno))
- return 64 + regno - FIRST_VFP_REGNUM;
+ {
+ /* See comment in arm_dwarf_register_span. */
+ if (VFP_REGNO_OK_FOR_SINGLE (regno))
+ return 64 + regno - FIRST_VFP_REGNUM;
+ else
+ return 256 + (regno - FIRST_VFP_REGNUM) / 2;
+ }
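+  /* E.g. s5 maps to 64 + 5 == 69, while d16, which has no
+     single-precision alias, maps to 256 + 16 == 272.  */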
if (IS_IWMMXT_GR_REGNUM (regno))
return 104 + regno - FIRST_IWMMXT_GR_REGNUM;
gcc_unreachable ();
}
+/* DWARF models VFPv3 registers as 32 64-bit registers.
+   GCC models them as 64 32-bit registers, so we need to describe this
+   to the DWARF generation code.  Other registers can use the default.  */
+static rtx
+arm_dwarf_register_span (rtx rtl)
+{
+ unsigned regno;
+ int nregs;
+ int i;
+ rtx p;
+
+ regno = REGNO (rtl);
+ if (!IS_VFP_REGNUM (regno))
+ return NULL_RTX;
+
+ /* XXX FIXME: The EABI defines two VFP register ranges:
+ 64-95: Legacy VFPv2 numbering for S0-S31 (obsolescent)
+ 256-287: D0-D31
+ The recommended encoding for S0-S31 is a DW_OP_bit_piece of the
+ corresponding D register. Until GDB supports this, we shall use the
+ legacy encodings. We also use these encodings for D0-D15 for
+ compatibility with older debuggers. */
+ if (VFP_REGNO_OK_FOR_SINGLE (regno))
+ return NULL_RTX;
+
+ nregs = GET_MODE_SIZE (GET_MODE (rtl)) / 8;
+ p = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (nregs));
+ regno = (regno - FIRST_VFP_REGNUM) / 2;
+ for (i = 0; i < nregs; i++)
+ XVECEXP (p, 0, i) = gen_rtx_REG (DImode, 256 + regno + i);
+
+ return p;
+}
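/* For example, a 16-byte vector value living in d16 yields nregs == 2
   and a span of (parallel [(reg:DI 272) (reg:DI 273)]), i.e. D16 and
   D17 in the 256-based DWARF numbering.  */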
#ifdef TARGET_UNWIND_INFO
/* Emit unwind directives for a store-multiple instruction or stack pointer
offset = INTVAL (XEXP (e1, 1));
asm_fprintf (asm_out_file, "\t.setfp %r, %r, #%wd\n",
HARD_FRAME_POINTER_REGNUM, reg,
- INTVAL (XEXP (e1, 1)));
+ offset);
}
else if (GET_CODE (e1) == REG)
{
fputc (')', fp);
return TRUE;
}
+ else if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SYMBOL_OFFSET)
+ {
+ output_addr_const (fp, XVECEXP (x, 0, 0));
+ if (GOT_PCREL)
+ fputs ("+.", fp);
+ fputs ("-(", fp);
+ output_addr_const (fp, XVECEXP (x, 0, 1));
+ fputc (')', fp);
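+      /* The output is e.g. "sym-(anchor)", or "sym+.-(anchor)" when
+	 GOT_PCREL is set, where sym and anchor are the two operands of
+	 the UNSPEC.  */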
+ return TRUE;
+ }
else if (GET_CODE (x) == CONST_VECTOR)
return arm_emit_vector_const (fp, x);
thumb1_output_casesi (rtx *operands)
{
rtx diff_vec = PATTERN (next_real_insn (operands[0]));
- addr_diff_vec_flags flags;
gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
- flags = ADDR_DIFF_VEC_FLAGS (diff_vec);
-
switch (GET_MODE(diff_vec))
{
case QImode:
&& lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
{
static bool warned;
- if (!warned && warn_psabi)
+ if (!warned && warn_psabi && !in_system_header)
{
warned = true;
inform (input_location,
|| (TARGET_ARM && TARGET_APCS_FRAME && ! leaf_function_p ()));
}
+/* Thumb-1 is the only target without conditional execution, so
+   return true unless we are generating Thumb-1 code.  */
+static bool
+arm_have_conditional_execution (void)
+{
+ return !TARGET_THUMB1;
+}
+
#include "gt-arm.h"