2010-12-09 Yao Qi <yao@codesourcery.com>

[pf3gnuchains/gcc-fork.git] / gcc / config / arm / arm.c
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c

index 92f19c3..ac25365 100644 (file)
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -40,7 +40,7 @@
  #include "function.h"
  #include "expr.h"
  #include "optabs.h"
-#include "toplev.h"
+#include "diagnostic-core.h"
  #include "recog.h"
  #include "cgraph.h"
  #include "ggc.h"
@@ -55,6 +55,7 @@
  #include "df.h"
  #include "intl.h"
  #include "libfuncs.h"
+#include "params.h"
  
  /* Forward definitions of types.  */
  typedef struct minipool_node    Mnode;
@@ -63,6 +64,7 @@ typedef struct minipool_fixup   Mfix;
  void (*arm_lang_output_object_attributes_hook)(void);
  
  /* Forward function declarations.  */
+static bool arm_needs_doubleword_align (enum machine_mode, const_tree);
  static int arm_compute_static_chain_stack_bytes (void);
  static arm_stack_offsets *arm_get_frame_offsets (void);
  static void arm_add_gc_roots (void);
@@ -162,6 +164,11 @@ static void emit_constant_insn (rtx cond, rtx pattern);
  static rtx emit_set_insn (rtx, rtx);
  static int arm_arg_partial_bytes (CUMULATIVE_ARGS *, enum machine_mode,
                                   tree, bool);
+static rtx arm_function_arg (CUMULATIVE_ARGS *, enum machine_mode,
+                            const_tree, bool);
+static void arm_function_arg_advance (CUMULATIVE_ARGS *, enum machine_mode,
+                                     const_tree, bool);
+static unsigned int arm_function_arg_boundary (enum machine_mode, const_tree);
  static rtx aapcs_allocate_return_reg (enum machine_mode, const_tree,
                                       const_tree);
  static int aapcs_select_return_coproc (const_tree, const_tree);
@@ -187,10 +194,13 @@ static bool arm_align_anon_bitfield (void);
  static bool arm_return_in_msb (const_tree);
  static bool arm_must_pass_in_stack (enum machine_mode, const_tree);
  static bool arm_return_in_memory (const_tree, const_tree);
-#ifdef TARGET_UNWIND_INFO
+#if ARM_UNWIND_INFO
  static void arm_unwind_emit (FILE *, rtx);
  static bool arm_output_ttype (rtx);
+static void arm_asm_emit_except_personality (rtx);
+static void arm_asm_init_sections (void);
  #endif
+static enum unwind_info_type arm_except_unwind_info (struct gcc_options *);
  static void arm_dwarf_handle_frame_unspec (const char *, rtx, int);
  static rtx arm_dwarf_register_span (rtx);
  
@@ -207,6 +217,7 @@ static void arm_init_libfuncs (void);
  static tree arm_build_builtin_va_list (void);
  static void arm_expand_builtin_va_start (tree, rtx);
  static tree arm_gimplify_va_arg_expr (tree, tree, gimple_seq *, gimple_seq *);
+static void arm_option_override (void);
  static bool arm_handle_option (size_t, const char *, int);
  static void arm_target_help (void);
  static unsigned HOST_WIDE_INT arm_shift_truncation_mask (enum machine_mode);
@@ -214,6 +225,7 @@ static bool arm_cannot_copy_insn_p (rtx);
  static bool arm_tls_symbol_p (rtx x);
  static int arm_issue_rate (void);
  static void arm_output_dwarf_dtprel (FILE *, int, rtx) ATTRIBUTE_UNUSED;
+static bool arm_output_addr_const_extra (FILE *, rtx);
  static bool arm_allocate_stack_slots_for_args (void);
  static const char *arm_invalid_parameter_type (const_tree t);
  static const char *arm_invalid_return_type (const_tree t);
@@ -226,6 +238,17 @@ static void arm_asm_trampoline_template (FILE *);
  static void arm_trampoline_init (rtx, tree, rtx);
  static rtx arm_trampoline_adjust_address (rtx);
  static rtx arm_pic_static_addr (rtx orig, rtx reg);
+static bool cortex_a9_sched_adjust_cost (rtx, rtx, rtx, int *);
+static bool xscale_sched_adjust_cost (rtx, rtx, rtx, int *);
+static enum machine_mode arm_preferred_simd_mode (enum machine_mode);
+static bool arm_class_likely_spilled_p (reg_class_t);
+static bool arm_vector_alignment_reachable (const_tree type, bool is_packed);
+static bool arm_builtin_support_vector_misalignment (enum machine_mode mode,
+                                                    const_tree type,
+                                                    int misalignment,
+                                                    bool is_packed);
+static void arm_conditional_register_usage (void);
+static reg_class_t arm_preferred_rename_class (reg_class_t class);
  
  \f
  /* Table of machine attributes.  */
@@ -265,6 +288,15 @@ static const struct attribute_spec arm_attribute_table[] =
  #endif
    { NULL,           0, 0, false, false, false, NULL }
  };
+
+/* Set default optimization options.  */
+static const struct default_options arm_option_optimization_table[] =
+  {
+    /* Enable section anchors and -fomit-frame-pointer at -O1 or higher.  */
+    { OPT_LEVELS_1_PLUS, OPT_fsection_anchors, NULL, 1 },
+    { OPT_LEVELS_1_PLUS, OPT_fomit_frame_pointer, NULL, 1 },
+    { OPT_LEVELS_NONE, 0, NULL, 0 }
+  };
  \f
  /* Initialize the GCC target structure.  */
  #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
@@ -295,6 +327,9 @@ static const struct attribute_spec arm_attribute_table[] =
  #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
  #define TARGET_PRINT_OPERAND_PUNCT_VALID_P arm_print_operand_punct_valid_p
  
+#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
+#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA arm_output_addr_const_extra
+
  #undef  TARGET_ASM_FUNCTION_PROLOGUE
  #define TARGET_ASM_FUNCTION_PROLOGUE arm_output_function_prologue
  
@@ -307,6 +342,10 @@ static const struct attribute_spec arm_attribute_table[] =
  #define TARGET_HANDLE_OPTION arm_handle_option
  #undef  TARGET_HELP
  #define TARGET_HELP arm_target_help
+#undef  TARGET_OPTION_OVERRIDE
+#define TARGET_OPTION_OVERRIDE arm_option_override
+#undef  TARGET_OPTION_OPTIMIZATION_TABLE
+#define TARGET_OPTION_OPTIMIZATION_TABLE arm_option_optimization_table
  
  #undef  TARGET_COMP_TYPE_ATTRIBUTES
  #define TARGET_COMP_TYPE_ATTRIBUTES arm_comp_type_attributes
@@ -353,6 +392,8 @@ static const struct attribute_spec arm_attribute_table[] =
  #define TARGET_SHIFT_TRUNCATION_MASK arm_shift_truncation_mask
  #undef TARGET_VECTOR_MODE_SUPPORTED_P
  #define TARGET_VECTOR_MODE_SUPPORTED_P arm_vector_mode_supported_p
+#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
+#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE arm_preferred_simd_mode
  
  #undef  TARGET_MACHINE_DEPENDENT_REORG
  #define TARGET_MACHINE_DEPENDENT_REORG arm_reorg
@@ -373,6 +414,12 @@ static const struct attribute_spec arm_attribute_table[] =
  #define TARGET_PASS_BY_REFERENCE arm_pass_by_reference
  #undef TARGET_ARG_PARTIAL_BYTES
  #define TARGET_ARG_PARTIAL_BYTES arm_arg_partial_bytes
+#undef TARGET_FUNCTION_ARG
+#define TARGET_FUNCTION_ARG arm_function_arg
+#undef TARGET_FUNCTION_ARG_ADVANCE
+#define TARGET_FUNCTION_ARG_ADVANCE arm_function_arg_advance
+#undef TARGET_FUNCTION_ARG_BOUNDARY
+#define TARGET_FUNCTION_ARG_BOUNDARY arm_function_arg_boundary
  
  #undef  TARGET_SETUP_INCOMING_VARARGS
  #define TARGET_SETUP_INCOMING_VARARGS arm_setup_incoming_varargs
@@ -433,7 +480,7 @@ static const struct attribute_spec arm_attribute_table[] =
  #undef TARGET_MUST_PASS_IN_STACK
  #define TARGET_MUST_PASS_IN_STACK arm_must_pass_in_stack
  
-#ifdef TARGET_UNWIND_INFO
+#if ARM_UNWIND_INFO
  #undef TARGET_ASM_UNWIND_EMIT
  #define TARGET_ASM_UNWIND_EMIT arm_unwind_emit
  
@@ -443,7 +490,16 @@ static const struct attribute_spec arm_attribute_table[] =
  
  #undef TARGET_ARM_EABI_UNWINDER
  #define TARGET_ARM_EABI_UNWINDER true
-#endif /* TARGET_UNWIND_INFO */
+
+#undef TARGET_ASM_EMIT_EXCEPT_PERSONALITY
+#define TARGET_ASM_EMIT_EXCEPT_PERSONALITY arm_asm_emit_except_personality
+
+#undef TARGET_ASM_INIT_SECTIONS
+#define TARGET_ASM_INIT_SECTIONS arm_asm_init_sections
+#endif /* ARM_UNWIND_INFO */
+
+#undef TARGET_EXCEPT_UNWIND_INFO
+#define TARGET_EXCEPT_UNWIND_INFO  arm_except_unwind_info
  
  #undef TARGET_DWARF_HANDLE_FRAME_UNSPEC
  #define TARGET_DWARF_HANDLE_FRAME_UNSPEC arm_dwarf_handle_frame_unspec
@@ -516,6 +572,24 @@ static const struct attribute_spec arm_attribute_table[] =
  #undef TARGET_CAN_ELIMINATE
  #define TARGET_CAN_ELIMINATE arm_can_eliminate
  
+#undef TARGET_CONDITIONAL_REGISTER_USAGE
+#define TARGET_CONDITIONAL_REGISTER_USAGE arm_conditional_register_usage
+
+#undef TARGET_CLASS_LIKELY_SPILLED_P
+#define TARGET_CLASS_LIKELY_SPILLED_P arm_class_likely_spilled_p
+
+#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
+#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
+  arm_vector_alignment_reachable
+
+#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
+#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
+  arm_builtin_support_vector_misalignment
+
+#undef TARGET_PREFERRED_RENAME_CLASS
+#define TARGET_PREFERRED_RENAME_CLASS \
+  arm_preferred_rename_class
+
  struct gcc_target targetm = TARGET_INITIALIZER;
  \f
  /* Obstack for minipool constant handling.  */
@@ -589,6 +663,7 @@ static int thumb_call_reg_needed;
  #define FL_NEON       (1 << 20)       /* Neon instructions.  */
  #define FL_ARCH7EM    (1 << 21)              /* Instructions present in the ARMv7E-M
                                          architecture.  */
+#define FL_ARCH7      (1 << 22)       /* Architecture 7.  */
  
  #define FL_IWMMXT     (1 << 29)              /* XScale v2 or "Intel Wireless MMX technology".  */
  
@@ -613,7 +688,7 @@ static int thumb_call_reg_needed;
  #define FL_FOR_ARCH6ZK FL_FOR_ARCH6K
  #define FL_FOR_ARCH6T2 (FL_FOR_ARCH6 | FL_THUMB2)
  #define FL_FOR_ARCH6M  (FL_FOR_ARCH6 & ~FL_NOTM)
-#define FL_FOR_ARCH7   (FL_FOR_ARCH6T2 &~ FL_NOTM)
+#define FL_FOR_ARCH7   ((FL_FOR_ARCH6T2 & ~FL_NOTM) | FL_ARCH7)
  #define FL_FOR_ARCH7A  (FL_FOR_ARCH7 | FL_NOTM | FL_ARCH6K)
  #define FL_FOR_ARCH7R  (FL_FOR_ARCH7A | FL_DIV)
  #define FL_FOR_ARCH7M  (FL_FOR_ARCH7 | FL_DIV)
@@ -651,6 +726,9 @@ int arm_arch6 = 0;
  /* Nonzero if this chip supports the ARM 6K extensions.  */
  int arm_arch6k = 0;
  
+/* Nonzero if this chip supports the ARM 7 extensions.  */
+int arm_arch7 = 0;
+
  /* Nonzero if instructions not present in the 'M' profile can be used.  */
  int arm_arch_notm = 0;
  
@@ -685,6 +763,9 @@ int arm_tune_cortex_a9 = 0;
  /* Nonzero if generating Thumb instructions.  */
  int thumb_code = 0;
  
+/* Nonzero if generating Thumb-1 instructions.  */
+int thumb1_code = 0;
+
  /* Nonzero if we should define __THUMB_INTERWORK__ in the
     preprocessor.
     XXX This is a bit of a hack, it's intended to help work around
@@ -710,12 +791,13 @@ unsigned arm_pic_register = INVALID_REGNUM;
     the next function.  */
  static int after_arm_reorg = 0;
  
-static enum arm_pcs arm_pcs_default;
+enum arm_pcs arm_pcs_default;
  
  /* For an explanation of these variables, see final_prescan_insn below.  */
  int arm_ccfsm_state;
  /* arm_current_cc is also used for Thumb-2 cond_exec blocks.  */
  enum arm_cond_code arm_current_cc;
+
  rtx arm_target_insn;
  int arm_target_label;
  /* The number of conditionally executed insns, including the current insn.  */
@@ -733,6 +815,12 @@ static const char * const arm_condition_codes[] =
    "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
  };
  
+/* The register numbers in sequence, for passing to arm_gen_load_multiple.  */
+int arm_regs_in_sequence[] =
+{
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
  #define ARM_LSL_NAME (TARGET_UNIFIED_ASM ? "lsl" : "asl")
  #define streq(string1, string2) (strcmp (string1, string2) == 0)
  
@@ -751,30 +839,54 @@ struct processors
    const struct tune_params *const tune;
  };
  
+
+#define ARM_PREFETCH_NOT_BENEFICIAL 0, -1, -1
+#define ARM_PREFETCH_BENEFICIAL(prefetch_slots,l1_size,l1_line_size) \
+  prefetch_slots, \
+  l1_size, \
+  l1_line_size
+
  const struct tune_params arm_slowmul_tune =
  {
    arm_slowmul_rtx_costs,
-  3
+  NULL,
+  3,
+  ARM_PREFETCH_NOT_BENEFICIAL
  };
  
  const struct tune_params arm_fastmul_tune =
  {
    arm_fastmul_rtx_costs,
-  1
+  NULL,
+  1,
+  ARM_PREFETCH_NOT_BENEFICIAL
  };
  
  const struct tune_params arm_xscale_tune =
  {
    arm_xscale_rtx_costs,
-  2
+  xscale_sched_adjust_cost,
+  2,
+  ARM_PREFETCH_NOT_BENEFICIAL
  };
  
  const struct tune_params arm_9e_tune =
  {
    arm_9e_rtx_costs,
-  1
+  NULL,
+  1,
+  ARM_PREFETCH_NOT_BENEFICIAL
+};
+
+const struct tune_params arm_cortex_a9_tune =
+{
+  arm_9e_rtx_costs,
+  cortex_a9_sched_adjust_cost,
+  1,
+  ARM_PREFETCH_BENEFICIAL(4,32,32)
  };
  
+
  /* Not all of these give usefully different compilation alternatives,
     but there is no simple way of generalizing them.  */
  static const struct processors all_cores[] =
@@ -1138,6 +1250,7 @@ arm_build_builtin_va_list (void)
                              va_list_type);
    DECL_ARTIFICIAL (va_list_name) = 1;
    TYPE_NAME (va_list_type) = va_list_name;
+  TYPE_STUB_DECL (va_list_type) = va_list_name;
    /* Create the __ap field.  */
    ap_field = build_decl (BUILTINS_LOCATION,
                          FIELD_DECL, 
@@ -1255,7 +1368,7 @@ arm_target_help (void)
      {
        const char *p;
  
-      GET_ENVIRONMENT (p, "COLUMNS");
+      p = getenv ("COLUMNS");
        if (p != NULL)
         {
           int value = atoi (p);
@@ -1326,13 +1439,16 @@ arm_target_help (void)
  
  }
  
-/* Fix up any incompatible options that the user has specified.
-   This has now turned into a maze.  */
-void
-arm_override_options (void)
+/* Fix up any incompatible options that the user has specified.  */
+static void
+arm_option_override (void)
  {
    unsigned i;
  
+#ifdef SUBTARGET_OVERRIDE_OPTIONS
+  SUBTARGET_OVERRIDE_OPTIONS;
+#endif
+
    if (arm_selected_arch)
      {
        if (arm_selected_cpu)
@@ -1373,7 +1489,7 @@ arm_override_options (void)
           arm_selected_cpu = &all_cores[SUBTARGET_CPU_DEFAULT];
  #endif
           /* Default to ARM6.  */
-         if (arm_selected_cpu->name)
+         if (!arm_selected_cpu->name)
             arm_selected_cpu = &all_cores[arm6];
         }
  
@@ -1515,7 +1631,7 @@ arm_override_options (void)
    /* Callee super interworking implies thumb interworking.  Adding
       this to the flags here simplifies the logic elsewhere.  */
    if (TARGET_THUMB && TARGET_CALLEE_INTERWORKING)
-      target_flags |= MASK_INTERWORK;
+    target_flags |= MASK_INTERWORK;
  
    /* TARGET_BACKTRACE calls leaf_function_p, which causes a crash if done
       from here where no function is being compiled currently.  */
@@ -1525,9 +1641,6 @@ arm_override_options (void)
    if (TARGET_ARM && TARGET_CALLEE_INTERWORKING)
      warning (0, "enabling callee interworking support is only meaningful when compiling for the Thumb");
  
-  if (TARGET_ARM && TARGET_CALLER_INTERWORKING)
-    warning (0, "enabling caller interworking support is only meaningful when compiling for the Thumb");
-
    if (TARGET_APCS_STACK && !TARGET_APCS_FRAME)
      {
        warning (0, "-mapcs-stack-check incompatible with -mno-apcs-frame");
@@ -1563,6 +1676,7 @@ arm_override_options (void)
    arm_arch6 = (insn_flags & FL_ARCH6) != 0;
    arm_arch6k = (insn_flags & FL_ARCH6K) != 0;
    arm_arch_notm = (insn_flags & FL_NOTM) != 0;
+  arm_arch7 = (insn_flags & FL_ARCH7) != 0;
    arm_arch7em = (insn_flags & FL_ARCH7EM) != 0;
    arm_arch_thumb2 = (insn_flags & FL_THUMB2) != 0;
    arm_arch_xscale = (insn_flags & FL_XSCALE) != 0;
@@ -1570,7 +1684,8 @@ arm_override_options (void)
  
    arm_ld_sched = (tune_flags & FL_LDSCHED) != 0;
    arm_tune_strongarm = (tune_flags & FL_STRONG) != 0;
-  thumb_code = (TARGET_ARM == 0);
+  thumb_code = TARGET_ARM == 0;
+  thumb1_code = TARGET_THUMB1 != 0;
    arm_tune_wbuf = (tune_flags & FL_WBUF) != 0;
    arm_tune_xscale = (tune_flags & FL_XSCALE) != 0;
    arm_arch_iwmmxt = (insn_flags & FL_IWMMXT) != 0;
@@ -1871,6 +1986,45 @@ arm_override_options (void)
        flag_reorder_blocks = 1;
      }
  
+  if (flag_pic)
+    /* Hoisting PIC address calculations more aggressively provides a small,
+       but measurable, size reduction for PIC code.  Therefore, we decrease
+       the bar for unrestricted expression hoisting to the cost of PIC address
+       calculation, which is 2 instructions.  */
+    maybe_set_param_value (PARAM_GCSE_UNRESTRICTED_COST, 2,
+                          global_options.x_param_values,
+                          global_options_set.x_param_values);
+
+  /* ARM EABI defaults to strict volatile bitfields.  */
+  if (TARGET_AAPCS_BASED && flag_strict_volatile_bitfields < 0)
+    flag_strict_volatile_bitfields = 1;
+
+  /* Enable sw prefetching at -O3 for CPUs that have prefetch and for which we
+     have deemed it beneficial (num_prefetch_slots is 1 or more).  */
+  if (flag_prefetch_loop_arrays < 0
+      && HAVE_prefetch
+      && optimize >= 3
+      && current_tune->num_prefetch_slots > 0)
+    flag_prefetch_loop_arrays = 1;
+
+  /* Set up parameters to be used in prefetching algorithm.  Do not override the
+     defaults unless we are tuning for a core we have researched values for.  */
+  if (current_tune->num_prefetch_slots > 0)
+    maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
+                           current_tune->num_prefetch_slots,
+                           global_options.x_param_values,
+                           global_options_set.x_param_values);
+  if (current_tune->l1_cache_line_size >= 0)
+    maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
+                           current_tune->l1_cache_line_size,
+                           global_options.x_param_values,
+                           global_options_set.x_param_values);
+  if (current_tune->l1_cache_size >= 0)
+    maybe_set_param_value (PARAM_L1_CACHE_SIZE,
+                           current_tune->l1_cache_size,
+                           global_options.x_param_values,
+                           global_options_set.x_param_values);
+
    /* Register global variables with the garbage collector.  */
    arm_add_gc_roots ();
  }
@@ -1959,7 +2113,8 @@ arm_compute_func_type (void)
    if (optimize > 0
        && (TREE_NOTHROW (current_function_decl)
            || !(flag_unwind_tables
-               || (flag_exceptions && !USING_SJLJ_EXCEPTIONS)))
+               || (flag_exceptions
+                  && arm_except_unwind_info (&global_options) != UI_SJLJ)))
        && TREE_THIS_VOLATILE (current_function_decl))
      type |= ARM_FT_VOLATILE;
  
@@ -2259,11 +2414,17 @@ const_ok_for_arm (HOST_WIDE_INT i)
      {
        HOST_WIDE_INT v;
  
-      /* Allow repeated pattern.  */
+      /* Allow repeated patterns 0x00XY00XY or 0xXYXYXYXY.  */
        v = i & 0xff;
        v |= v << 16;
        if (i == v || i == (v | (v << 8)))
         return TRUE;
+
+      /* Allow repeated pattern 0xXY00XY00.  */
+      v = i & 0xff00;
+      v |= v << 16;
+      if (i == v)
+       return TRUE;
      }
  
    return FALSE;
@@ -3183,13 +3344,82 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond,
     immediate value easier to load.  */
  
  enum rtx_code
-arm_canonicalize_comparison (enum rtx_code code, enum machine_mode mode,
-                            rtx * op1)
+arm_canonicalize_comparison (enum rtx_code code, rtx *op0, rtx *op1)
  {
-  unsigned HOST_WIDE_INT i = INTVAL (*op1);
-  unsigned HOST_WIDE_INT maxval;
+  enum machine_mode mode;
+  unsigned HOST_WIDE_INT i, maxval;
+
+  mode = GET_MODE (*op0);
+  if (mode == VOIDmode)
+    mode = GET_MODE (*op1);
+
    maxval = (((unsigned HOST_WIDE_INT) 1) << (GET_MODE_BITSIZE(mode) - 1)) - 1;
  
+  /* For DImode, we have GE/LT/GEU/LTU comparisons.  In ARM mode
+     we can also use cmp/cmpeq for GTU/LEU.  GT/LE must be either
+     reversed or (for constant OP1) adjusted to GE/LT.  Similarly
+     for GTU/LEU in Thumb mode.  */
+  if (mode == DImode)
+    {
+      rtx tem;
+
+      /* To keep things simple, always use the Cirrus cfcmp64 if it is
+        available.  */
+      if (TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK)
+       return code;
+
+      if (code == GT || code == LE
+         || (!TARGET_ARM && (code == GTU || code == LEU)))
+       {
+         /* Missing comparison.  First try to use an available
+            comparison.  */
+         if (GET_CODE (*op1) == CONST_INT)
+           {
+             i = INTVAL (*op1);
+             switch (code)
+               {
+               case GT:
+               case LE:
+                 if (i != maxval
+                     && arm_const_double_by_immediates (GEN_INT (i + 1)))
+                   {
+                     *op1 = GEN_INT (i + 1);
+                     return code == GT ? GE : LT;
+                   }
+                 break;
+               case GTU:
+               case LEU:
+                 if (i != ~((unsigned HOST_WIDE_INT) 0)
+                     && arm_const_double_by_immediates (GEN_INT (i + 1)))
+                   {
+                     *op1 = GEN_INT (i + 1);
+                     return code == GTU ? GEU : LTU;
+                   }
+                 break;
+               default:
+                 gcc_unreachable ();
+               }
+           }
+
+         /* If that did not work, reverse the condition.  */
+         tem = *op0;
+         *op0 = *op1;
+         *op1 = tem;
+         return swap_condition (code);
+       }
+
+      return code;
+    }
+
+  /* Comparisons smaller than DImode.  Only adjust comparisons against
+     an out-of-range constant.  */
+  if (GET_CODE (*op1) != CONST_INT
+      || const_ok_for_arm (INTVAL (*op1))
+      || const_ok_for_arm (- INTVAL (*op1)))
+    return code;
+
+  i = INTVAL (*op1);
+
    switch (code)
      {
      case EQ:
@@ -3466,7 +3696,7 @@ arm_return_in_memory (const_tree type, const_tree fntype)
          have been created by C++.  */
        for (field = TYPE_FIELDS (type);
            field && TREE_CODE (field) != FIELD_DECL;
-          field = TREE_CHAIN (field))
+          field = DECL_CHAIN (field))
         continue;
  
        if (field == NULL)
@@ -3485,9 +3715,9 @@ arm_return_in_memory (const_tree type, const_tree fntype)
  
        /* Now check the remaining fields, if any.  Only bitfields are allowed,
          since they are not addressable.  */
-      for (field = TREE_CHAIN (field);
+      for (field = DECL_CHAIN (field);
            field;
-          field = TREE_CHAIN (field))
+          field = DECL_CHAIN (field))
         {
           if (TREE_CODE (field) != FIELD_DECL)
             continue;
@@ -3507,7 +3737,7 @@ arm_return_in_memory (const_tree type, const_tree fntype)
          integral, or can be returned in an integer register.  */
        for (field = TYPE_FIELDS (type);
            field;
-          field = TREE_CHAIN (field))
+          field = DECL_CHAIN (field))
         {
           if (TREE_CODE (field) != FIELD_DECL)
             continue;
@@ -3613,16 +3843,14 @@ arm_get_pcs_model (const_tree type, const_tree decl)
        /* Detect varargs functions.  These always use the base rules
          (no argument is ever a candidate for a co-processor
          register).  */
-      bool base_rules = (TYPE_ARG_TYPES (type) != 0
-                        && (TREE_VALUE (tree_last (TYPE_ARG_TYPES (type)))
-                            != void_type_node));
+      bool base_rules = stdarg_p (type);
        
        if (user_convention)
         {
           if (user_pcs > ARM_PCS_AAPCS_LOCAL)
-           sorry ("Non-AAPCS derived PCS variant");
+           sorry ("non-AAPCS derived PCS variant");
           else if (base_rules && user_pcs != ARM_PCS_AAPCS)
-           error ("Variadic functions must use the base AAPCS variant");
+           error ("variadic functions must use the base AAPCS variant");
         }
  
        if (base_rules)
@@ -3767,7 +3995,7 @@ aapcs_vfp_sub_candidate (const_tree type, enum machine_mode *modep)
         if (!COMPLETE_TYPE_P(type))
           return -1;
  
-       for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
+       for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
           {
             if (TREE_CODE (field) != FIELD_DECL)
               continue;
@@ -3799,7 +4027,7 @@ aapcs_vfp_sub_candidate (const_tree type, enum machine_mode *modep)
         if (!COMPLETE_TYPE_P(type))
           return -1;
  
-       for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
+       for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
           {
             if (TREE_CODE (field) != FIELD_DECL)
               continue;
@@ -4074,7 +4302,7 @@ static struct
  
  static int
  aapcs_select_call_coproc (CUMULATIVE_ARGS *pcum, enum machine_mode mode, 
-                         tree type)
+                         const_tree type)
  {
    int i;
  
@@ -4186,7 +4414,7 @@ aapcs_libcall_value (enum machine_mode mode)
     numbers referred to here are those in the AAPCS.  */
  static void
  aapcs_layout_arg (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
-                 tree type, int named)
+                 const_tree type, bool named)
  {
    int nregs, nregs2;
    int ncrn;
@@ -4350,8 +4578,8 @@ arm_init_cumulative_args (CUMULATIVE_ARGS *pcum, tree fntype,
  
  
  /* Return true if mode/type need doubleword alignment.  */
-bool
-arm_needs_doubleword_align (enum machine_mode mode, tree type)
+static bool
+arm_needs_doubleword_align (enum machine_mode mode, const_tree type)
  {
    return (GET_MODE_ALIGNMENT (mode) > PARM_BOUNDARY
           || (type && TYPE_ALIGN (type) > PARM_BOUNDARY));
@@ -4369,11 +4597,17 @@ arm_needs_doubleword_align (enum machine_mode mode, tree type)
     CUM is a variable of type CUMULATIVE_ARGS which gives info about
      the preceding args and about the function being called.
     NAMED is nonzero if this argument is a named parameter
-    (otherwise it is an extra parameter matching an ellipsis).  */
+    (otherwise it is an extra parameter matching an ellipsis).
  
-rtx
+   On the ARM, normally the first 16 bytes are passed in registers r0-r3; all
+   other arguments are passed on the stack.  If (NAMED == 0) (which happens
+   only in assign_parms, since TARGET_SETUP_INCOMING_VARARGS is
+   defined), say it is passed in the stack (function_prologue will
+   indeed make it pass in the stack if necessary).  */
+
+static rtx
  arm_function_arg (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
-                 tree type, int named)
+                 const_tree type, bool named)
  {
    int nregs;
  
@@ -4409,10 +4643,6 @@ arm_function_arg (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
        && arm_needs_doubleword_align (mode, type))
      pcum->nregs++;
  
-  if (mode == VOIDmode)
-    /* Pick an arbitrary value for operand 2 of the call insn.  */
-    return const0_rtx;
-
    /* Only allow splitting an arg between regs and memory if all preceding
       args were allocated to regs.  For args passed by reference we only count
       the reference pointer.  */
@@ -4427,6 +4657,14 @@ arm_function_arg (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
    return gen_rtx_REG (mode, pcum->nregs);
  }
  
+static unsigned int
+arm_function_arg_boundary (enum machine_mode mode, const_tree type)
+{
+  return (ARM_DOUBLEWORD_ALIGN && arm_needs_doubleword_align (mode, type)
+         ? DOUBLEWORD_ALIGNMENT
+         : PARM_BOUNDARY);
+}
+
  static int
  arm_arg_partial_bytes (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
                        tree type, bool named)
@@ -4450,9 +4688,13 @@ arm_arg_partial_bytes (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
    return 0;
  }
  
-void
+/* Update the data in PCUM to advance over an argument
+   of mode MODE and data type TYPE.
+   (TYPE is null for libcalls where that information may not be available.)  */
+
+static void
  arm_function_arg_advance (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
-                         tree type, bool named)
+                         const_tree type, bool named)
  {
    if (pcum->pcs_variant <= ARM_PCS_AAPCS_LOCAL)
      {
@@ -4907,17 +5149,13 @@ legitimize_pic_address (rtx orig, enum machine_mode mode, rtx reg)
    if (GET_CODE (orig) == SYMBOL_REF
        || GET_CODE (orig) == LABEL_REF)
      {
-      rtx pic_ref, address;
        rtx insn;
  
        if (reg == 0)
         {
           gcc_assert (can_create_pseudo_p ());
           reg = gen_reg_rtx (Pmode);
-         address = gen_reg_rtx (Pmode);
         }
-      else
-       address = reg;
  
        /* VxWorks does not impose a fixed gap between segments; the run-time
          gap can be different from the object-file gap.  We therefore can't
@@ -4933,18 +5171,21 @@ legitimize_pic_address (rtx orig, enum machine_mode mode, rtx reg)
         insn = arm_pic_static_addr (orig, reg);
        else
         {
+         rtx pat;
+         rtx mem;
+
           /* If this function doesn't have a pic register, create one now.  */
           require_pic_register ();
  
-         if (TARGET_32BIT)
-           emit_insn (gen_pic_load_addr_32bit (address, orig));
-         else /* TARGET_THUMB1 */
-           emit_insn (gen_pic_load_addr_thumb1 (address, orig));
+         pat = gen_calculate_pic_address (reg, cfun->machine->pic_reg, orig);
+
+         /* Make the MEM as close to a constant as possible.  */
+         mem = SET_SRC (pat);
+         gcc_assert (MEM_P (mem) && !MEM_VOLATILE_P (mem));
+         MEM_READONLY_P (mem) = 1;
+         MEM_NOTRAP_P (mem) = 1;
  
-         pic_ref = gen_const_mem (Pmode,
-                                  gen_rtx_PLUS (Pmode, cfun->machine->pic_reg,
-                                                address));
-         insn = emit_move_insn (reg, pic_ref);
+         insn = emit_insn (pat);
         }
  
        /* Put a REG_EQUAL note on this insn, so that it can be optimized
@@ -5224,6 +5465,15 @@ pcrel_constant_p (rtx x)
    return FALSE;
  }
  
+/* Return true if X is an UNSPEC_PIC_SYM unspec, i.e. an operand that will
+   surely end up in an index register after the next splitting pass.  */
+static bool
+will_be_in_index_register (const_rtx x)
+{
+  /* arm.md: calculate_pic_address will split this into a register.  */
+  return GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_PIC_SYM;
+}
+
  /* Return nonzero if X is a valid ARM state address operand.  */
  int
  arm_legitimate_address_outer_p (enum machine_mode mode, rtx x, RTX_CODE outer,
@@ -5281,8 +5531,9 @@ arm_legitimate_address_outer_p (enum machine_mode mode, rtx x, RTX_CODE outer,
        rtx xop1 = XEXP (x, 1);
  
        return ((arm_address_register_rtx_p (xop0, strict_p)
-              && GET_CODE(xop1) == CONST_INT
-              && arm_legitimate_index_p (mode, xop1, outer, strict_p))
+              && ((GET_CODE(xop1) == CONST_INT
+                   && arm_legitimate_index_p (mode, xop1, outer, strict_p))
+                  || (!strict_p && will_be_in_index_register (xop1))))
               || (arm_address_register_rtx_p (xop1, strict_p)
                   && arm_legitimate_index_p (mode, xop0, outer, strict_p)));
      }
@@ -5368,7 +5619,8 @@ thumb2_legitimate_address_p (enum machine_mode mode, rtx x, int strict_p)
        rtx xop1 = XEXP (x, 1);
  
        return ((arm_address_register_rtx_p (xop0, strict_p)
-              && thumb2_legitimate_index_p (mode, xop1, strict_p))
+              && (thumb2_legitimate_index_p (mode, xop1, strict_p)
+                  || (!strict_p && will_be_in_index_register (xop1))))
               || (arm_address_register_rtx_p (xop1, strict_p)
                   && thumb2_legitimate_index_p (mode, xop0, strict_p)));
      }
@@ -5402,13 +5654,25 @@ arm_legitimate_index_p (enum machine_mode mode, rtx index, RTX_CODE outer,
             && INTVAL (index) > -1024
             && (INTVAL (index) & 3) == 0);
  
-  if (TARGET_NEON
-      && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode)))
+  /* For quad modes, we restrict the constant offset to be slightly less
+     than what the instruction format permits.  We do this because for
+     quad mode moves, we will actually decompose them into two separate
+     double-mode reads or writes.  INDEX must therefore be a valid
+     (double-mode) offset and so should INDEX+8.  */
+  if (TARGET_NEON && VALID_NEON_QREG_MODE (mode))
      return (code == CONST_INT
             && INTVAL (index) < 1016
             && INTVAL (index) > -1024
             && (INTVAL (index) & 3) == 0);
  
+  /* We have no such constraint on double mode offsets, so we permit the
+     full range of the instruction format.  */
+  if (TARGET_NEON && VALID_NEON_DREG_MODE (mode))
+    return (code == CONST_INT
+           && INTVAL (index) < 1024
+           && INTVAL (index) > -1024
+           && (INTVAL (index) & 3) == 0);
+
    if (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (mode))
      return (code == CONST_INT
             && INTVAL (index) < 1024
@@ -5522,13 +5786,25 @@ thumb2_legitimate_index_p (enum machine_mode mode, rtx index, int strict_p)
                 && (INTVAL (index) & 3) == 0);
      }
  
-  if (TARGET_NEON
-      && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode)))
+  /* For quad modes, we restrict the constant offset to be slightly less
+     than what the instruction format permits.  We do this because for
+     quad mode moves, we will actually decompose them into two separate
+     double-mode reads or writes.  INDEX must therefore be a valid
+     (double-mode) offset and so should INDEX+8.  */
+  if (TARGET_NEON && VALID_NEON_QREG_MODE (mode))
      return (code == CONST_INT
             && INTVAL (index) < 1016
             && INTVAL (index) > -1024
             && (INTVAL (index) & 3) == 0);
  
+  /* We have no such constraint on double mode offsets, so we permit the
+     full range of the instruction format.  */
+  if (TARGET_NEON && VALID_NEON_DREG_MODE (mode))
+    return (code == CONST_INT
+           && INTVAL (index) < 1024
+           && INTVAL (index) > -1024
+           && (INTVAL (index) & 3) == 0);
+
    if (arm_address_register_rtx_p (index, strict_p)
        && (GET_MODE_SIZE (mode) <= 4))
      return 1;
@@ -5671,7 +5947,8 @@ thumb1_legitimate_address_p (enum machine_mode mode, rtx x, int strict_p)
           && XEXP (x, 0) != frame_pointer_rtx
           && XEXP (x, 1) != frame_pointer_rtx
           && thumb1_index_register_rtx_p (XEXP (x, 0), strict_p)
-         && thumb1_index_register_rtx_p (XEXP (x, 1), strict_p))
+         && (thumb1_index_register_rtx_p (XEXP (x, 1), strict_p)
+             || (!strict_p && will_be_in_index_register (XEXP (x, 1)))))
         return 1;
  
        /* REG+const has 5-7 bit offset for non-SP registers.  */
@@ -5698,7 +5975,8 @@ thumb1_legitimate_address_p (enum machine_mode mode, rtx x, int strict_p)
                && (REGNO (XEXP (x, 0)) == FRAME_POINTER_REGNUM
                    || REGNO (XEXP (x, 0)) == ARG_POINTER_REGNUM
                    || (REGNO (XEXP (x, 0)) >= FIRST_VIRTUAL_REGISTER
-                      && REGNO (XEXP (x, 0)) <= LAST_VIRTUAL_REGISTER))
+                      && REGNO (XEXP (x, 0))
+                         <= LAST_VIRTUAL_POINTER_REGISTER))
                && GET_MODE_SIZE (mode) >= 4
                && GET_CODE (XEXP (x, 1)) == CONST_INT
                && (INTVAL (XEXP (x, 1)) & 3) == 0)
@@ -6207,13 +6485,11 @@ arm_cannot_force_const_mem (rtx x)
  #define REG_OR_SUBREG_RTX(X)                   \
     (GET_CODE (X) == REG ? (X) : SUBREG_REG (X))
  
-#ifndef COSTS_N_INSNS
-#define COSTS_N_INSNS(N) ((N) * 4 - 2)
-#endif
  static inline int
  thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
  {
    enum machine_mode mode = GET_MODE (x);
+  int total;
  
    switch (code)
      {
@@ -6312,24 +6588,20 @@ thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
         return 14;
        return 2;
  
+    case SIGN_EXTEND:
      case ZERO_EXTEND:
-      /* XXX still guessing.  */
-      switch (GET_MODE (XEXP (x, 0)))
-       {
-       case QImode:
-         return (1 + (mode == DImode ? 4 : 0)
-                 + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0));
+      total = mode == DImode ? COSTS_N_INSNS (1) : 0;
+      total += thumb1_rtx_costs (XEXP (x, 0), GET_CODE (XEXP (x, 0)), code);
  
-       case HImode:
-         return (4 + (mode == DImode ? 4 : 0)
-                 + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0));
+      if (mode == SImode)
+       return total;
  
-       case SImode:
-         return (1 + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0));
+      if (arm_arch6)
+       return total + COSTS_N_INSNS (1);
  
-       default:
-         return 99;
-       }
+      /* Assume a two-shift sequence.  Increase the cost slightly so
+        we prefer actual shifts over an extend operation.  */
+      return total + 1 + COSTS_N_INSNS (2);
  
      default:
        return 99;
@@ -6555,12 +6827,10 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
          since then they might not be moved outside of loops.  As a compromise
          we allow integration with ops that have a constant as their second
          operand.  */
-      if ((REG_OR_SUBREG_REG (XEXP (x, 0))
-          && ARM_FRAME_RTX (REG_OR_SUBREG_RTX (XEXP (x, 0)))
-          && GET_CODE (XEXP (x, 1)) != CONST_INT)
-         || (REG_OR_SUBREG_REG (XEXP (x, 0))
-             && ARM_FRAME_RTX (REG_OR_SUBREG_RTX (XEXP (x, 0)))))
-       *total = 4;
+      if (REG_OR_SUBREG_REG (XEXP (x, 0))
+         && ARM_FRAME_RTX (REG_OR_SUBREG_RTX (XEXP (x, 0)))
+         && GET_CODE (XEXP (x, 1)) != CONST_INT)
+       *total = COSTS_N_INSNS (1);
  
        if (mode == DImode)
         {
@@ -6798,44 +7068,39 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
        return false;
  
      case SIGN_EXTEND:
-      if (GET_MODE_CLASS (mode) == MODE_INT)
-       {
-         *total = 0;
-         if (mode == DImode)
-           *total += COSTS_N_INSNS (1);
-
-         if (GET_MODE (XEXP (x, 0)) != SImode)
-           {
-             if (arm_arch6)
-               {
-                 if (GET_CODE (XEXP (x, 0)) != MEM)
-                   *total += COSTS_N_INSNS (1);
-               }
-             else if (!arm_arch4 || GET_CODE (XEXP (x, 0)) != MEM)
-               *total += COSTS_N_INSNS (2);
-           }
-
-         return false;
-       }
-
-      /* Fall through */
      case ZERO_EXTEND:
        *total = 0;
        if (GET_MODE_CLASS (mode) == MODE_INT)
         {
+         rtx op = XEXP (x, 0);
+         enum machine_mode opmode = GET_MODE (op);
+
           if (mode == DImode)
             *total += COSTS_N_INSNS (1);
  
-         if (GET_MODE (XEXP (x, 0)) != SImode)
+         if (opmode != SImode)
             {
-             if (arm_arch6)
+             if (MEM_P (op))
                 {
-                 if (GET_CODE (XEXP (x, 0)) != MEM)
-                   *total += COSTS_N_INSNS (1);
+                 /* If !arm_arch4, we use one of the extendhisi2_mem
+                    or movhi_bytes patterns for HImode.  For a QImode
+                    sign extension, we first zero-extend from memory
+                    and then perform a shift sequence.  */
+                 if (!arm_arch4 && (opmode != QImode || code == SIGN_EXTEND))
+                   *total += COSTS_N_INSNS (2);
                 }
-             else if (!arm_arch4 || GET_CODE (XEXP (x, 0)) != MEM)
-               *total += COSTS_N_INSNS (GET_MODE (XEXP (x, 0)) == QImode ?
-                                        1 : 2);
+             else if (arm_arch6)
+               *total += COSTS_N_INSNS (1);
+
+             /* We don't have the necessary insn, so we need to perform some
+                other operation.  */
+             else if (TARGET_ARM && code == ZERO_EXTEND && mode == QImode)
+               /* An and with constant 255.  */
+               *total += COSTS_N_INSNS (1);
+             else
+               /* A shift sequence.  Increase costs slightly to avoid
+                  combining two shifts into an extend operation.  */
+               *total += COSTS_N_INSNS (2) + 1;
             }
  
           return false;
@@ -6940,7 +7205,11 @@ thumb1_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
        if (outer == SET)
          {
            if ((unsigned HOST_WIDE_INT) INTVAL (x) < 256)
-            return 0;
+            return COSTS_N_INSNS (1);
+         /* See split "TARGET_THUMB1 && satisfies_constraint_J".  */
+         if (INTVAL (x) >= -255 && INTVAL (x) <= -1)
+            return COSTS_N_INSNS (2);
+         /* See split "TARGET_THUMB1 && satisfies_constraint_K".  */
            if (thumb_shiftable_const (INTVAL (x)))
              return COSTS_N_INSNS (2);
            return COSTS_N_INSNS (3);
@@ -7044,6 +7313,12 @@ arm_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
          a single register, otherwise it costs one insn per word.  */
        if (REG_P (XEXP (x, 0)))
         *total = COSTS_N_INSNS (1);
+      else if (flag_pic
+              && GET_CODE (XEXP (x, 0)) == PLUS
+              && will_be_in_index_register (XEXP (XEXP (x, 0), 1)))
+       /* This will be split into two instructions.
+          See arm.md:calculate_pic_address.  */
+       *total = COSTS_N_INSNS (2);
        else
         *total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
        return true;
@@ -7191,41 +7466,8 @@ arm_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
        return false;
  
      case SIGN_EXTEND:
-      *total = 0;
-      if (GET_MODE_SIZE (GET_MODE (XEXP (x, 0))) < 4)
-       {
-         if (!(arm_arch4 && MEM_P (XEXP (x, 0))))
-           *total += COSTS_N_INSNS (arm_arch6 ? 1 : 2);
-       }
-      if (mode == DImode)
-       *total += COSTS_N_INSNS (1);
-      return false;
-
      case ZERO_EXTEND:
-      *total = 0;
-      if (!(arm_arch4 && MEM_P (XEXP (x, 0))))
-       {
-         switch (GET_MODE (XEXP (x, 0)))
-           {
-           case QImode:
-             *total += COSTS_N_INSNS (1);
-             break;
-
-           case HImode:
-             *total += COSTS_N_INSNS (arm_arch6 ? 1 : 2);
-
-           case SImode:
-             break;
-
-           default:
-             *total += COSTS_N_INSNS (2);
-           }
-       }
-
-      if (mode == DImode)
-       *total += COSTS_N_INSNS (1);
-
-      return false;
+      return arm_rtx_costs_1 (x, outer_code, total, 0);
  
      case CONST_INT:
        if (const_ok_for_arm (INTVAL (x)))
@@ -7631,15 +7873,13 @@ arm_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
    return TARGET_32BIT ? arm_arm_address_cost (x) : arm_thumb_address_cost (x);
  }
  
-static int
-arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost)
+/* Adjust cost hook for XScale.  */
+static bool
+xscale_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost)
  {
-  rtx i_pat, d_pat;
-
    /* Some true dependencies can have a higher cost depending
       on precisely how certain input operands are used.  */
-  if (arm_tune_xscale
-      && REG_NOTE_KIND (link) == 0
+  if (REG_NOTE_KIND(link) == 0
        && recog_memoized (insn) >= 0
        && recog_memoized (dep) >= 0)
      {
@@ -7673,10 +7913,116 @@ arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost)
  
               if (reg_overlap_mentioned_p (recog_data.operand[opno],
                                            shifted_operand))
-               return 2;
+               {
+                 *cost = 2;
+                 return false;
+               }
             }
         }
      }
+  return true;
+}
+
+/* Adjust cost hook for Cortex A9.
+
+   INSN and DEP are related through the dependence LINK.  Returns false
+   after storing a replacement value in *COST, and true when *COST has
+   been left untouched.  arm_adjust_cost returns *COST straight away when
+   its per-core hook returns false, and otherwise carries on with its
+   generic adjustments.  */
+static bool
+cortex_a9_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost)
+{
+  switch (REG_NOTE_KIND (link))
+    {
+    case REG_DEP_ANTI:
+      *cost = 0;
+      return false;
+
+    case REG_DEP_TRUE:
+    case REG_DEP_OUTPUT:
+       if (recog_memoized (insn) >= 0
+           && recog_memoized (dep) >= 0)
+         {
+           if (GET_CODE (PATTERN (insn)) == SET)
+             {
+               if (GET_MODE_CLASS 
+                   (GET_MODE (SET_DEST (PATTERN (insn)))) == MODE_FLOAT
+                 || GET_MODE_CLASS 
+                   (GET_MODE (SET_SRC (PATTERN (insn)))) == MODE_FLOAT)
+                 {
+                   enum attr_type attr_type_insn = get_attr_type (insn);
+                   enum attr_type attr_type_dep = get_attr_type (dep);
+
+                   /* By default all dependencies of the form
+                      s0 = s0 <op> s1
+                      s0 = s0 <op> s2
+                      have an extra latency of 1 cycle because
+                      of the input and output dependency in this
+                      case.  However this gets modeled as a true
+                      dependency and hence all these checks.
+
+                      NOTE(review): only PATTERN (insn) has been
+                      checked to be a SET at this point; SET_DEST is
+                      applied to PATTERN (dep) below without a
+                      matching check -- confirm that DEP is always a
+                      single SET when this code is reached.  */
+                   if (REG_P (SET_DEST (PATTERN (insn)))
+                       && REG_P (SET_DEST (PATTERN (dep)))
+                       && reg_overlap_mentioned_p (SET_DEST (PATTERN (insn)),
+                                                   SET_DEST (PATTERN (dep))))
+                     {
+                       /* FMACS is a special case where the dependent
+                          instruction can be issued 3 cycles before
+                          the normal latency in case of an output
+                          dependency.  When both INSN and DEP are
+                          FMACS/FMACD, an output dependency therefore
+                          costs the default latency of DEP less 3, and
+                          any other dependency costs the unmodified
+                          default latency of DEP.  */
+                       if ((attr_type_insn == TYPE_FMACS
+                            || attr_type_insn == TYPE_FMACD)
+                           && (attr_type_dep == TYPE_FMACS
+                               || attr_type_dep == TYPE_FMACD))
+                         {
+                           if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT)
+                             *cost = insn_default_latency (dep) - 3;
+                           else
+                             *cost = insn_default_latency (dep);
+                           return false;
+                         }
+                       else
+                         {
+                           if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT)
+                             *cost = insn_default_latency (dep) + 1;
+                           else
+                             *cost = insn_default_latency (dep);
+                         }
+                       return false;
+                     }
+                 }
+             }
+         }
+       break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return true;
+}
+
+/* This function implements the target macro TARGET_SCHED_ADJUST_COST.
+   It corrects the value of COST based on the relationship between
+   INSN and DEP through the dependence LINK.  It returns the new
+   value. There is a per-core adjust_cost hook to adjust scheduler costs
+   and the per-core hook can choose to completely override the generic 
+   adjust_cost function. Only put bits of code into arm_adjust_cost that 
+   are common across all cores.  */
+static int
+arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost)
+{
+  rtx i_pat, d_pat;
+
+ /* When generating Thumb-1 code, we want to place flag-setting operations
+    close to a conditional branch which depends on them, so that we can
+    omit the comparison. */
+  if (TARGET_THUMB1
+      && REG_NOTE_KIND (link) == 0
+      && recog_memoized (insn) == CODE_FOR_cbranchsi4_insn
+      && recog_memoized (dep) >= 0
+      && get_attr_conds (dep) == CONDS_SET)
+    return 0;
+
+  if (current_tune->sched_adjust_cost != NULL)
+    {
+      if (!current_tune->sched_adjust_cost (insn, link, dep, &cost))
+       return cost;
+    }
  
    /* XXX This is not strictly true for the FPA.  */
    if (REG_NOTE_KIND (link) == REG_DEP_ANTI
@@ -7699,7 +8045,8 @@ arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost)
          constant pool are cached, and that others will miss.  This is a
          hack.  */
  
-      if ((GET_CODE (src_mem) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (src_mem))
+      if ((GET_CODE (src_mem) == SYMBOL_REF 
+          && CONSTANT_POOL_ADDRESS_P (src_mem))
           || reg_mentioned_p (stack_pointer_rtx, src_mem)
           || reg_mentioned_p (frame_pointer_rtx, src_mem)
           || reg_mentioned_p (hard_frame_pointer_rtx, src_mem))
@@ -8603,7 +8950,8 @@ neon_vector_mem_operand (rtx op, int type)
      return arm_address_register_rtx_p (ind, 0);
  
    /* Allow post-increment with Neon registers.  */
-  if (type != 1 && (GET_CODE (ind) == POST_INC || GET_CODE (ind) == PRE_DEC))
+  if ((type != 1 && GET_CODE (ind) == POST_INC)
+      || (type == 0 && GET_CODE (ind) == PRE_DEC))
      return arm_address_register_rtx_p (XEXP (ind, 0), 0);
  
    /* FIXME: vld1 allows register post-modify.  */
@@ -9111,6 +9459,36 @@ multiple_operation_profitable_p (bool is_store ATTRIBUTE_UNUSED,
    if (nops == 2 && arm_ld_sched && add_offset != 0)
      return false;
  
+  /* XScale has load-store double instructions, but they have stricter
+     alignment requirements than load-store multiple, so we cannot
+     use them.
+
+     For XScale ldm requires 2 + NREGS cycles to complete and blocks
+     the pipeline until completion.
+
+       NREGS           CYCLES
+         1               3
+         2               4
+         3               5
+         4               6
+
+     An ldr instruction takes 1-3 cycles, but does not block the
+     pipeline.
+
+       NREGS           CYCLES
+         1              1-3
+         2              2-6
+         3              3-9
+         4              4-12
+
+     Best case ldr will always win.  However, the more ldr instructions
+     we issue, the less likely we are to be able to schedule them well.
+     Using ldr instructions also increases code size.
+
+     As a compromise, we use ldr for counts of 1 or 2 regs, and ldm
+     for counts of 3 or 4 regs.  */
+  if (nops <= 2 && arm_tune_xscale && !optimize_size)
+    return false;
    return true;
  }
  
@@ -9154,13 +9532,29 @@ compute_offset_order (int nops, HOST_WIDE_INT *unsorted_offsets, int *order,
    return true;
  }
  
-int
-load_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
-                       HOST_WIDE_INT *load_offset)
+/* Used to determine in a peephole whether a sequence of load
+   instructions can be changed into a load-multiple instruction.
+   NOPS is the number of separate load instructions we are examining.  The
+   first NOPS entries in OPERANDS are the destination registers, the
+   next NOPS entries are memory operands.  If this function is
+   successful, *BASE is set to the common base register of the memory
+   accesses; *LOAD_OFFSET is set to the first memory location's offset
+   from that base register.
+   REGS is an array filled in with the destination register numbers.
+   SAVED_ORDER (if nonnull), is an array filled in with an order that maps
+   insn numbers to an ascending order of loads.  If CHECK_REGS is true,
+   the sequence of registers in REGS matches the loads from ascending memory
+   locations, and the function verifies that the register numbers are
+   themselves ascending.  If CHECK_REGS is false, the register numbers
+   are stored in the order they are found in the operands.  */
+static int
+load_multiple_sequence (rtx *operands, int nops, int *regs, int *saved_order,
+                       int *base, HOST_WIDE_INT *load_offset, bool check_regs)
  {
    int unsorted_regs[MAX_LDM_STM_OPS];
    HOST_WIDE_INT unsorted_offsets[MAX_LDM_STM_OPS];
    int order[MAX_LDM_STM_OPS];
+  rtx base_reg_rtx = NULL;
    int base_reg = -1;
    int i, ldm_case;
  
@@ -9204,13 +9598,16 @@ load_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
                   == CONST_INT)))
         {
           if (i == 0)
-           base_reg = REGNO (reg);
-         else
             {
-             if (base_reg != (int) REGNO (reg))
-               /* Not addressed from the same base register.  */
+             base_reg = REGNO (reg);
+             base_reg_rtx = reg;
+             if (TARGET_THUMB1 && base_reg > LAST_LO_REGNUM)
                 return 0;
             }
+         else if (base_reg != (int) REGNO (reg))
+           /* Not addressed from the same base register.  */
+           return 0;
+
           unsorted_regs[i] = (GET_CODE (operands[i]) == REG
                               ? REGNO (operands[i])
                               : REGNO (SUBREG_REG (operands[i])));
@@ -9218,7 +9615,9 @@ load_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
           /* If it isn't an integer register, or if it overwrites the
              base register but isn't the last insn in the list, then
              we can't do this.  */
-         if (unsorted_regs[i] < 0 || unsorted_regs[i] > 14
+         if (unsorted_regs[i] < 0
+             || (TARGET_THUMB1 && unsorted_regs[i] > LAST_LO_REGNUM)
+             || unsorted_regs[i] > 14
               || (i != nops - 1 && unsorted_regs[i] == base_reg))
             return 0;
  
@@ -9236,26 +9635,34 @@ load_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
       order[0] has been set to the lowest offset in the list.  Sort
       the offsets into order, verifying that they are adjacent, and
       check that the register numbers are ascending.  */
-  if (!compute_offset_order (nops, unsorted_offsets, order, unsorted_regs))
+  if (!compute_offset_order (nops, unsorted_offsets, order,
+                            check_regs ? unsorted_regs : NULL))
      return 0;
  
+  if (saved_order)
+    memcpy (saved_order, order, sizeof order);
+
    if (base)
      {
        *base = base_reg;
  
        for (i = 0; i < nops; i++)
-       regs[i] = unsorted_regs[order[i]];
+       regs[i] = unsorted_regs[check_regs ? order[i] : i];
  
        *load_offset = unsorted_offsets[order[0]];
      }
  
+  if (TARGET_THUMB1
+      && !peep2_reg_dead_p (nops, base_reg_rtx))
+    return 0;
+
    if (unsorted_offsets[order[0]] == 0)
      ldm_case = 1; /* ldmia */
    else if (TARGET_ARM && unsorted_offsets[order[0]] == 4)
      ldm_case = 2; /* ldmib */
    else if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0)
      ldm_case = 3; /* ldmda */
-  else if (unsorted_offsets[order[nops - 1]] == -4)
+  else if (TARGET_32BIT && unsorted_offsets[order[nops - 1]] == -4)
      ldm_case = 4; /* ldmdb */
    else if (const_ok_for_arm (unsorted_offsets[order[0]])
            || const_ok_for_arm (-unsorted_offsets[order[0]]))
@@ -9271,79 +9678,41 @@ load_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
    return ldm_case;
  }
  
-const char *
-emit_ldm_seq (rtx *operands, int nops)
+/* Used to determine in a peephole whether a sequence of store instructions can
+   be changed into a store-multiple instruction.
+   NOPS is the number of separate store instructions we are examining.
+   NOPS_TOTAL is the total number of instructions recognized by the peephole
+   pattern.
+   The first NOPS entries in OPERANDS are the source registers, the next
+   NOPS entries are memory operands.  If this function is successful, *BASE is
+   set to the common base register of the memory accesses; *LOAD_OFFSET is set
+   to the first memory location's offset from that base register.  REGS is an
+   array filled in with the source register numbers, REG_RTXS (if nonnull) is
+   likewise filled with the corresponding rtx's.
+   SAVED_ORDER (if nonnull), is an array filled in with an order that maps insn
+   numbers to an ascending order of stores.
+   If CHECK_REGS is true, the sequence of registers in *REGS matches the stores
+   from ascending memory locations, and the function verifies that the register
+   numbers are themselves ascending.  If CHECK_REGS is false, the register
+   numbers are stored in the order they are found in the operands.  */
+static int
+store_multiple_sequence (rtx *operands, int nops, int nops_total,
+                        int *regs, rtx *reg_rtxs, int *saved_order, int *base,
+                        HOST_WIDE_INT *load_offset, bool check_regs)
  {
-  int regs[MAX_LDM_STM_OPS];
-  int base_reg;
-  HOST_WIDE_INT offset;
-  char buf[100];
-  int i;
+  int unsorted_regs[MAX_LDM_STM_OPS];
+  rtx unsorted_reg_rtxs[MAX_LDM_STM_OPS];
+  HOST_WIDE_INT unsorted_offsets[MAX_LDM_STM_OPS];
+  int order[MAX_LDM_STM_OPS];
+  int base_reg = -1;
+  rtx base_reg_rtx = NULL;
+  int i, stm_case;
  
-  switch (load_multiple_sequence (operands, nops, regs, &base_reg, &offset))
-    {
-    case 1:
-      strcpy (buf, "ldm%(ia%)\t");
-      break;
+  /* Can only handle up to MAX_LDM_STM_OPS insns at present, though could be
+     easily extended if required.  */
+  gcc_assert (nops >= 2 && nops <= MAX_LDM_STM_OPS);
  
-    case 2:
-      strcpy (buf, "ldm%(ib%)\t");
-      break;
-
-    case 3:
-      strcpy (buf, "ldm%(da%)\t");
-      break;
-
-    case 4:
-      strcpy (buf, "ldm%(db%)\t");
-      break;
-
-    case 5:
-      if (offset >= 0)
-       sprintf (buf, "add%%?\t%s%s, %s%s, #%ld", REGISTER_PREFIX,
-                reg_names[regs[0]], REGISTER_PREFIX, reg_names[base_reg],
-                (long) offset);
-      else
-       sprintf (buf, "sub%%?\t%s%s, %s%s, #%ld", REGISTER_PREFIX,
-                reg_names[regs[0]], REGISTER_PREFIX, reg_names[base_reg],
-                (long) -offset);
-      output_asm_insn (buf, operands);
-      base_reg = regs[0];
-      strcpy (buf, "ldm%(ia%)\t");
-      break;
-
-    default:
-      gcc_unreachable ();
-    }
-
-  sprintf (buf + strlen (buf), "%s%s, {%s%s", REGISTER_PREFIX,
-          reg_names[base_reg], REGISTER_PREFIX, reg_names[regs[0]]);
-
-  for (i = 1; i < nops; i++)
-    sprintf (buf + strlen (buf), ", %s%s", REGISTER_PREFIX,
-            reg_names[regs[i]]);
-
-  strcat (buf, "}\t%@ phole ldm");
-
-  output_asm_insn (buf, operands);
-  return "";
-}
-
-int
-store_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
-                        HOST_WIDE_INT * load_offset)
-{
-  int unsorted_regs[MAX_LDM_STM_OPS];
-  HOST_WIDE_INT unsorted_offsets[MAX_LDM_STM_OPS];
-  int order[MAX_LDM_STM_OPS];
-  int base_reg = -1;
-  int i, stm_case;
-
-  /* Can only handle up to MAX_LDM_STM_OPS insns at present, though could be
-     easily extended if required.  */
-  gcc_assert (nops >= 2 && nops <= MAX_LDM_STM_OPS);
-
-  memset (order, 0, MAX_LDM_STM_OPS * sizeof (int));
+  memset (order, 0, MAX_LDM_STM_OPS * sizeof (int));
  
    /* Loop over the operands and check that the memory references are
       suitable (i.e. immediate offsets from the same base register).  At
@@ -9378,17 +9747,27 @@ store_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
               && (GET_CODE (offset = XEXP (XEXP (operands[nops + i], 0), 1))
                   == CONST_INT)))
         {
-         unsorted_regs[i] = (GET_CODE (operands[i]) == REG
-                             ? REGNO (operands[i])
-                             : REGNO (SUBREG_REG (operands[i])));
+         unsorted_reg_rtxs[i] = (GET_CODE (operands[i]) == REG
+                                 ? operands[i] : SUBREG_REG (operands[i]));
+         unsorted_regs[i] = REGNO (unsorted_reg_rtxs[i]);
+
           if (i == 0)
-           base_reg = REGNO (reg);
+           {
+             base_reg = REGNO (reg);
+             base_reg_rtx = reg;
+             if (TARGET_THUMB1 && base_reg > LAST_LO_REGNUM)
+               return 0;
+           }
           else if (base_reg != (int) REGNO (reg))
             /* Not addressed from the same base register.  */
             return 0;
  
           /* If it isn't an integer register, then we can't do this.  */
-         if (unsorted_regs[i] < 0 || unsorted_regs[i] > 14)
+         if (unsorted_regs[i] < 0
+             || (TARGET_THUMB1 && unsorted_regs[i] > LAST_LO_REGNUM)
+             || (TARGET_THUMB2 && unsorted_regs[i] == base_reg)
+             || (TARGET_THUMB2 && unsorted_regs[i] == SP_REGNUM)
+             || unsorted_regs[i] > 14)
             return 0;
  
           unsorted_offsets[i] = INTVAL (offset);
@@ -9405,26 +9784,38 @@ store_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
       order[0] has been set to the lowest offset in the list.  Sort
       the offsets into order, verifying that they are adjacent, and
       check that the register numbers are ascending.  */
-  if (!compute_offset_order (nops, unsorted_offsets, order, unsorted_regs))
+  if (!compute_offset_order (nops, unsorted_offsets, order,
+                            check_regs ? unsorted_regs : NULL))
      return 0;
  
+  if (saved_order)
+    memcpy (saved_order, order, sizeof order);
+
    if (base)
      {
        *base = base_reg;
  
        for (i = 0; i < nops; i++)
-       regs[i] = unsorted_regs[order[i]];
+       {
+         regs[i] = unsorted_regs[check_regs ? order[i] : i];
+         if (reg_rtxs)
+           reg_rtxs[i] = unsorted_reg_rtxs[check_regs ? order[i] : i];
+       }
  
        *load_offset = unsorted_offsets[order[0]];
      }
  
+  if (TARGET_THUMB1
+      && !peep2_reg_dead_p (nops_total, base_reg_rtx))
+    return 0;
+
    if (unsorted_offsets[order[0]] == 0)
      stm_case = 1; /* stmia */
    else if (TARGET_ARM && unsorted_offsets[order[0]] == 4)
      stm_case = 2; /* stmib */
    else if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0)
      stm_case = 3; /* stmda */
-  else if (unsorted_offsets[order[nops - 1]] == -4)
+  else if (TARGET_32BIT && unsorted_offsets[order[nops - 1]] == -4)
      stm_case = 4; /* stmdb */
    else
      return 0;
@@ -9434,110 +9825,85 @@ store_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
  
    return stm_case;
  }
+\f
+/* Routines for use in generating RTL.  */
  
-const char *
-emit_stm_seq (rtx *operands, int nops)
+/* Generate a load-multiple instruction.  COUNT is the number of loads in
+   the instruction; REGS and MEMS are arrays containing the operands.
+   BASEREG is the base register to be used in addressing the memory operands.
+   WBACK_OFFSET, if nonzero, is the constant the instruction adds to the base
+   register (write-back); zero means the base register is not updated.  */
+
+static rtx
+arm_gen_load_multiple_1 (int count, int *regs, rtx *mems, rtx basereg,
+                        HOST_WIDE_INT wback_offset)
  {
-  int regs[MAX_LDM_STM_OPS];
-  int base_reg;
-  HOST_WIDE_INT offset;
-  char buf[100];
-  int i;
+  int i = 0, j;
+  rtx result;
  
-  switch (store_multiple_sequence (operands, nops, regs, &base_reg, &offset))
+  if (!multiple_operation_profitable_p (false, count, 0))
      {
-    case 1:
-      strcpy (buf, "stm%(ia%)\t");
-      break;
+      rtx seq;
  
-    case 2:
-      strcpy (buf, "stm%(ib%)\t");
-      break;
+      start_sequence ();
  
-    case 3:
-      strcpy (buf, "stm%(da%)\t");
-      break;
+      for (i = 0; i < count; i++)
+       emit_move_insn (gen_rtx_REG (SImode, regs[i]), mems[i]);
  
-    case 4:
-      strcpy (buf, "stm%(db%)\t");
-      break;
+      if (wback_offset != 0)
+       emit_move_insn (basereg, plus_constant (basereg, wback_offset));
  
-    default:
-      gcc_unreachable ();
-    }
+      seq = get_insns ();
+      end_sequence ();
  
-  sprintf (buf + strlen (buf), "%s%s, {%s%s", REGISTER_PREFIX,
-          reg_names[base_reg], REGISTER_PREFIX, reg_names[regs[0]]);
+      return seq;
+    }
  
-  for (i = 1; i < nops; i++)
-    sprintf (buf + strlen (buf), ", %s%s", REGISTER_PREFIX,
-            reg_names[regs[i]]);
+  result = gen_rtx_PARALLEL (VOIDmode,
+                            rtvec_alloc (count + (wback_offset != 0 ? 1 : 0)));
+  if (wback_offset != 0)
+    {
+      XVECEXP (result, 0, 0)
+       = gen_rtx_SET (VOIDmode, basereg,
+                      plus_constant (basereg, wback_offset));
+      i = 1;
+      count++;
+    }
  
-  strcat (buf, "}\t%@ phole stm");
+  for (j = 0; i < count; i++, j++)
+    XVECEXP (result, 0, i)
+      = gen_rtx_SET (VOIDmode, gen_rtx_REG (SImode, regs[j]), mems[j]);
  
-  output_asm_insn (buf, operands);
-  return "";
+  return result;
  }
-\f
-/* Routines for use in generating RTL.  */
  
-rtx
-arm_gen_load_multiple (int base_regno, int count, rtx from, int up,
-                      int write_back, rtx basemem, HOST_WIDE_INT *offsetp)
+/* Generate a store-multiple instruction.  COUNT is the number of stores in
+   the instruction; REGS and MEMS are arrays containing the operands.
+   BASEREG is the base register to be used in addressing the memory operands.
+   WBACK_OFFSET, if nonzero, is the constant the instruction adds to the base
+   register (write-back); zero means the base register is not updated.  */
+
+static rtx
+arm_gen_store_multiple_1 (int count, int *regs, rtx *mems, rtx basereg,
+                         HOST_WIDE_INT wback_offset)
  {
-  HOST_WIDE_INT offset = *offsetp;
    int i = 0, j;
    rtx result;
-  int sign = up ? 1 : -1;
-  rtx mem, addr;
-
-  /* XScale has load-store double instructions, but they have stricter
-     alignment requirements than load-store multiple, so we cannot
-     use them.
  
-     For XScale ldm requires 2 + NREGS cycles to complete and blocks
-     the pipeline until completion.
+  if (GET_CODE (basereg) == PLUS)
+    basereg = XEXP (basereg, 0);
  
-       NREGS           CYCLES
-         1               3
-         2               4
-         3               5
-         4               6
-
-     An ldr instruction takes 1-3 cycles, but does not block the
-     pipeline.
-
-       NREGS           CYCLES
-         1              1-3
-         2              2-6
-         3              3-9
-         4              4-12
-
-     Best case ldr will always win.  However, the more ldr instructions
-     we issue, the less likely we are to be able to schedule them well.
-     Using ldr instructions also increases code size.
-
-     As a compromise, we use ldr for counts of 1 or 2 regs, and ldm
-     for counts of 3 or 4 regs.  */
-  if (arm_tune_xscale && count <= 2 && ! optimize_size)
+  if (!multiple_operation_profitable_p (false, count, 0))
      {
        rtx seq;
  
        start_sequence ();
  
        for (i = 0; i < count; i++)
-       {
-         addr = plus_constant (from, i * 4 * sign);
-         mem = adjust_automodify_address (basemem, SImode, addr, offset);
-         emit_move_insn (gen_rtx_REG (SImode, base_regno + i), mem);
-         offset += 4 * sign;
-       }
+       emit_move_insn (mems[i], gen_rtx_REG (SImode, regs[i]));
  
-      if (write_back)
-       {
-         emit_move_insn (from, plus_constant (from, count * 4 * sign));
-         *offsetp = offset;
-       }
+      if (wback_offset != 0)
+       emit_move_insn (basereg, plus_constant (basereg, wback_offset));
  
        seq = get_insns ();
        end_sequence ();
@@ -9546,92 +9912,319 @@ arm_gen_load_multiple (int base_regno, int count, rtx from, int up,
      }
  
    result = gen_rtx_PARALLEL (VOIDmode,
-                            rtvec_alloc (count + (write_back ? 1 : 0)));
-  if (write_back)
+                            rtvec_alloc (count + (wback_offset != 0 ? 1 : 0)));
+  if (wback_offset != 0)
      {
        XVECEXP (result, 0, 0)
-       = gen_rtx_SET (VOIDmode, from, plus_constant (from, count * 4 * sign));
+       = gen_rtx_SET (VOIDmode, basereg,
+                      plus_constant (basereg, wback_offset));
        i = 1;
        count++;
      }
  
    for (j = 0; i < count; i++, j++)
+    XVECEXP (result, 0, i)
+      = gen_rtx_SET (VOIDmode, mems[j], gen_rtx_REG (SImode, regs[j]));
+
+  return result;
+}
+
+/* Generate either a load-multiple or a store-multiple instruction.  This
+   function can be used in situations where we can start with a single MEM
+   rtx and adjust its address upwards.
+   COUNT is the number of operations in the instruction, not counting a
+   possible update of the base register.  REGS is an array containing the
+   register operands.
+   BASEREG is the base register to be used in addressing the memory operands,
+   which are constructed from BASEMEM.
+   WRITE_BACK specifies whether the generated instruction should include an
+   update of the base register.
+   OFFSETP is used to pass an offset to and from this function; this offset
+   is not used when constructing the address (instead BASEMEM should have an
+   appropriate offset in its address), it is used only for setting
+   MEM_OFFSET.  It is updated only if WRITE_BACK is true.  */
+
+static rtx
+arm_gen_multiple_op (bool is_load, int *regs, int count, rtx basereg,
+                    bool write_back, rtx basemem, HOST_WIDE_INT *offsetp)
+{
+  rtx mems[MAX_LDM_STM_OPS];
+  HOST_WIDE_INT offset = *offsetp;
+  int i;
+
+  gcc_assert (count <= MAX_LDM_STM_OPS);
+
+  if (GET_CODE (basereg) == PLUS)
+    basereg = XEXP (basereg, 0);
+
+  for (i = 0; i < count; i++)
      {
-      addr = plus_constant (from, j * 4 * sign);
-      mem = adjust_automodify_address_nv (basemem, SImode, addr, offset);
-      XVECEXP (result, 0, i)
-       = gen_rtx_SET (VOIDmode, gen_rtx_REG (SImode, base_regno + j), mem);
-      offset += 4 * sign;
+      rtx addr = plus_constant (basereg, i * 4);
+      mems[i] = adjust_automodify_address_nv (basemem, SImode, addr, offset);
+      offset += 4;
      }
  
    if (write_back)
      *offsetp = offset;
  
-  return result;
+  if (is_load)
+    return arm_gen_load_multiple_1 (count, regs, mems, basereg,
+                                   write_back ? 4 * count : 0);
+  else
+    return arm_gen_store_multiple_1 (count, regs, mems, basereg,
+                                    write_back ? 4 * count : 0);
  }
  
  rtx
-arm_gen_store_multiple (int base_regno, int count, rtx to, int up,
-                       int write_back, rtx basemem, HOST_WIDE_INT *offsetp)
+arm_gen_load_multiple (int *regs, int count, rtx basereg, int write_back,
+                      rtx basemem, HOST_WIDE_INT *offsetp)
  {
-  HOST_WIDE_INT offset = *offsetp;
-  int i = 0, j;
-  rtx result;
-  int sign = up ? 1 : -1;
-  rtx mem, addr;
+  return arm_gen_multiple_op (TRUE, regs, count, basereg, write_back, basemem,
+                             offsetp);
+}
  
-  /* See arm_gen_load_multiple for discussion of
-     the pros/cons of ldm/stm usage for XScale.  */
-  if (arm_tune_xscale && count <= 2 && ! optimize_size)
-    {
-      rtx seq;
+rtx
+arm_gen_store_multiple (int *regs, int count, rtx basereg, int write_back,
+                       rtx basemem, HOST_WIDE_INT *offsetp)
+{
+  return arm_gen_multiple_op (FALSE, regs, count, basereg, write_back, basemem,
+                             offsetp);
+}
  
-      start_sequence ();
+/* Called from a peephole2 expander to turn a sequence of loads into an
+   LDM instruction.  OPERANDS are the operands found by the peephole matcher;
+   NOPS indicates how many separate loads we are trying to combine.  SORT_REGS
+   is true if we can reorder the registers because they are used commutatively
+   subsequently.
+   Returns true iff we could generate a new instruction.  */
  
-      for (i = 0; i < count; i++)
-       {
-         addr = plus_constant (to, i * 4 * sign);
-         mem = adjust_automodify_address (basemem, SImode, addr, offset);
-         emit_move_insn (mem, gen_rtx_REG (SImode, base_regno + i));
-         offset += 4 * sign;
-       }
+bool
+gen_ldm_seq (rtx *operands, int nops, bool sort_regs)
+{
+  int regs[MAX_LDM_STM_OPS], mem_order[MAX_LDM_STM_OPS];
+  rtx mems[MAX_LDM_STM_OPS];
+  int i, j, base_reg;
+  rtx base_reg_rtx;
+  HOST_WIDE_INT offset;
+  int write_back = FALSE;
+  int ldm_case;
+  rtx addr;
+
+  ldm_case = load_multiple_sequence (operands, nops, regs, mem_order,
+                                    &base_reg, &offset, !sort_regs);
+
+  if (ldm_case == 0)
+    return false;
+
+  if (sort_regs)
+    for (i = 0; i < nops - 1; i++)
+      for (j = i + 1; j < nops; j++)
+       if (regs[i] > regs[j])
+         {
+           int t = regs[i];
+           regs[i] = regs[j];
+           regs[j] = t;
+         }
+  base_reg_rtx = gen_rtx_REG (Pmode, base_reg);
+
+  if (TARGET_THUMB1)
+    {
+      gcc_assert (peep2_reg_dead_p (nops, base_reg_rtx));
+      gcc_assert (ldm_case == 1 || ldm_case == 5);
+      write_back = TRUE;
+    }
  
-      if (write_back)
+  if (ldm_case == 5)
+    {
+      rtx newbase = TARGET_THUMB1 ? base_reg_rtx : gen_rtx_REG (SImode, regs[0]);
+      emit_insn (gen_addsi3 (newbase, base_reg_rtx, GEN_INT (offset)));
+      offset = 0;
+      if (!TARGET_THUMB1)
         {
-         emit_move_insn (to, plus_constant (to, count * 4 * sign));
-         *offsetp = offset;
+         base_reg = regs[0];
+         base_reg_rtx = newbase;
         }
+    }
  
-      seq = get_insns ();
-      end_sequence ();
+  for (i = 0; i < nops; i++)
+    {
+      addr = plus_constant (base_reg_rtx, offset + i * 4);
+      mems[i] = adjust_automodify_address_nv (operands[nops + mem_order[i]],
+                                             SImode, addr, 0);
+    }
+  emit_insn (arm_gen_load_multiple_1 (nops, regs, mems, base_reg_rtx,
+                                     write_back ? offset + i * 4 : 0));
+  return true;
+}
  
-      return seq;
+/* Called from a peephole2 expander to turn a sequence of stores into an
+   STM instruction.  OPERANDS are the operands found by the peephole matcher;
+   NOPS indicates how many separate stores we are trying to combine.
+   Returns true iff we could generate a new instruction.  */
+
+bool
+gen_stm_seq (rtx *operands, int nops)
+{
+  int i;
+  int regs[MAX_LDM_STM_OPS], mem_order[MAX_LDM_STM_OPS];
+  rtx mems[MAX_LDM_STM_OPS];
+  int base_reg;
+  rtx base_reg_rtx;
+  HOST_WIDE_INT offset;
+  int write_back = FALSE;
+  int stm_case;
+  rtx addr;
+  bool base_reg_dies;
+
+  stm_case = store_multiple_sequence (operands, nops, nops, regs, NULL,
+                                     mem_order, &base_reg, &offset, true);
+
+  if (stm_case == 0)
+    return false;
+
+  base_reg_rtx = gen_rtx_REG (Pmode, base_reg);
+
+  base_reg_dies = peep2_reg_dead_p (nops, base_reg_rtx);
+  if (TARGET_THUMB1)
+    {
+      gcc_assert (base_reg_dies);
+      write_back = TRUE;
      }
  
-  result = gen_rtx_PARALLEL (VOIDmode,
-                            rtvec_alloc (count + (write_back ? 1 : 0)));
-  if (write_back)
+  if (stm_case == 5)
      {
-      XVECEXP (result, 0, 0)
-       = gen_rtx_SET (VOIDmode, to,
-                      plus_constant (to, count * 4 * sign));
-      i = 1;
-      count++;
+      gcc_assert (base_reg_dies);
+      emit_insn (gen_addsi3 (base_reg_rtx, base_reg_rtx, GEN_INT (offset)));
+      offset = 0;
      }
  
-  for (j = 0; i < count; i++, j++)
+  addr = plus_constant (base_reg_rtx, offset);
+
+  for (i = 0; i < nops; i++)
      {
-      addr = plus_constant (to, j * 4 * sign);
-      mem = adjust_automodify_address_nv (basemem, SImode, addr, offset);
-      XVECEXP (result, 0, i)
-       = gen_rtx_SET (VOIDmode, mem, gen_rtx_REG (SImode, base_regno + j));
-      offset += 4 * sign;
+      addr = plus_constant (base_reg_rtx, offset + i * 4);
+      mems[i] = adjust_automodify_address_nv (operands[nops + mem_order[i]],
+                                             SImode, addr, 0);
      }
+  emit_insn (arm_gen_store_multiple_1 (nops, regs, mems, base_reg_rtx,
+                                      write_back ? offset + i * 4 : 0));
+  return true;
+}
  
-  if (write_back)
-    *offsetp = offset;
+/* Called from a peephole2 expander to turn a sequence of stores that are
+   preceded by constant loads into an STM instruction.  OPERANDS are the
+   operands found by the peephole matcher; NOPS indicates how many
+   separate stores we are trying to combine; there are 2 * NOPS
+   instructions in the peephole.
+   Returns true iff we could generate a new instruction.  */
  
-  return result;
+bool
+gen_const_stm_seq (rtx *operands, int nops)
+{
+  int regs[MAX_LDM_STM_OPS], sorted_regs[MAX_LDM_STM_OPS];
+  int reg_order[MAX_LDM_STM_OPS], mem_order[MAX_LDM_STM_OPS];
+  rtx reg_rtxs[MAX_LDM_STM_OPS], orig_reg_rtxs[MAX_LDM_STM_OPS];
+  rtx mems[MAX_LDM_STM_OPS];
+  int base_reg;
+  rtx base_reg_rtx;
+  HOST_WIDE_INT offset;
+  int write_back = FALSE;
+  int stm_case;
+  rtx addr;
+  bool base_reg_dies;
+  int i, j;
+  HARD_REG_SET allocated;
+
+  stm_case = store_multiple_sequence (operands, nops, 2 * nops, regs, reg_rtxs,
+                                     mem_order, &base_reg, &offset, false);
+
+  if (stm_case == 0)
+    return false;
+
+  memcpy (orig_reg_rtxs, reg_rtxs, sizeof orig_reg_rtxs);
+
+  /* If the same register is used more than once, try to find a free
+     register.  */
+  CLEAR_HARD_REG_SET (allocated);
+  for (i = 0; i < nops; i++)
+    {
+      for (j = i + 1; j < nops; j++)
+       if (regs[i] == regs[j])
+         {
+           rtx t = peep2_find_free_register (0, nops * 2,
+                                             TARGET_THUMB1 ? "l" : "r",
+                                             SImode, &allocated);
+           if (t == NULL_RTX)
+             return false;
+           reg_rtxs[i] = t;
+           regs[i] = REGNO (t);
+         }
+    }
+
+  /* Compute an ordering that maps the register numbers to an ascending
+     sequence.  */
+  reg_order[0] = 0;
+  for (i = 0; i < nops; i++)
+    if (regs[i] < regs[reg_order[0]])
+      reg_order[0] = i;
+
+  for (i = 1; i < nops; i++)
+    {
+      int this_order = reg_order[i - 1];
+      for (j = 0; j < nops; j++)
+       if (regs[j] > regs[reg_order[i - 1]]
+           && (this_order == reg_order[i - 1]
+               || regs[j] < regs[this_order]))
+         this_order = j;
+      reg_order[i] = this_order;
+    }
+
+  /* Ensure that registers that must be live after the instruction end
+     up with the correct value.  */
+  for (i = 0; i < nops; i++)
+    {
+      int this_order = reg_order[i];
+      if ((this_order != mem_order[i]
+          || orig_reg_rtxs[this_order] != reg_rtxs[this_order])
+         && !peep2_reg_dead_p (nops * 2, orig_reg_rtxs[this_order]))
+       return false;
+    }
+
+  /* Load the constants.  */
+  for (i = 0; i < nops; i++)
+    {
+      rtx op = operands[2 * nops + mem_order[i]];
+      sorted_regs[i] = regs[reg_order[i]];
+      emit_move_insn (reg_rtxs[reg_order[i]], op);
+    }
+
+  base_reg_rtx = gen_rtx_REG (Pmode, base_reg);
+
+  base_reg_dies = peep2_reg_dead_p (nops * 2, base_reg_rtx);
+  if (TARGET_THUMB1)
+    {
+      gcc_assert (base_reg_dies);
+      write_back = TRUE;
+    }
+
+  if (stm_case == 5)
+    {
+      gcc_assert (base_reg_dies);
+      emit_insn (gen_addsi3 (base_reg_rtx, base_reg_rtx, GEN_INT (offset)));
+      offset = 0;
+    }
+
+  addr = plus_constant (base_reg_rtx, offset);
+
+  for (i = 0; i < nops; i++)
+    {
+      addr = plus_constant (base_reg_rtx, offset + i * 4);
+      mems[i] = adjust_automodify_address_nv (operands[nops + mem_order[i]],
+                                             SImode, addr, 0);
+    }
+  emit_insn (arm_gen_store_multiple_1 (nops, sorted_regs, mems, base_reg_rtx,
+                                      write_back ? offset + i * 4 : 0));
+  return true;
  }
  
  int
@@ -9667,20 +10260,21 @@ arm_gen_movmemqi (rtx *operands)
    for (i = 0; in_words_to_go >= 2; i+=4)
      {
        if (in_words_to_go > 4)
-       emit_insn (arm_gen_load_multiple (0, 4, src, TRUE, TRUE,
-                                         srcbase, &srcoffset));
+       emit_insn (arm_gen_load_multiple (arm_regs_in_sequence, 4, src,
+                                         TRUE, srcbase, &srcoffset));
        else
-       emit_insn (arm_gen_load_multiple (0, in_words_to_go, src, TRUE,
-                                         FALSE, srcbase, &srcoffset));
+       emit_insn (arm_gen_load_multiple (arm_regs_in_sequence, in_words_to_go,
+                                         src, FALSE, srcbase,
+                                         &srcoffset));
  
        if (out_words_to_go)
         {
           if (out_words_to_go > 4)
-           emit_insn (arm_gen_store_multiple (0, 4, dst, TRUE, TRUE,
-                                              dstbase, &dstoffset));
+           emit_insn (arm_gen_store_multiple (arm_regs_in_sequence, 4, dst,
+                                              TRUE, dstbase, &dstoffset));
           else if (out_words_to_go != 1)
-           emit_insn (arm_gen_store_multiple (0, out_words_to_go,
-                                              dst, TRUE,
+           emit_insn (arm_gen_store_multiple (arm_regs_in_sequence,
+                                              out_words_to_go, dst,
                                                (last_bytes == 0
                                                 ? FALSE : TRUE),
                                                dstbase, &dstoffset));
@@ -10055,6 +10649,55 @@ arm_select_cc_mode (enum rtx_code op, rtx x, rtx y)
        && (rtx_equal_p (XEXP (x, 0), y) || rtx_equal_p (XEXP (x, 1), y)))
      return CC_Cmode;
  
+  if (GET_MODE (x) == DImode || GET_MODE (y) == DImode)
+    {
+      /* To keep things simple, always use the Cirrus cfcmp64 if it is
+        available.  */
+      if (TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK)
+       return CCmode;
+
+      switch (op)
+       {
+       case EQ:
+       case NE:
+         /* A DImode comparison against zero can be implemented by
+            or'ing the two halves together.  */
+         if (y == const0_rtx)
+           return CC_Zmode;
+
+         /* We can do an equality test in three Thumb instructions.  */
+         if (!TARGET_ARM)
+           return CC_Zmode;
+
+         /* FALLTHROUGH */
+
+       case LTU:
+       case LEU:
+       case GTU:
+       case GEU:
+         /* DImode unsigned comparisons can be implemented by cmp +
+            cmpeq without a scratch register.  Not worth doing in
+            Thumb-2.  */
+         if (TARGET_ARM)
+           return CC_CZmode;
+
+         /* FALLTHROUGH */
+
+       case LT:
+       case LE:
+       case GT:
+       case GE:
+         /* DImode signed and unsigned comparisons can be implemented
+            by cmp + sbcs with a scratch register, but that does not
+            set the Z flag - we must reverse GT/LE/GTU/LEU.  */
+         gcc_assert (op != EQ && op != NE);
+         return CC_NCVmode;
+
+       default:
+         gcc_unreachable ();
+       }
+    }
+
    return CCmode;
  }
  
@@ -10064,10 +10707,39 @@ arm_select_cc_mode (enum rtx_code op, rtx x, rtx y)
  rtx
  arm_gen_compare_reg (enum rtx_code code, rtx x, rtx y)
  {
-  enum machine_mode mode = SELECT_CC_MODE (code, x, y);
-  rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
+  enum machine_mode mode;
+  rtx cc_reg;
+  int dimode_comparison = GET_MODE (x) == DImode || GET_MODE (y) == DImode;
  
-  emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
+  /* We might have X as a constant, Y as a register because of the predicates
+     used for cmpdi.  If so, force X to a register here.  */
+  if (dimode_comparison && !REG_P (x))
+    x = force_reg (DImode, x);
+
+  mode = SELECT_CC_MODE (code, x, y);
+  cc_reg = gen_rtx_REG (mode, CC_REGNUM);
+
+  if (dimode_comparison
+      && !(TARGET_HARD_FLOAT && TARGET_MAVERICK)
+      && mode != CC_CZmode)
+    {
+      rtx clobber, set;
+
+      /* To compare two non-zero values for equality, XOR them and
+        then compare against zero.  Not used for ARM mode; there
+        CC_CZmode is cheaper.  */
+      if (mode == CC_Zmode && y != const0_rtx)
+       {
+         x = expand_binop (DImode, xor_optab, x, y, NULL_RTX, 0, OPTAB_WIDEN);
+         y = const0_rtx;
+       }
+      /* A scratch register is required.  */
+      clobber = gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (SImode));
+      set = gen_rtx_SET (VOIDmode, cc_reg, gen_rtx_COMPARE (mode, x, y));
+      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clobber)));
+    }
+  else
+    emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
  
    return cc_reg;
  }
@@ -11396,6 +12068,34 @@ arm_const_double_by_parts (rtx val)
    return false;
  }
  
+/* Return true if it is possible to inline both the high and low parts
+   of a 64-bit constant into 32-bit data processing instructions.  */
+bool
+arm_const_double_by_immediates (rtx val)
+{
+  enum machine_mode mode = GET_MODE (val);
+  rtx part;
+
+  if (mode == VOIDmode)
+    mode = DImode;
+
+  part = gen_highpart_mode (SImode, mode, val);
+
+  gcc_assert (GET_CODE (part) == CONST_INT);
+
+  if (!const_ok_for_arm (INTVAL (part)))
+    return false;
+
+  part = gen_lowpart (SImode, val);
+
+  gcc_assert (GET_CODE (part) == CONST_INT);
+
+  if (!const_ok_for_arm (INTVAL (part)))
+    return false;
+
+  return true;
+}
+
  /* Scan INSN and note any of its operands that need fixing.
     If DO_PUSHES is false we do not actually push any of the fixups
     needed.  The function returns TRUE if any fixups were needed/pushed.
@@ -11488,6 +12188,7 @@ thumb2_reorg (void)
    FOR_EACH_BB (bb)
      {
        rtx insn;
+
        COPY_REG_SET (&live, DF_LR_OUT (bb));
        df_simulate_initialize_backwards (bb, &live);
        FOR_BB_INSNS_REVERSE (bb, insn)
@@ -11505,21 +12206,43 @@ thumb2_reorg (void)
                   rtx dst = XEXP (pat, 0);
                   rtx src = XEXP (pat, 1);
                   rtx op0 = XEXP (src, 0);
+                 rtx op1 = (GET_RTX_CLASS (GET_CODE (src)) == RTX_COMM_ARITH
+                            ? XEXP (src, 1) : NULL);
+
                   if (rtx_equal_p (dst, op0)
                       || GET_CODE (src) == PLUS || GET_CODE (src) == MINUS)
                     {
                       rtx ccreg = gen_rtx_REG (CCmode, CC_REGNUM);
                       rtx clobber = gen_rtx_CLOBBER (VOIDmode, ccreg);
                       rtvec vec = gen_rtvec (2, pat, clobber);
+
+                     PATTERN (insn) = gen_rtx_PARALLEL (VOIDmode, vec);
+                     INSN_CODE (insn) = -1;
+                   }
+                 /* We can also handle a commutative operation where the
+                    second operand matches the destination.  */
+                 else if (op1 && rtx_equal_p (dst, op1))
+                   {
+                     rtx ccreg = gen_rtx_REG (CCmode, CC_REGNUM);
+                     rtx clobber = gen_rtx_CLOBBER (VOIDmode, ccreg);
+                     rtvec vec;
+
+                     src = copy_rtx (src);
+                     XEXP (src, 0) = op1;
+                     XEXP (src, 1) = op0;
+                     pat = gen_rtx_SET (VOIDmode, dst, src);
+                     vec = gen_rtvec (2, pat, clobber);
                       PATTERN (insn) = gen_rtx_PARALLEL (VOIDmode, vec);
                       INSN_CODE (insn) = -1;
                     }
                 }
             }
+
           if (NONDEBUG_INSN_P (insn))
             df_simulate_one_insn_backwards (bb, insn, &live);
         }
      }
+
    CLEAR_REG_SET (&live);
  }
  
@@ -12409,13 +13132,13 @@ output_move_double (rtx *operands)
             {
               if (GET_CODE (XEXP (operands[0], 0)) == PRE_MODIFY)
                 {
-                 output_asm_insn ("ldr%?\t%0, [%1, %2]!", otherops);
-                 output_asm_insn ("ldr%?\t%H0, [%1, #4]", otherops);
+                 output_asm_insn ("str%?\t%0, [%1, %2]!", otherops);
+                 output_asm_insn ("str%?\t%H0, [%1, #4]", otherops);
                 }
               else
                 {
-                 output_asm_insn ("ldr%?\t%H0, [%1, #4]", otherops);
-                 output_asm_insn ("ldr%?\t%0, [%1], %2", otherops);
+                 output_asm_insn ("str%?\t%H0, [%1, #4]", otherops);
+                 output_asm_insn ("str%?\t%0, [%1], %2", otherops);
                 }
             }
           else if (GET_CODE (XEXP (operands[0], 0)) == PRE_MODIFY)
@@ -12725,6 +13448,84 @@ output_move_neon (rtx *operands)
    return "";
  }
  
+/* Compute and return the length of neon_mov<mode>, where <mode> is
+   one of the VSTRUCT modes: EI, OI, CI or XI.  */
+int
+arm_attr_length_move_neon (rtx insn)
+{
+  rtx reg, mem, addr;
+  int load;
+  enum machine_mode mode;
+
+  extract_insn_cached (insn);
+
+  if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
+    {
+      mode = GET_MODE (recog_data.operand[0]);
+      switch (mode)
+       {
+       case EImode:
+       case OImode:
+         return 8;
+       case CImode:
+         return 12;
+       case XImode:
+         return 16;
+       default:
+         gcc_unreachable ();
+       }
+    }
+
+  load = REG_P (recog_data.operand[0]);
+  reg = recog_data.operand[!load];
+  mem = recog_data.operand[load];
+
+  gcc_assert (MEM_P (mem));
+
+  mode = GET_MODE (reg);
+  addr = XEXP (mem, 0);
+
+  /* Strip off const from addresses like (const (plus (...))).  */
+  if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS)
+    addr = XEXP (addr, 0);
+
+  if (GET_CODE (addr) == LABEL_REF || GET_CODE (addr) == PLUS)
+    {
+      int insns = HARD_REGNO_NREGS (REGNO (reg), mode) / 2;
+      return insns * 4;
+    }
+  else
+    return 4;
+}
+
+/* Return nonzero if the offset in the address is an immediate.  Otherwise,
+   return zero.  */
+
+int
+arm_address_offset_is_imm (rtx insn)
+{
+  rtx mem, addr;
+
+  extract_insn_cached (insn);
+
+  if (REG_P (recog_data.operand[0]))
+    return 0;
+
+  mem = recog_data.operand[0];
+
+  gcc_assert (MEM_P (mem));
+
+  addr = XEXP (mem, 0);
+
+  if (GET_CODE (addr) == REG
+      || (GET_CODE (addr) == PLUS
+         && GET_CODE (XEXP (addr, 0)) == REG
+         && GET_CODE (XEXP (addr, 1)) == CONST_INT))
+    return 1;
+  else
+    return 0;
+}
+
  /* Output an ADD r, s, #n where n may be too big for one instruction.
     If adding zero to one register, output nothing.  */
  const char *
@@ -13882,7 +14683,8 @@ arm_output_epilogue (rtx sibling)
                   && !crtl->tail_call_emit)
                 {
                   unsigned long mask;
-                 mask = (1 << (arm_size_return_regs() / 4)) - 1;
+                  /* Preserve return values, of any size.  */
+                 mask = (1 << ((arm_size_return_regs() + 3) / 4)) - 1;
                   mask ^= 0xf;
                   mask &= ~saved_regs_mask;
                   reg = 0;
@@ -14506,7 +15308,8 @@ arm_get_frame_offsets (void)
              generates better code on Thumb-2 by avoiding the need to
              use 32-bit push/pop instructions.  */
           if (!crtl->tail_call_emit
-             && arm_size_return_regs () <= 12)
+             && arm_size_return_regs () <= 12
+             && (offsets->saved_regs_mask & (1 << 3)) == 0)
             {
               reg = 3;
             }
@@ -15060,7 +15863,8 @@ arm_expand_prologue (void)
       using the EABI unwinder, to prevent faulting instructions from being
       swapped with a stack adjustment.  */
    if (crtl->profile || !TARGET_SCHED_PROLOG
-      || (ARM_EABI_UNWIND_TABLES && cfun->can_throw_non_call_exceptions))
+      || (arm_except_unwind_info (&global_options) == UI_TARGET
+         && cfun->can_throw_non_call_exceptions))
      emit_insn (gen_blockage ());
  
    /* If the link register is being kept alive, with the return address in it,
@@ -15297,8 +16101,18 @@ arm_print_operand (FILE *stream, rtx x, int code)
          the value being loaded is big-wordian or little-wordian.  The
          order of the two register loads can matter however, if the address
          of the memory location is actually held in one of the registers
-        being overwritten by the load.  */
+        being overwritten by the load.
+
+        The 'Q' and 'R' operand codes also accept 64-bit constants,
+        printing the low and high 32-bit halves respectively.  */
      case 'Q':
+      if (GET_CODE (x) == CONST_INT || GET_CODE (x) == CONST_DOUBLE)
+       {
+         rtx part = gen_lowpart (SImode, x);
+         fprintf (stream, "#" HOST_WIDE_INT_PRINT_DEC, INTVAL (part));
+         return;
+       }
+
        if (GET_CODE (x) != REG || REGNO (x) > LAST_ARM_REGNUM)
         {
           output_operand_lossage ("invalid operand for code '%c'", code);
@@ -15309,6 +16123,18 @@ arm_print_operand (FILE *stream, rtx x, int code)
        return;
  
      case 'R':
+      if (GET_CODE (x) == CONST_INT || GET_CODE (x) == CONST_DOUBLE)
+       {
+         enum machine_mode mode = GET_MODE (x);
+         rtx part;
+
+         if (mode == VOIDmode)
+           mode = DImode;
+         part = gen_highpart_mode (SImode, mode, x);
+         fprintf (stream, "#" HOST_WIDE_INT_PRINT_DEC, INTVAL (part));
+         return;
+       }
+
        if (GET_CODE (x) != REG || REGNO (x) > LAST_ARM_REGNUM)
         {
           output_operand_lossage ("invalid operand for code '%c'", code);
@@ -15631,6 +16457,8 @@ arm_print_operand (FILE *stream, rtx x, int code)
        {
         rtx addr;
         bool postinc = FALSE;
+       unsigned align, modesize, align_bits;
+
         gcc_assert (GET_CODE (x) == MEM);
         addr = XEXP (x, 0);
         if (GET_CODE (addr) == POST_INC)
@@ -15638,12 +16466,45 @@ arm_print_operand (FILE *stream, rtx x, int code)
             postinc = 1;
             addr = XEXP (addr, 0);
           }
-       asm_fprintf (stream, "[%r]", REGNO (addr));
+       asm_fprintf (stream, "[%r", REGNO (addr));
+
+       /* We know the alignment of this access, so we can emit a hint in the
+          instruction (for some alignments) as an aid to the memory subsystem
+          of the target.  */
+       align = MEM_ALIGN (x) >> 3;
+       modesize = GET_MODE_SIZE (GET_MODE (x));
+       
+       /* Only certain alignment specifiers are supported by the hardware.  */
+       if (modesize == 16 && (align % 32) == 0)
+         align_bits = 256;
+       else if ((modesize == 8 || modesize == 16) && (align % 16) == 0)
+         align_bits = 128;
+       else if ((align % 8) == 0)
+         align_bits = 64;
+       else
+         align_bits = 0;
+       
+       if (align_bits != 0)
+         asm_fprintf (stream, ":%d", align_bits);
+
+       asm_fprintf (stream, "]");
+
         if (postinc)
           fputs("!", stream);
        }
        return;
  
+    case 'C':
+      {
+       rtx addr;
+
+       gcc_assert (GET_CODE (x) == MEM);
+       addr = XEXP (x, 0);
+       gcc_assert (GET_CODE (addr) == REG);
+       asm_fprintf (stream, "[%r]", REGNO (addr));
+      }
+      return;
+
      /* Translate an S register number into a D register number and element index.  */
      case 'y':
        {
@@ -16129,11 +16990,33 @@ get_arm_condition_code (rtx comparison)
  
      case CC_Cmode:
        switch (comp_code)
-      {
-      case LTU: return ARM_CS;
-      case GEU: return ARM_CC;
-      default: gcc_unreachable ();
-      }
+       {
+       case LTU: return ARM_CS;
+       case GEU: return ARM_CC;
+       default: gcc_unreachable ();
+       }
+
+    case CC_CZmode:
+      switch (comp_code)
+       {
+       case NE: return ARM_NE;
+       case EQ: return ARM_EQ;
+       case GEU: return ARM_CS;
+       case GTU: return ARM_HI;
+       case LEU: return ARM_LS;
+       case LTU: return ARM_CC;
+       default: gcc_unreachable ();
+       }
+
+    case CC_NCVmode:
+      switch (comp_code)
+       {
+       case GE: return ARM_GE;
+       case LT: return ARM_LT;
+       case GEU: return ARM_CS;
+       case LTU: return ARM_CC;
+       default: gcc_unreachable ();
+       }
  
      case CCmode:
        switch (comp_code)
@@ -18268,7 +19151,9 @@ neon_builtin_compare (const void *a, const void *b)
  static enum insn_code
  locate_neon_builtin_icode (int fcode, neon_itype *itype)
  {
-  neon_builtin_datum key, *found;
+  neon_builtin_datum key
+    = { NULL, (neon_itype) 0, 0, { CODE_FOR_nothing }, 0, 0 };
+  neon_builtin_datum *found;
    int idx;
  
    key.base_fcode = fcode;
@@ -18861,7 +19746,7 @@ thumb_pushpop (FILE *f, unsigned long mask, int push, int *cfa_offset,
        return;
      }
  
-  if (ARM_EABI_UNWIND_TABLES && push)
+  if (push && arm_except_unwind_info (&global_options) == UI_TARGET)
      {
        fprintf (f, "\t.save\t{");
        for (regno = 0; regno < 15; regno++)
@@ -19221,14 +20106,45 @@ thumb_exit (FILE *f, int reg_containing_return_addr)
    /* Return to caller.  */
    asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr);
  }
-
  \f
+/* Scan INSN just before assembler is output for it.
+   For Thumb-1, we track the status of the condition codes; this
+   information is used in the cbranchsi4_insn pattern.  */
  void
  thumb1_final_prescan_insn (rtx insn)
  {
    if (flag_print_asm_name)
      asm_fprintf (asm_out_file, "%@ 0x%04x\n",
                  INSN_ADDRESSES (INSN_UID (insn)));
+  /* Don't overwrite the previous setter when we get to a cbranch.  */
+  if (INSN_CODE (insn) != CODE_FOR_cbranchsi4_insn)
+    {
+      enum attr_conds conds;
+
+      if (cfun->machine->thumb1_cc_insn)
+       {
+         if (modified_in_p (cfun->machine->thumb1_cc_op0, insn)
+             || modified_in_p (cfun->machine->thumb1_cc_op1, insn))
+           CC_STATUS_INIT;
+       }
+      conds = get_attr_conds (insn);
+      if (conds == CONDS_SET)
+       {
+         rtx set = single_set (insn);
+         cfun->machine->thumb1_cc_insn = insn;
+         cfun->machine->thumb1_cc_op0 = SET_DEST (set);
+         cfun->machine->thumb1_cc_op1 = const0_rtx;
+         cfun->machine->thumb1_cc_mode = CC_NOOVmode;
+         if (INSN_CODE (insn) == CODE_FOR_thumb1_subsi3_insn)
+           {
+             rtx src1 = XEXP (SET_SRC (set), 1);
+             if (src1 == const0_rtx)
+               cfun->machine->thumb1_cc_mode = CCmode;
+           }
+       }
+      else if (conds != CONDS_NOCOND)
+       cfun->machine->thumb1_cc_insn = NULL_RTX;
+    }
  }
  
  int
@@ -19336,6 +20252,81 @@ is_called_in_ARM_mode (tree func)
  #endif
  }
  
+/* Given the stack offsets and register mask in OFFSETS, return how many
+   additional registers to push instead of subtracting a constant from SP,
+   or 0 if none.  For epilogues the principle is the same except we use
+   pop.  FOR_PROLOGUE indicates which we're generating.  */
+static int
+thumb1_extra_regs_pushed (arm_stack_offsets *offsets, bool for_prologue)
+{
+  HOST_WIDE_INT amount;
+  unsigned long live_regs_mask = offsets->saved_regs_mask;
+  /* Extract a mask of the ones we can give to the Thumb's push/pop
+     instruction: bits 0-7, plus bit 14 when generating a prologue.  */
+  unsigned long l_mask = live_regs_mask & (for_prologue ? 0x40ff : 0xff);
+  /* Then count how many high registers (bits 8-11) will need to be pushed.  */
+  unsigned long high_regs_pushed = bit_count (live_regs_mask & 0x0f00);
+  int n_free, reg_base;
+
+  if (!for_prologue && frame_pointer_needed)
+    amount = offsets->locals_base - offsets->saved_regs;
+  else
+    amount = offsets->outgoing_args - offsets->saved_regs;
+
+  /* If the stack frame size is 512 exactly, we can save one load
+     instruction, which should make this a win even when optimizing
+     for speed.  */
+  if (!optimize_size && amount != 512)
+    return 0;
+
+  /* Can't do this if there are high registers to push.  */
+  if (high_regs_pushed != 0)
+    return 0;
+
+  /* Shouldn't do it in the prologue if no registers would normally
+     be pushed at all.  In the epilogue, also allow it if we'll have
+     a pop insn for the PC.  */
+  if  (l_mask == 0
+       && (for_prologue
+          || TARGET_BACKTRACE
+          || (live_regs_mask & 1 << LR_REGNUM) == 0
+          || TARGET_INTERWORK
+          || crtl->args.pretend_args_size != 0))
+    return 0;
+
+  /* Don't do this if thumb1_expand_prologue wants to emit instructions
+     between the push and the stack frame allocation.  */
+  if (for_prologue
+      && ((flag_pic && arm_pic_register != INVALID_REGNUM)
+         || (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0)))
+    return 0;
+
+  reg_base = 0;
+  n_free = 0;
+  if (!for_prologue)
+    {
+      reg_base = arm_size_return_regs () / UNITS_PER_WORD;
+      live_regs_mask >>= reg_base;
+    }
+
+  while (reg_base + n_free < 8 && !(live_regs_mask & 1)
+        && (for_prologue || call_used_regs[reg_base + n_free]))
+    {
+      live_regs_mask >>= 1;
+      n_free++;
+    }
+
+  if (n_free == 0)
+    return 0;
+  gcc_assert (amount / 4 * 4 == amount);  /* AMOUNT is a multiple of 4.  */
+
+  if (amount >= 512 && (amount - n_free * 4) < 512)
+    return (amount - 508) / 4;
+  if (amount <= n_free * 4)
+    return amount / 4;
+  return 0;
+}
+
  /* The bits which aren't usefully expanded as rtl.  */
  const char *
  thumb_unexpanded_epilogue (void)
@@ -19344,6 +20335,7 @@ thumb_unexpanded_epilogue (void)
    int regno;
    unsigned long live_regs_mask = 0;
    int high_regs_pushed = 0;
+  int extra_pop;
    int had_to_push_lr;
    int size;
  
@@ -19363,6 +20355,13 @@ thumb_unexpanded_epilogue (void)
       the register is used to hold a return value.  */
    size = arm_size_return_regs ();
  
+  extra_pop = thumb1_extra_regs_pushed (offsets, false);
+  if (extra_pop > 0)
+    {
+      unsigned long extra_mask = (1 << extra_pop) - 1;
+      live_regs_mask |= extra_mask << (size / UNITS_PER_WORD);
+    }
+
    /* The prolog may have pushed some high registers to use as
       work registers.  e.g. the testsuite file:
       gcc/testsuite/gcc/gcc.c-torture/execute/complex-2.c
@@ -19446,7 +20445,9 @@ thumb_unexpanded_epilogue (void)
                        live_regs_mask);
  
        /* We have either just popped the return address into the
-        PC or it is was kept in LR for the entire function.  */
+        PC or it is was kept in LR for the entire function.
+        Note that thumb_pushpop has already called thumb_exit if the
+        PC was in the list.  */
        if (!had_to_push_lr)
         thumb_exit (asm_out_file, LR_REGNUM);
      }
@@ -19592,51 +20593,6 @@ thumb_compute_initial_elimination_offset (unsigned int from, unsigned int to)
      }
  }
  
-/* Given the stack offsets and register mask in OFFSETS, decide
-   how many additional registers to push instead of subtracting
-   a constant from SP.  */
-static int
-thumb1_extra_regs_pushed (arm_stack_offsets *offsets)
-{
-  HOST_WIDE_INT amount = offsets->outgoing_args - offsets->saved_regs;
-  unsigned long live_regs_mask = offsets->saved_regs_mask;
-  /* Extract a mask of the ones we can give to the Thumb's push instruction.  */
-  unsigned long l_mask = live_regs_mask & 0x40ff;
-  /* Then count how many other high registers will need to be pushed.  */
-  unsigned long high_regs_pushed = bit_count (live_regs_mask & 0x0f00);
-  int n_free;
-
-  /* If the stack frame size is 512 exactly, we can save one load
-     instruction, which should make this a win even when optimizing
-     for speed.  */
-  if (!optimize_size && amount != 512)
-    return 0;
-
-  /* Can't do this if there are high registers to push, or if we
-     are not going to do a push at all.  */
-  if (high_regs_pushed != 0 || l_mask == 0)
-    return 0;
-
-  /* Don't do this if thumb1_expand_prologue wants to emit instructions
-     between the push and the stack frame allocation.  */
-  if ((flag_pic && arm_pic_register != INVALID_REGNUM)
-      || (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0))
-    return 0;
-
-  for (n_free = 0; n_free < 8 && !(live_regs_mask & 1); live_regs_mask >>= 1)
-    n_free++;
-
-  if (n_free == 0)
-    return 0;
-  gcc_assert (amount / 4 * 4 == amount);
-
-  if (amount >= 512 && (amount - n_free * 4) < 512)
-    return (amount - 508) / 4;
-  if (amount <= n_free * 4)
-    return amount / 4;
-  return 0;
-}
-
  /* Generate the rest of a function's prologue.  */
  void
  thumb1_expand_prologue (void)
@@ -19673,7 +20629,7 @@ thumb1_expand_prologue (void)
                     stack_pointer_rtx);
  
    amount = offsets->outgoing_args - offsets->saved_regs;
-  amount -= 4 * thumb1_extra_regs_pushed (offsets);
+  amount -= 4 * thumb1_extra_regs_pushed (offsets, true);
    if (amount)
      {
        if (amount < 512)
@@ -19730,7 +20686,8 @@ thumb1_expand_prologue (void)
       using the EABI unwinder, to prevent faulting instructions from being
       swapped with a stack adjustment.  */
    if (crtl->profile || !TARGET_SCHED_PROLOG
-      || (ARM_EABI_UNWIND_TABLES && cfun->can_throw_non_call_exceptions))
+      || (arm_except_unwind_info (&global_options) == UI_TARGET
+         && cfun->can_throw_non_call_exceptions))
      emit_insn (gen_blockage ());
  
    cfun->machine->lr_save_eliminated = !thumb_force_lr_save ();
@@ -19758,6 +20715,7 @@ thumb1_expand_epilogue (void)
        emit_insn (gen_movsi (stack_pointer_rtx, hard_frame_pointer_rtx));
        amount = offsets->locals_base - offsets->saved_regs;
      }
+  amount -= 4 * thumb1_extra_regs_pushed (offsets, false);
  
    gcc_assert (amount >= 0);
    if (amount)
@@ -19842,7 +20800,7 @@ thumb1_output_function_prologue (FILE *f, HOST_WIDE_INT size ATTRIBUTE_UNUSED)
    if (crtl->args.pretend_args_size)
      {
        /* Output unwind directive for the stack adjustment.  */
-      if (ARM_EABI_UNWIND_TABLES)
+      if (arm_except_unwind_info (&global_options) == UI_TARGET)
         fprintf (f, "\t.pad #%d\n",
                  crtl->args.pretend_args_size);
  
@@ -19912,7 +20870,7 @@ thumb1_output_function_prologue (FILE *f, HOST_WIDE_INT size ATTRIBUTE_UNUSED)
  
        work_register = thumb_find_work_register (live_regs_mask);
  
-      if (ARM_EABI_UNWIND_TABLES)
+      if (arm_except_unwind_info (&global_options) == UI_TARGET)
         asm_fprintf (f, "\t.pad #16\n");
  
        asm_fprintf
@@ -19980,7 +20938,7 @@ thumb1_output_function_prologue (FILE *f, HOST_WIDE_INT size ATTRIBUTE_UNUSED)
            || (high_regs_pushed == 0 && l_mask))
      {
        unsigned long mask = l_mask;
-      mask |= (1 << thumb1_extra_regs_pushed (offsets)) - 1;
+      mask |= (1 << thumb1_extra_regs_pushed (offsets, true)) - 1;
        thumb_pushpop (f, mask, 1, &cfa_offset, mask);
      }
  
@@ -20754,6 +21712,38 @@ arm_no_early_store_addr_dep (rtx producer, rtx consumer)
    return !reg_overlap_mentioned_p (value, addr);
  }
  
+/* Return nonzero if CONSUMER (a store) needs PRODUCER's value to calculate
+   its address; the exact negation of arm_no_early_store_addr_dep.  */
+
+int
+arm_early_store_addr_dep (rtx producer, rtx consumer)
+{
+  return !arm_no_early_store_addr_dep (producer, consumer);
+}
+
+/* Return nonzero if the CONSUMER instruction (a load) does need
+   PRODUCER's value to calculate the address.  */
+
+int
+arm_early_load_addr_dep (rtx producer, rtx consumer)
+{
+  rtx value = PATTERN (producer);
+  rtx addr = PATTERN (consumer);
+
+  if (GET_CODE (value) == COND_EXEC)
+    value = COND_EXEC_CODE (value);
+  if (GET_CODE (value) == PARALLEL)
+    value = XVECEXP (value, 0, 0);
+  value = XEXP (value, 0);  /* Operand 0: presumably the SET destination.  */
+  if (GET_CODE (addr) == COND_EXEC)
+    addr = COND_EXEC_CODE (addr);
+  if (GET_CODE (addr) == PARALLEL)
+    addr = XVECEXP (addr, 0, 0);
+  addr = XEXP (addr, 1);  /* Operand 1: presumably the SET source.  */
+
+  return reg_overlap_mentioned_p (value, addr);
+}
+
  /* Return nonzero if the CONSUMER instruction (an ALU op) does not
     have an early register shift value or amount dependency on the
     result of PRODUCER.  */
@@ -21108,25 +22098,83 @@ thumb_set_return_address (rtx source, rtx scratch)
        else
         addr = plus_constant (addr, delta);
  
-      emit_move_insn (gen_frame_mem (Pmode, addr), source);
-    }
-  else
-    emit_move_insn (gen_rtx_REG (Pmode, LR_REGNUM), source);
+      emit_move_insn (gen_frame_mem (Pmode, addr), source);
+    }
+  else
+    emit_move_insn (gen_rtx_REG (Pmode, LR_REGNUM), source);
+}
+
+/* Implements target hook vector_mode_supported_p.  */
+bool
+arm_vector_mode_supported_p (enum machine_mode mode)
+{
+  /* Neon also supports V2SImode, etc. listed in the clause below.  */
+  if (TARGET_NEON && (mode == V2SFmode || mode == V4SImode || mode == V8HImode
+      || mode == V16QImode || mode == V4SFmode || mode == V2DImode))
+    return true;
+
+  if ((TARGET_NEON || TARGET_IWMMXT)
+      && ((mode == V2SImode)
+         || (mode == V4HImode)
+         || (mode == V8QImode)))
+    return true;
+
+  return false;
+}
+
+/* Use the option -mvectorize-with-neon-quad to override the use of doubleword
+   registers when autovectorizing for Neon, at least until multiple vector
+   widths are supported properly by the middle-end.  */
+
+static enum machine_mode
+arm_preferred_simd_mode (enum machine_mode mode)
+{
+  if (TARGET_NEON)
+    switch (mode)
+      {
+      case SFmode:
+       return TARGET_NEON_VECTORIZE_QUAD ? V4SFmode : V2SFmode;
+      case SImode:
+       return TARGET_NEON_VECTORIZE_QUAD ? V4SImode : V2SImode;
+      case HImode:
+       return TARGET_NEON_VECTORIZE_QUAD ? V8HImode : V4HImode;
+      case QImode:
+       return TARGET_NEON_VECTORIZE_QUAD ? V16QImode : V8QImode;
+      case DImode:
+       if (TARGET_NEON_VECTORIZE_QUAD)
+         return V2DImode;
+       break;
+
+      default:;
+      }
+
+  if (TARGET_REALLY_IWMMXT)
+    switch (mode)
+      {
+      case SImode:
+       return V2SImode;
+      case HImode:
+       return V4HImode;
+      case QImode:
+       return V8QImode;
+
+      default:;
+      }
+
+  return word_mode;
  }
  
-/* Implements target hook vector_mode_supported_p.  */
-bool
-arm_vector_mode_supported_p (enum machine_mode mode)
-{
-  /* Neon also supports V2SImode, etc. listed in the clause below.  */
-  if (TARGET_NEON && (mode == V2SFmode || mode == V4SImode || mode == V8HImode
-      || mode == V16QImode || mode == V4SFmode || mode == V2DImode))
-    return true;
+/* Implement TARGET_CLASS_LIKELY_SPILLED_P.
+ 
+   We need to define this for LO_REGS on thumb.  Otherwise we can end up
+   using r0-r4 for function arguments, r7 for the stack frame and don't
+   have enough left over to do doubleword arithmetic.  */
  
-  if ((TARGET_NEON || TARGET_IWMMXT)
-      && ((mode == V2SImode)
-         || (mode == V4HImode)
-         || (mode == V8QImode)))
+static bool
+arm_class_likely_spilled_p (reg_class_t rclass)
+{
+  if ((TARGET_THUMB && rclass == LO_REGS)
+      || rclass  == CC_REG)
      return true;
  
    return false;
@@ -21216,7 +22264,7 @@ arm_dwarf_register_span (rtx rtl)
    return p;
  }
  
-#ifdef TARGET_UNWIND_INFO
+#if ARM_UNWIND_INFO
  /* Emit unwind directives for a store-multiple instruction or stack pointer
     push during alignment.
     These should only ever be generated by the function prologue code, so
@@ -21430,7 +22478,7 @@ arm_unwind_emit (FILE * asm_out_file, rtx insn)
  {
    rtx pat;
  
-  if (!ARM_EABI_UNWIND_TABLES)
+  if (arm_except_unwind_info (&global_options) != UI_TARGET)
      return;
  
    if (!(flag_unwind_tables || crtl->uses_eh_lsda)
@@ -21480,7 +22528,52 @@ arm_output_ttype (rtx x)
  
    return TRUE;
  }
-#endif /* TARGET_UNWIND_INFO */
+
+/* Implement TARGET_ASM_EMIT_EXCEPT_PERSONALITY: emit ".personality SYM".  */
+
+static void
+arm_asm_emit_except_personality (rtx personality)
+{
+  fputs ("\t.personality\t", asm_out_file);
+  output_addr_const (asm_out_file, personality);
+  fputc ('\n', asm_out_file);
+}
+
+/* Implement TARGET_ASM_INITIALIZE_SECTIONS: set up exception_section.  */
+
+static void
+arm_asm_init_sections (void)
+{
+  exception_section = get_unnamed_section (0, output_section_asm_op,
+                                          "\t.handlerdata");
+}
+#endif /* ARM_UNWIND_INFO */
+
+/* Implement TARGET_EXCEPT_UNWIND_INFO.  */
+
+static enum unwind_info_type
+arm_except_unwind_info (struct gcc_options *opts)
+{
+  /* Honor the --enable-sjlj-exceptions configure switch.  */
+#ifdef CONFIG_SJLJ_EXCEPTIONS
+  if (CONFIG_SJLJ_EXCEPTIONS)
+    return UI_SJLJ;
+#endif
+
+  /* If the target provides ARM unwind info (ARM_UNWIND_INFO)... */
+  if (ARM_UNWIND_INFO)
+    {
+      /* For simplicity elsewhere in this file, indicate that all unwind
+        info is disabled if we're not emitting unwind tables.  */
+      if (!opts->x_flag_exceptions && !opts->x_flag_unwind_tables)
+       return UI_NONE;
+      else
+       return UI_TARGET;
+    }
+
+  /* ... otherwise we use sjlj exceptions for backwards compatibility.  */
+  return UI_SJLJ;
+}
  
  
  /* Handle UNSPEC DWARF call frame instructions.  These are needed for dynamic
@@ -21512,7 +22605,7 @@ arm_dwarf_handle_frame_unspec (const char *label, rtx pattern, int index)
  void
  arm_output_fn_unwind (FILE * f, bool prologue)
  {
-  if (!ARM_EABI_UNWIND_TABLES)
+  if (arm_except_unwind_info (&global_options) != UI_TARGET)
      return;
  
    if (prologue)
@@ -21592,7 +22685,9 @@ arm_output_dwarf_dtprel (FILE *file, int size, rtx x)
    fputs ("(tlsldo)", file);
  }
  
-bool
+/* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA.  */
+
+static bool
  arm_output_addr_const_extra (FILE *fp, rtx x)
  {
    if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
@@ -21732,6 +22827,7 @@ arm_issue_rate (void)
      {
      case cortexr4:
      case cortexr4f:
+    case cortexa5:
      case cortexa8:
      case cortexa9:
        return 2;
@@ -21849,17 +22945,6 @@ arm_order_regs_for_local_alloc (void)
              sizeof (thumb_core_reg_alloc_order));
  }
  
-/* Set default optimization options.  */
-void
-arm_optimization_options (int level, int size ATTRIBUTE_UNUSED)
-{
-  /* Enable section anchors by default at -O1 or higher.
-     Use 2 to distinguish from an explicit -fsection-anchors
-     given on the command line.  */
-  if (level > 0)
-    flag_section_anchors = 2;
-}
-
  /* Implement TARGET_FRAME_POINTER_REQUIRED.  */
  
  bool
@@ -21878,4 +22963,525 @@ arm_have_conditional_execution (void)
    return !TARGET_THUMB1;
  }
  
+/* Legitimize a memory reference for a sync primitive implemented using
+   ldrex / strex.  We currently force the form of the reference to be
+   indirect without offset.  We do not yet support the indirect offset
+   addressing supported by some ARM targets for these
+   instructions.  */
+static rtx
+arm_legitimize_sync_memory (rtx memory)
+{
+  rtx addr = force_reg (Pmode, XEXP (memory, 0));
+  rtx legitimate_memory = gen_rtx_MEM (GET_MODE (memory), addr);
+
+  set_mem_alias_set (legitimate_memory, ALIAS_SET_MEMORY_BARRIER);
+  MEM_VOLATILE_P (legitimate_memory) = MEM_VOLATILE_P (memory);
+  return legitimate_memory;
+}
+
+/* An instruction emitter; LABEL is nonzero when the pattern is a label.  */
+typedef void (* emit_f) (int label, const char *, rtx *);
+
+/* An instruction emitter that emits PATTERN via the conventional
+   output_asm_insn; LABEL is ignored.  */
+static void
+arm_emit (int label ATTRIBUTE_UNUSED, const char *pattern, rtx *operands)
+{
+  output_asm_insn (pattern, operands);
+}
+
+/* Count the number of emitted synchronization instructions.  */
+static unsigned arm_insn_count;
+
+/* An emitter that counts emitted instructions (in arm_insn_count) but
+   does not actually emit anything into the instruction stream.  */
+static void
+arm_count (int label,
+          const char *pattern ATTRIBUTE_UNUSED,
+          rtx *operands ATTRIBUTE_UNUSED)
+{
+  if (! label)  /* Labels are not instructions.  */
+    ++ arm_insn_count;
+}
+
+/* Format PATTERN printf-style and pass the result, with LABEL and
+   OPERANDS, to EMIT.  Provides a mechanism to construct the output
+   pattern on the fly.  The formatted pattern is limited to the size
+   of BUFFER; anything longer is truncated, never overflowed.  */
+static void ATTRIBUTE_PRINTF_4
+arm_output_asm_insn (emit_f emit, int label, rtx *operands,
+                    const char *pattern, ...)
+{
+  va_list ap;
+  char buffer[256];
+
+  va_start (ap, pattern);
+  vsnprintf (buffer, sizeof (buffer), pattern, ap);
+  va_end (ap);
+  emit (label, buffer, operands);
+}
+
+/* Emit the memory barrier instruction, if any, provided by this
+   target to a specified emitter.  */
+static void
+arm_process_output_memory_barrier (emit_f emit, rtx *operands)
+{
+  if (TARGET_HAVE_DMB)
+    {
+      /* Note we issue a system level barrier.  We should consider
+         issuing an inner shareability zone barrier here instead, i.e.
+         "DMB ISH".  */
+      emit (0, "dmb\tsy", operands);
+      return;
+    }
+
+  if (TARGET_HAVE_DMB_MCR)
+    {
+      emit (0, "mcr\tp15, 0, r0, c7, c10, 5", operands);
+      return;
+    }
+
+  gcc_unreachable ();  /* Neither barrier form is available.  */
+}
+
+/* Emit the memory barrier instruction, if any, provided by this
+   target.  Always returns the empty string.  */
+const char *
+arm_output_memory_barrier (rtx *operands)
+{
+  arm_process_output_memory_barrier (arm_emit, operands);
+  return "";
+}
+
+/* Helper to figure out the instruction suffix required on ldrex/strex
+   for operations on an object of MODE (QI, HI, SI or DI only).  */
+static const char *
+arm_ldrex_suffix (enum machine_mode mode)
+{
+  switch (mode)
+    {
+    case QImode: return "b";
+    case HImode: return "h";
+    case SImode: return "";
+    case DImode: return "d";
+    default:
+      gcc_unreachable ();
+    }
+  return "";  /* Not reached: every switch arm returns or aborts.  */
+}
+
+/* Emit an ldrex{b,h,d, } instruction appropriate for MODE that loads
+   MEMORY into TARGET.  */
+static void
+arm_output_ldrex (emit_f emit,
+                 enum machine_mode mode,
+                 rtx target,
+                 rtx memory)
+{
+  const char *suffix = arm_ldrex_suffix (mode);
+  rtx operands[2];
+
+  operands[0] = target;
+  operands[1] = memory;
+  arm_output_asm_insn (emit, 0, operands, "ldrex%s\t%%0, %%C1", suffix);
+}
+
+/* Emit a strex{b,h,d, }CC instruction appropriate for MODE: RESULT,
+   VALUE and MEMORY become operands 0, 1 and 2.  */
+static void
+arm_output_strex (emit_f emit,
+                 enum machine_mode mode,
+                 const char *cc,
+                 rtx result,
+                 rtx value,
+                 rtx memory)
+{
+  const char *suffix = arm_ldrex_suffix (mode);
+  rtx operands[3];
+
+  operands[0] = result;
+  operands[1] = value;
+  operands[2] = memory;
+  arm_output_asm_insn (emit, 0, operands, "strex%s%s\t%%0, %%1, %%C2", suffix,
+                      cc);
+}
+
+/* Helper to emit the two operand instruction "MNEMONIC D, S".  */
+static void
+arm_output_op2 (emit_f emit, const char *mnemonic, rtx d, rtx s)
+{
+  rtx operands[2];
+
+  operands[0] = d;
+  operands[1] = s;
+  arm_output_asm_insn (emit, 0, operands, "%s\t%%0, %%1", mnemonic);
+}
+
+/* Helper to emit the three operand instruction "MNEMONIC D, A, B".  */
+static void
+arm_output_op3 (emit_f emit, const char *mnemonic, rtx d, rtx a, rtx b)
+{
+  rtx operands[3];
+
+  operands[0] = d;
+  operands[1] = a;
+  operands[2] = b;
+  arm_output_asm_insn (emit, 0, operands, "%s\t%%0, %%1, %%2", mnemonic);
+}
+
+/* Emit a load store exclusive synchronization loop.
+
+   do
+     old_value = [mem]
+     if old_value != required_value
+       break;
+     t1 = sync_op (old_value, new_value)
+     [mem] = t1, t2 = [0|1]
+   while t2 != 0
+
+   Note:
+     t1 == t2 is not permitted
+     t1 == old_value is permitted
+
+   REQUIRED_VALUE is an RTX register or const_int giving the old_value
+   required for the modify to continue; if NULL no comparison is performed.
+   A barrier is emitted before the loop if EARLY_BARRIER_REQUIRED and
+   always after it (a failed comparison branches past that barrier).  */
+static void
+arm_output_sync_loop (emit_f emit,
+                     enum machine_mode mode,
+                     rtx old_value,
+                     rtx memory,
+                     rtx required_value,
+                     rtx new_value,
+                     rtx t1,
+                     rtx t2,
+                     enum attr_sync_op sync_op,
+                     int early_barrier_required)
+{
+  rtx operands[1];
+
+  gcc_assert (t1 != t2);
+
+  if (early_barrier_required)
+    arm_process_output_memory_barrier (emit, NULL);
+
+  arm_output_asm_insn (emit, 1, operands, "%sLSYT%%=:", LOCAL_LABEL_PREFIX);
+
+  arm_output_ldrex (emit, mode, old_value, memory);
+
+  if (required_value)
+    {
+      rtx operands[2];
+
+      operands[0] = old_value;
+      operands[1] = required_value;
+      arm_output_asm_insn (emit, 0, operands, "cmp\t%%0, %%1");
+      arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYB%%=", LOCAL_LABEL_PREFIX);
+    }
+
+  switch (sync_op)
+    {
+    case SYNC_OP_ADD:
+      arm_output_op3 (emit, "add", t1, old_value, new_value);
+      break;
+
+    case SYNC_OP_SUB:
+      arm_output_op3 (emit, "sub", t1, old_value, new_value);
+      break;
+
+    case SYNC_OP_IOR:
+      arm_output_op3 (emit, "orr", t1, old_value, new_value);
+      break;
+
+    case SYNC_OP_XOR:
+      arm_output_op3 (emit, "eor", t1, old_value, new_value);
+      break;
+
+    case SYNC_OP_AND:
+      arm_output_op3 (emit,"and", t1, old_value, new_value);
+      break;
+
+    case SYNC_OP_NAND:
+      arm_output_op3 (emit, "and", t1, old_value, new_value);
+      arm_output_op2 (emit, "mvn", t1, t1);
+      break;
+
+    case SYNC_OP_NONE:
+      t1 = new_value;  /* Store NEW_VALUE itself.  */
+      break;
+    }
+
+  arm_output_strex (emit, mode, "", t2, t1, memory);
+  operands[0] = t2;
+  arm_output_asm_insn (emit, 0, operands, "teq\t%%0, #0");
+  arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYT%%=", LOCAL_LABEL_PREFIX);
+
+  arm_process_output_memory_barrier (emit, NULL);
+  arm_output_asm_insn (emit, 1, operands, "%sLSYB%%=:", LOCAL_LABEL_PREFIX);
+}
+
+static rtx
+arm_get_sync_operand (rtx *operands, int index, rtx default_value)
+{
+  if (index > 0)  /* INDEX counts from 1; otherwise use DEFAULT_VALUE.  */
+    default_value = operands[index - 1];
+
+  return default_value;
+}
+
+/* Fetch the operand that INSN's sync_NAME attribute selects, else DEFAULT.  */
+#define FETCH_SYNC_OPERAND(NAME, DEFAULT) \
+  arm_get_sync_operand (operands, (int) get_attr_sync_##NAME (insn), DEFAULT)
+
+/* Extract the operands for a synchronization instruction from the
+   instruction's attributes and emit the instruction.  */
+static void
+arm_process_output_sync_insn (emit_f emit, rtx insn, rtx *operands)
+{
+  rtx result, memory, required_value, new_value, t1, t2;
+  int early_barrier;
+  enum machine_mode mode;
+  enum attr_sync_op sync_op;
+
+  result = FETCH_SYNC_OPERAND(result, 0);
+  memory = FETCH_SYNC_OPERAND(memory, 0);
+  required_value = FETCH_SYNC_OPERAND(required_value, 0);
+  new_value = FETCH_SYNC_OPERAND(new_value, 0);
+  t1 = FETCH_SYNC_OPERAND(t1, 0);
+  t2 = FETCH_SYNC_OPERAND(t2, 0);
+  early_barrier =
+    get_attr_sync_release_barrier (insn) == SYNC_RELEASE_BARRIER_YES;
+  sync_op = get_attr_sync_op (insn);
+  mode = GET_MODE (memory);
+  arm_output_sync_loop (emit, mode, result, memory, required_value,
+                       new_value, t1, t2, sync_op, early_barrier);
+}
+
+/* Emit the synchronization loop for INSN; always returns "".  */
+const char *
+arm_output_sync_insn (rtx insn, rtx *operands)
+{
+  arm_process_output_sync_insn (arm_emit, insn, operands);
+  return "";
+}
+
+/* Count the number of machine instructions that will be emitted for a
+   synchronization instruction.  Note that the emitter used does not
+   emit instructions, it just counts instructions being careful not
+   to count labels.  */
+unsigned int
+arm_sync_loop_insns (rtx insn, rtx *operands)
+{
+  arm_insn_count = 0;
+  arm_process_output_sync_insn (arm_count, insn, operands);
+  return arm_insn_count;
+}
+
+/* Helper to call a target sync instruction generator, dealing with
+   the variation in operands required by the different generators.  */
+static rtx
+arm_call_generator (struct arm_sync_generator *generator, rtx old_value,
+                   rtx memory, rtx required_value, rtx new_value)
+{
+  switch (generator->op)
+    {
+    case arm_sync_generator_omn:
+      gcc_assert (! required_value);
+      return generator->u.omn (old_value, memory, new_value);
+
+    case arm_sync_generator_omrn:
+      gcc_assert (required_value);
+      return generator->u.omrn (old_value, memory, required_value, new_value);
+    }
+
+  return NULL;  /* GENERATOR->op matched neither known kind.  */
+}
+
+/* Expand a synchronization loop.  The synchronization loop is expanded
+   as an opaque block of instructions in order to ensure that we do
+   not subsequently get extraneous memory accesses inserted within the
+   critical region.  The exclusive access property of ldrex/strex is
+   only guaranteed if there are no intervening memory accesses.  */
+void
+arm_expand_sync (enum machine_mode mode,
+                struct arm_sync_generator *generator,
+                rtx target, rtx memory, rtx required_value, rtx new_value)
+{
+  if (target == NULL)
+    target = gen_reg_rtx (mode);
+
+  memory = arm_legitimize_sync_memory (memory);
+  if (mode != SImode)
+    {
+      rtx load_temp = gen_reg_rtx (SImode);
+
+      if (required_value)
+       required_value = convert_modes (SImode, mode, required_value, true);
+
+      new_value = convert_modes (SImode, mode, new_value, true);
+      emit_insn (arm_call_generator (generator, load_temp, memory,
+                                    required_value, new_value));
+      emit_move_insn (target, gen_lowpart (mode, load_temp));
+    }
+  else
+    {
+      emit_insn (arm_call_generator (generator, target, memory, required_value,
+                                    new_value));
+    }
+}
+
+static bool
+arm_vector_alignment_reachable (const_tree type, bool is_packed)
+{
+  /* Little-endian Neon: unpacked vectors are never less aligned than their
+     element type's natural alignment, so reaching alignment is safe.  */
+  if (TARGET_NEON && !BYTES_BIG_ENDIAN)
+    return !is_packed;
+
+  return default_builtin_vector_alignment_reachable (type, is_packed);
+}
+
+static bool
+arm_builtin_support_vector_misalignment (enum machine_mode mode,
+                                        const_tree type, int misalignment,
+                                        bool is_packed)
+{
+  if (TARGET_NEON && !BYTES_BIG_ENDIAN)
+    {
+      HOST_WIDE_INT align = TYPE_ALIGN_UNIT (type);
+
+      if (is_packed)  /* Packed: OK only if ALIGN is 1.  */
+        return align == 1;
+
+      /* If the misalignment is unknown, we should be able to handle the access
+        so long as it is not to a member of a packed data structure.  */
+      if (misalignment == -1)
+        return true;
+
+      /* Return true if the misalignment is a multiple of the natural alignment
+         of the vector's element type.  This is probably always going to be
+        true in practice, since we've already established that this isn't a
+        packed access.  */
+      return ((misalignment % align) == 0);
+    }
+  
+  return default_builtin_support_vector_misalignment (mode, type, misalignment,
+                                                     is_packed);
+}
+
+/* Adjust the fixed_regs, call_used_regs and global_regs tables to match
+   the selected target options.  The steps are applied in the order
+   written; keep that order, since a later step may overwrite entries
+   written by an earlier one.  */
+static void
+arm_conditional_register_usage (void)
+{
+  int regno;
+
+  /* Make the FPA registers unavailable when soft-float, Thumb-1 or a
+     non-FPA configuration is selected.  */
+  if (TARGET_SOFT_FLOAT || TARGET_THUMB1 || !TARGET_FPA)
+    {
+      for (regno = FIRST_FPA_REGNUM;
+           regno <= LAST_FPA_REGNUM; ++regno)
+        fixed_regs[regno] = call_used_regs[regno] = 1;
+    }
+
+  if (TARGET_THUMB1 && optimize_size)
+    {
+      /* When optimizing for size on Thumb-1, it's better not
+         to use the HI regs, because of the overhead of
+         stacking them.  */
+      for (regno = FIRST_HI_REGNUM;
+           regno <= LAST_HI_REGNUM; ++regno)
+        fixed_regs[regno] = call_used_regs[regno] = 1;
+    }
+
+  /* The link register can be clobbered by any branch insn,
+     but we have no way to track that at present, so mark
+     it as unavailable.  */
+  if (TARGET_THUMB1)
+    fixed_regs[LR_REGNUM] = call_used_regs[LR_REGNUM] = 1;
+
+  if (TARGET_32BIT && TARGET_HARD_FLOAT)
+    {
+      if (TARGET_MAVERICK)
+        {
+          /* With Maverick the FPA registers are fixed and the Cirrus
+             registers are enabled; the first four Cirrus registers are
+             call-clobbered, the remainder call-saved.  */
+          for (regno = FIRST_FPA_REGNUM;
+               regno <= LAST_FPA_REGNUM; ++ regno)
+            fixed_regs[regno] = call_used_regs[regno] = 1;
+          for (regno = FIRST_CIRRUS_FP_REGNUM;
+               regno <= LAST_CIRRUS_FP_REGNUM; ++ regno)
+            {
+              fixed_regs[regno] = 0;
+              call_used_regs[regno] = regno < FIRST_CIRRUS_FP_REGNUM + 4;
+            }
+        }
+      if (TARGET_VFP)
+        {
+          /* VFPv3 registers are disabled when earlier VFP
+             versions are selected due to the definition of
+             LAST_VFP_REGNUM.  */
+          for (regno = FIRST_VFP_REGNUM;
+               regno <= LAST_VFP_REGNUM; ++ regno)
+            {
+              fixed_regs[regno] = 0;
+              /* Register numbers at offsets 0-15 and 32 upwards from
+                 FIRST_VFP_REGNUM are call-clobbered; offsets 16-31 are
+                 call-saved.  */
+              call_used_regs[regno] = regno < FIRST_VFP_REGNUM + 16
+                || regno >= FIRST_VFP_REGNUM + 32;
+            }
+        }
+    }
+
+  if (TARGET_REALLY_IWMMXT)
+    {
+      /* The 2002/10/09 revision of the XScale ABI has wCG0
+         and wCG1 as call-preserved registers.  The 2002/11/21
+         revision changed this so that all wCG registers are
+         scratch registers.  */
+      for (regno = FIRST_IWMMXT_GR_REGNUM;
+           regno <= LAST_IWMMXT_GR_REGNUM; ++ regno)
+        fixed_regs[regno] = 0;
+      /* The XScale ABI has wR0 - wR9 as scratch registers,
+         the rest as call-preserved registers.  */
+      for (regno = FIRST_IWMMXT_REGNUM;
+           regno <= LAST_IWMMXT_REGNUM; ++ regno)
+        {
+          fixed_regs[regno] = 0;
+          call_used_regs[regno] = regno < FIRST_IWMMXT_REGNUM + 10;
+        }
+    }
+
+  /* Reserve the PIC offset table register when one is in use;
+     otherwise, with TARGET_APCS_STACK, reserve r10 instead
+     (presumably the APCS stack-limit register -- confirm).  */
+  if ((unsigned) PIC_OFFSET_TABLE_REGNUM != INVALID_REGNUM)
+    {
+      fixed_regs[PIC_OFFSET_TABLE_REGNUM] = 1;
+      call_used_regs[PIC_OFFSET_TABLE_REGNUM] = 1;
+    }
+  else if (TARGET_APCS_STACK)
+    {
+      fixed_regs[10]     = 1;
+      call_used_regs[10] = 1;
+    }
+
+  /* Reserve the hard frame pointer for the APCS/TPCS frame options and
+     for caller interworking.  */
+  if (TARGET_APCS_FRAME || TARGET_CALLER_INTERWORKING
+      || TARGET_TPCS_FRAME || TARGET_TPCS_LEAF_FRAME)
+    {
+      fixed_regs[ARM_HARD_FRAME_POINTER_REGNUM] = 1;
+      call_used_regs[ARM_HARD_FRAME_POINTER_REGNUM] = 1;
+      /* -mcaller-super-interworking reserves r11 for calls to
+         _interwork_r11_call_via_rN().  Making the register global
+         is an easy way of ensuring that it remains valid for all
+         calls.  */
+      if (TARGET_CALLER_INTERWORKING)
+        global_regs[ARM_HARD_FRAME_POINTER_REGNUM] = 1;
+    }
+  /* Subtarget-specific adjustments.  The macro is used without a
+     trailing semicolon, so it presumably expands to complete
+     statements (or to nothing).  */
+  SUBTARGET_CONDITIONAL_REGISTER_USAGE
+}
+
+/* Return the register class that the register-rename pass should prefer
+   when renaming a register of class RCLASS.
+
+   Thumb-2 instructions using LO_REGS may be smaller than instructions
+   using GENERAL_REGS, so preferring LO_REGS during the register rename
+   pass can reduce code size.  NO_REGS is returned in every other case
+   (presumably meaning "no preference" -- confirm against the hook's
+   documentation).  */
+static reg_class_t
+arm_preferred_rename_class (reg_class_t rclass)
+{
+  if (TARGET_THUMB2 && rclass == GENERAL_REGS)
+    return LO_REGS;
+  else
+    return NO_REGS;
+}
+
  #include "gt-arm.h"