PR target/51968

[pf3gnuchains/gcc-fork.git] / gcc / config / arm / arm.c
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c

index 1037d9d..4a94145 100644 (file)
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -1,6 +1,6 @@
  /* Output routines for GCC for ARM.
     Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
-   2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
+   2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
     Free Software Foundation, Inc.
     Contributed by Pieter `Tiggr' Schoenmakers (rcpieter@win.tue.nl)
     and Martin Simmons (@harleqn.co.uk).
@@ -64,6 +64,11 @@ typedef struct minipool_fixup   Mfix;
  
  void (*arm_lang_output_object_attributes_hook)(void);
  
+struct four_ints
+{
+  int i[4];  /* One immediate per insn of a sequence; at most four.  */
+};
+
  /* Forward function declarations.  */
  static bool arm_needs_doubleword_align (enum machine_mode, const_tree);
  static int arm_compute_static_chain_stack_bytes (void);
@@ -82,7 +87,6 @@ inline static int thumb1_index_register_rtx_p (rtx, int);
  static bool arm_legitimate_address_p (enum machine_mode, rtx, bool);
  static int thumb_far_jump_used_p (void);
  static bool thumb_force_lr_save (void);
-static int const_ok_for_op (HOST_WIDE_INT, enum rtx_code);
  static rtx emit_sfm (int, int);
  static unsigned arm_size_return_regs (void);
  static bool arm_assemble_integer (rtx, unsigned int, int);
@@ -126,11 +130,16 @@ static tree arm_handle_notshared_attribute (tree *, tree, tree, int, bool *);
  #endif
  static void arm_output_function_epilogue (FILE *, HOST_WIDE_INT);
  static void arm_output_function_prologue (FILE *, HOST_WIDE_INT);
-static void thumb1_output_function_prologue (FILE *, HOST_WIDE_INT);
  static int arm_comp_type_attributes (const_tree, const_tree);
  static void arm_set_default_type_attributes (tree);
  static int arm_adjust_cost (rtx, rtx, rtx, int);
-static int count_insns_for_constant (HOST_WIDE_INT, int);
+static int optimal_immediate_sequence (enum rtx_code code,
+                                      unsigned HOST_WIDE_INT val,
+                                      struct four_ints *return_sequence);
+static int optimal_immediate_sequence_1 (enum rtx_code code,
+                                        unsigned HOST_WIDE_INT val,
+                                        struct four_ints *return_sequence,
+                                        int i);
  static int arm_get_strip_length (int);
  static bool arm_function_ok_for_sibcall (tree, tree);
  static enum machine_mode arm_promote_function_mode (const_tree,
@@ -138,8 +147,9 @@ static enum machine_mode arm_promote_function_mode (const_tree,
                                                     const_tree, int);
  static bool arm_return_in_memory (const_tree, const_tree);
  static rtx arm_function_value (const_tree, const_tree, bool);
+static rtx arm_libcall_value_1 (enum machine_mode);
  static rtx arm_libcall_value (enum machine_mode, const_rtx);
-
+static bool arm_function_value_regno_p (const unsigned int);
  static void arm_internal_label (FILE *, const char *, unsigned long);
  static void arm_output_mi_thunk (FILE *, tree, HOST_WIDE_INT, HOST_WIDE_INT,
                                  tree);
@@ -152,8 +162,10 @@ static bool arm_slowmul_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, boo
  static bool arm_fastmul_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
  static bool arm_xscale_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
  static bool arm_9e_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
-static bool arm_rtx_costs (rtx, int, int, int *, bool);
+static bool arm_rtx_costs (rtx, int, int, int, int *, bool);
  static int arm_address_cost (rtx, bool);
+static int arm_register_move_cost (enum machine_mode, reg_class_t, reg_class_t);
+static int arm_memory_move_cost (enum machine_mode, reg_class_t, bool);
  static bool arm_memory_load_p (rtx);
  static bool arm_cirrus_insn_p (rtx);
  static void cirrus_reorg (rtx);
@@ -166,15 +178,16 @@ static rtx arm_expand_builtin (tree, rtx, rtx, enum machine_mode, int);
  static tree arm_builtin_decl (unsigned, bool);
  static void emit_constant_insn (rtx cond, rtx pattern);
  static rtx emit_set_insn (rtx, rtx);
-static int arm_arg_partial_bytes (CUMULATIVE_ARGS *, enum machine_mode,
+static int arm_arg_partial_bytes (cumulative_args_t, enum machine_mode,
                                   tree, bool);
-static rtx arm_function_arg (CUMULATIVE_ARGS *, enum machine_mode,
+static rtx arm_function_arg (cumulative_args_t, enum machine_mode,
                              const_tree, bool);
-static void arm_function_arg_advance (CUMULATIVE_ARGS *, enum machine_mode,
+static void arm_function_arg_advance (cumulative_args_t, enum machine_mode,
                                       const_tree, bool);
  static unsigned int arm_function_arg_boundary (enum machine_mode, const_tree);
  static rtx aapcs_allocate_return_reg (enum machine_mode, const_tree,
                                       const_tree);
+static rtx aapcs_libcall_value (enum machine_mode);
  static int aapcs_select_return_coproc (const_tree, const_tree);
  
  #ifdef OBJECT_FORMAT_ELF
@@ -188,9 +201,9 @@ static void arm_encode_section_info (tree, rtx, int);
  static void arm_file_end (void);
  static void arm_file_start (void);
  
-static void arm_setup_incoming_varargs (CUMULATIVE_ARGS *, enum machine_mode,
+static void arm_setup_incoming_varargs (cumulative_args_t, enum machine_mode,
                                         tree, int *, int);
-static bool arm_pass_by_reference (CUMULATIVE_ARGS *,
+static bool arm_pass_by_reference (cumulative_args_t,
                                    enum machine_mode, const_tree, bool);
  static bool arm_promote_prototypes (const_tree);
  static bool arm_default_short_enums (void);
@@ -204,8 +217,6 @@ static bool arm_output_ttype (rtx);
  static void arm_asm_emit_except_personality (rtx);
  static void arm_asm_init_sections (void);
  #endif
-static enum unwind_info_type arm_except_unwind_info (struct gcc_options *);
-static void arm_dwarf_handle_frame_unspec (const char *, rtx, int);
  static rtx arm_dwarf_register_span (rtx);
  
  static tree arm_cxx_guard_type (void);
@@ -258,6 +269,9 @@ static unsigned int arm_autovectorize_vector_sizes (void);
  static int arm_default_branch_cost (bool, bool);
  static int arm_cortex_a5_branch_cost (bool, bool);
  
+static bool arm_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+                                            const unsigned char *sel);
+
  \f
  /* Table of machine attributes.  */
  static const struct attribute_spec arm_attribute_table[] =
@@ -303,15 +317,6 @@ static const struct attribute_spec arm_attribute_table[] =
  #endif
    { NULL,           0, 0, false, false, false, NULL, false }
  };
-
-/* Set default optimization options.  */
-static const struct default_options arm_option_optimization_table[] =
-  {
-    /* Enable section anchors by default at -O1 or higher.  */
-    { OPT_LEVELS_1_PLUS, OPT_fsection_anchors, NULL, 1 },
-    { OPT_LEVELS_1_PLUS, OPT_fomit_frame_pointer, NULL, 1 },
-    { OPT_LEVELS_NONE, 0, NULL, 0 }
-  };
  \f
  /* Initialize the GCC target structure.  */
  #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
@@ -351,12 +356,8 @@ static const struct default_options arm_option_optimization_table[] =
  #undef  TARGET_ASM_FUNCTION_EPILOGUE
  #define TARGET_ASM_FUNCTION_EPILOGUE arm_output_function_epilogue
  
-#undef  TARGET_DEFAULT_TARGET_FLAGS
-#define TARGET_DEFAULT_TARGET_FLAGS (TARGET_DEFAULT | MASK_SCHED_PROLOG)
  #undef  TARGET_OPTION_OVERRIDE
  #define TARGET_OPTION_OVERRIDE arm_option_override
-#undef  TARGET_OPTION_OPTIMIZATION_TABLE
-#define TARGET_OPTION_OPTIMIZATION_TABLE arm_option_optimization_table
  
  #undef  TARGET_COMP_TYPE_ATTRIBUTES
  #define TARGET_COMP_TYPE_ATTRIBUTES arm_comp_type_attributes
@@ -367,6 +368,12 @@ static const struct default_options arm_option_optimization_table[] =
  #undef  TARGET_SCHED_ADJUST_COST
  #define TARGET_SCHED_ADJUST_COST arm_adjust_cost
  
+#undef TARGET_REGISTER_MOVE_COST
+#define TARGET_REGISTER_MOVE_COST arm_register_move_cost
+
+#undef TARGET_MEMORY_MOVE_COST
+#define TARGET_MEMORY_MOVE_COST arm_memory_move_cost
+
  #undef TARGET_ENCODE_SECTION_INFO
  #ifdef ARM_PE
  #define TARGET_ENCODE_SECTION_INFO  arm_pe_encode_section_info
@@ -389,6 +396,9 @@ static const struct default_options arm_option_optimization_table[] =
  #undef  TARGET_LIBCALL_VALUE
  #define TARGET_LIBCALL_VALUE arm_libcall_value
  
+#undef TARGET_FUNCTION_VALUE_REGNO_P
+#define TARGET_FUNCTION_VALUE_REGNO_P arm_function_value_regno_p
+
  #undef  TARGET_ASM_OUTPUT_MI_THUNK
  #define TARGET_ASM_OUTPUT_MI_THUNK arm_output_mi_thunk
  #undef  TARGET_ASM_CAN_OUTPUT_MI_THUNK
@@ -516,12 +526,6 @@ static const struct default_options arm_option_optimization_table[] =
  #define TARGET_ASM_INIT_SECTIONS arm_asm_init_sections
  #endif /* ARM_UNWIND_INFO */
  
-#undef TARGET_EXCEPT_UNWIND_INFO
-#define TARGET_EXCEPT_UNWIND_INFO  arm_except_unwind_info
-
-#undef TARGET_DWARF_HANDLE_FRAME_UNSPEC
-#define TARGET_DWARF_HANDLE_FRAME_UNSPEC arm_dwarf_handle_frame_unspec
-
  #undef TARGET_DWARF_REGISTER_SPAN
  #define TARGET_DWARF_REGISTER_SPAN arm_dwarf_register_span
  
@@ -611,6 +615,10 @@ static const struct default_options arm_option_optimization_table[] =
  #define TARGET_PREFERRED_RENAME_CLASS \
    arm_preferred_rename_class
  
+#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
+#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
+  arm_vectorize_vec_perm_const_ok
+
  struct gcc_target targetm = TARGET_INITIALIZER;
  \f
  /* Obstack for minipool constant handling.  */
@@ -960,6 +968,17 @@ const struct tune_params arm_cortex_a9_tune =
    arm_default_branch_cost
  };
  
+const struct tune_params arm_cortex_a15_tune =
+{
+  arm_9e_rtx_costs,
+  NULL,
+  1,                                           /* Constant limit.  */
+  1,                                           /* Max cond insns.  */
+  ARM_PREFETCH_NOT_BENEFICIAL,                 /* TODO: Calculate correct values.  */
+  false,                                       /* Prefer constant pool.  */
+  arm_cortex_a5_branch_cost                    /* NOTE(review): reuses the A5 hook -- confirm.  */
+};
+
  const struct tune_params arm_fa726te_tune =
  {
    arm_9e_rtx_costs,
@@ -1026,7 +1045,8 @@ enum tls_reloc {
    TLS_LDM32,
    TLS_LDO32,
    TLS_IE32,
-  TLS_LE32
+  TLS_LE32,
+  TLS_DESCSEQ  /* GNU scheme */
  };
  
  /* The maximum number of insns to be used when loading a constant.  */
@@ -1059,11 +1079,58 @@ bit_count (unsigned long value)
    return count;
  }
  
+typedef struct
+{
+  enum machine_mode mode;
+  const char *name;  /* Spelling of MODE inside libfunc names (qq for QQmode).  */
+} arm_fixed_mode_set;
+
+/* Set OPTABLE's MODE libfunc to __gnu_<FUNCNAME><MODENAME>[<NUM_SUFFIX>].  */
+
+static void
+arm_set_fixed_optab_libfunc (optab optable, enum machine_mode mode,
+                            const char *funcname, const char *modename,
+                            int num_suffix)
+{
+  char buffer[50];  /* Unchecked below: the composed name must fit.  */
+
+  if (num_suffix == 0)  /* Zero: omit the numeric suffix.  */
+    sprintf (buffer, "__gnu_%s%s", funcname, modename);
+  else
+    sprintf (buffer, "__gnu_%s%s%d", funcname, modename, num_suffix);
+
+  set_optab_libfunc (optable, mode, buffer);
+}
+
+static void
+arm_set_fixed_conv_libfunc (convert_optab optable, enum machine_mode to,
+                           enum machine_mode from, const char *funcname,
+                           const char *toname, const char *fromname)
+{
+  char buffer[50];  /* Unchecked below: the composed name must fit.  */
+  const char *maybe_suffix_2 = "";  /* Stays empty unless the test below holds.  */
+
+  /* Follow the logic for selecting a "2" suffix in fixed-bit.h.  */
+  if (ALL_FIXED_POINT_MODE_P (from) && ALL_FIXED_POINT_MODE_P (to)
+      && UNSIGNED_FIXED_POINT_MODE_P (from) == UNSIGNED_FIXED_POINT_MODE_P (to)
+      && ALL_FRACT_MODE_P (from) == ALL_FRACT_MODE_P (to))
+    maybe_suffix_2 = "2";
+
+  /* Name order is FUNCNAME, then FROMNAME, then TONAME, then the suffix.  */
+  sprintf (buffer, "__gnu_%s%s%s%s", funcname, fromname, toname,
+          maybe_suffix_2);
+
+  set_conv_libfunc (optable, to, from, buffer);
+}
+
  /* Set up library functions unique to ARM.  */
  
  static void
  arm_init_libfuncs (void)
  {
+  /* For Linux, we have access to kernel support for atomic operations.  */
+  if (arm_abi == ARM_ABI_AAPCS_LINUX)
+    init_sync_libfuncs (2 * UNITS_PER_WORD);
+
    /* There are no special library functions unless we are using the
       ARM BPABI.  */
    if (!TARGET_BPABI)
@@ -1178,11 +1245,11 @@ arm_init_libfuncs (void)
                         (arm_fp16_format == ARM_FP16_FORMAT_IEEE
                          ? "__gnu_f2h_ieee"
                          : "__gnu_f2h_alternative"));
-      set_conv_libfunc (sext_optab, SFmode, HFmode, 
+      set_conv_libfunc (sext_optab, SFmode, HFmode,
                         (arm_fp16_format == ARM_FP16_FORMAT_IEEE
                          ? "__gnu_h2f_ieee"
                          : "__gnu_h2f_alternative"));
-      
+
        /* Arithmetic.  */
        set_optab_libfunc (add_optab, HFmode, NULL);
        set_optab_libfunc (sdiv_optab, HFmode, NULL);
@@ -1204,6 +1271,137 @@ arm_init_libfuncs (void)
        break;
      }
  
+  /* Use names prefixed with __gnu_ for fixed-point helper functions.  */
+  {
+    const arm_fixed_mode_set fixed_arith_modes[] =
+      {
+       { QQmode, "qq" },
+       { UQQmode, "uqq" },
+       { HQmode, "hq" },
+       { UHQmode, "uhq" },
+       { SQmode, "sq" },
+       { USQmode, "usq" },
+       { DQmode, "dq" },
+       { UDQmode, "udq" },
+       { TQmode, "tq" },
+       { UTQmode, "utq" },
+       { HAmode, "ha" },
+       { UHAmode, "uha" },
+       { SAmode, "sa" },
+       { USAmode, "usa" },
+       { DAmode, "da" },
+       { UDAmode, "uda" },
+       { TAmode, "ta" },
+       { UTAmode, "uta" }
+      };
+    const arm_fixed_mode_set fixed_conv_modes[] =
+      {
+       { QQmode, "qq" },
+       { UQQmode, "uqq" },
+       { HQmode, "hq" },
+       { UHQmode, "uhq" },
+       { SQmode, "sq" },
+       { USQmode, "usq" },
+       { DQmode, "dq" },
+       { UDQmode, "udq" },
+       { TQmode, "tq" },
+       { UTQmode, "utq" },
+       { HAmode, "ha" },
+       { UHAmode, "uha" },
+       { SAmode, "sa" },
+       { USAmode, "usa" },
+       { DAmode, "da" },
+       { UDAmode, "uda" },
+       { TAmode, "ta" },
+       { UTAmode, "uta" },
+       { QImode, "qi" },
+       { HImode, "hi" },
+       { SImode, "si" },
+       { DImode, "di" },
+       { TImode, "ti" },
+       { SFmode, "sf" },
+       { DFmode, "df" }
+      };
+    unsigned int i, j;
+
+    for (i = 0; i < ARRAY_SIZE (fixed_arith_modes); i++)
+      {
+       arm_set_fixed_optab_libfunc (add_optab, fixed_arith_modes[i].mode,
+                                    "add", fixed_arith_modes[i].name, 3);
+       arm_set_fixed_optab_libfunc (ssadd_optab, fixed_arith_modes[i].mode,
+                                    "ssadd", fixed_arith_modes[i].name, 3);
+       arm_set_fixed_optab_libfunc (usadd_optab, fixed_arith_modes[i].mode,
+                                    "usadd", fixed_arith_modes[i].name, 3);
+       arm_set_fixed_optab_libfunc (sub_optab, fixed_arith_modes[i].mode,
+                                    "sub", fixed_arith_modes[i].name, 3);
+       arm_set_fixed_optab_libfunc (sssub_optab, fixed_arith_modes[i].mode,
+                                    "sssub", fixed_arith_modes[i].name, 3);
+       arm_set_fixed_optab_libfunc (ussub_optab, fixed_arith_modes[i].mode,
+                                    "ussub", fixed_arith_modes[i].name, 3);
+       arm_set_fixed_optab_libfunc (smul_optab, fixed_arith_modes[i].mode,
+                                    "mul", fixed_arith_modes[i].name, 3);
+       arm_set_fixed_optab_libfunc (ssmul_optab, fixed_arith_modes[i].mode,
+                                    "ssmul", fixed_arith_modes[i].name, 3);
+       arm_set_fixed_optab_libfunc (usmul_optab, fixed_arith_modes[i].mode,
+                                    "usmul", fixed_arith_modes[i].name, 3);
+       arm_set_fixed_optab_libfunc (sdiv_optab, fixed_arith_modes[i].mode,
+                                    "div", fixed_arith_modes[i].name, 3);
+       arm_set_fixed_optab_libfunc (udiv_optab, fixed_arith_modes[i].mode,
+                                    "udiv", fixed_arith_modes[i].name, 3);
+       arm_set_fixed_optab_libfunc (ssdiv_optab, fixed_arith_modes[i].mode,
+                                    "ssdiv", fixed_arith_modes[i].name, 3);
+       arm_set_fixed_optab_libfunc (usdiv_optab, fixed_arith_modes[i].mode,
+                                    "usdiv", fixed_arith_modes[i].name, 3);
+       arm_set_fixed_optab_libfunc (neg_optab, fixed_arith_modes[i].mode,
+                                    "neg", fixed_arith_modes[i].name, 2);
+       arm_set_fixed_optab_libfunc (ssneg_optab, fixed_arith_modes[i].mode,
+                                    "ssneg", fixed_arith_modes[i].name, 2);
+       arm_set_fixed_optab_libfunc (usneg_optab, fixed_arith_modes[i].mode,
+                                    "usneg", fixed_arith_modes[i].name, 2);
+       arm_set_fixed_optab_libfunc (ashl_optab, fixed_arith_modes[i].mode,
+                                    "ashl", fixed_arith_modes[i].name, 3);
+       arm_set_fixed_optab_libfunc (ashr_optab, fixed_arith_modes[i].mode,
+                                    "ashr", fixed_arith_modes[i].name, 3);
+       arm_set_fixed_optab_libfunc (lshr_optab, fixed_arith_modes[i].mode,
+                                    "lshr", fixed_arith_modes[i].name, 3);
+       arm_set_fixed_optab_libfunc (ssashl_optab, fixed_arith_modes[i].mode,
+                                    "ssashl", fixed_arith_modes[i].name, 3);
+       arm_set_fixed_optab_libfunc (usashl_optab, fixed_arith_modes[i].mode,
+                                    "usashl", fixed_arith_modes[i].name, 3);
+       arm_set_fixed_optab_libfunc (cmp_optab, fixed_arith_modes[i].mode,
+                                    "cmp", fixed_arith_modes[i].name, 2);
+      }
+
+    for (i = 0; i < ARRAY_SIZE (fixed_conv_modes); i++)
+      for (j = 0; j < ARRAY_SIZE (fixed_conv_modes); j++)
+       {
+         if (i == j
+             || (!ALL_FIXED_POINT_MODE_P (fixed_conv_modes[i].mode)
+                 && !ALL_FIXED_POINT_MODE_P (fixed_conv_modes[j].mode)))
+           continue;
+
+         arm_set_fixed_conv_libfunc (fract_optab, fixed_conv_modes[i].mode,
+                                     fixed_conv_modes[j].mode, "fract",
+                                     fixed_conv_modes[i].name,
+                                     fixed_conv_modes[j].name);
+         arm_set_fixed_conv_libfunc (satfract_optab,
+                                     fixed_conv_modes[i].mode,
+                                     fixed_conv_modes[j].mode, "satfract",
+                                     fixed_conv_modes[i].name,
+                                     fixed_conv_modes[j].name);
+         arm_set_fixed_conv_libfunc (fractuns_optab,
+                                     fixed_conv_modes[i].mode,
+                                     fixed_conv_modes[j].mode, "fractuns",
+                                     fixed_conv_modes[i].name,
+                                     fixed_conv_modes[j].name);
+         arm_set_fixed_conv_libfunc (satfractuns_optab,
+                                     fixed_conv_modes[i].mode,
+                                     fixed_conv_modes[j].mode, "satfractuns",
+                                     fixed_conv_modes[i].name,
+                                     fixed_conv_modes[j].name);
+       }
+  }
+
    if (TARGET_AAPCS_BASED)
      synchronize_libfunc = init_one_libfunc ("__sync_synchronize");
  }
@@ -1217,14 +1415,14 @@ arm_build_builtin_va_list (void)
  {
    tree va_list_name;
    tree ap_field;
-  
+
    if (!TARGET_AAPCS_BASED)
      return std_build_builtin_va_list ();
  
    /* AAPCS \S 7.1.4 requires that va_list be a typedef for a type
       defined as:
  
-       struct __va_list 
+       struct __va_list
         {
          void *__ap;
         };
@@ -1248,7 +1446,7 @@ arm_build_builtin_va_list (void)
    TYPE_STUB_DECL (va_list_type) = va_list_name;
    /* Create the __ap field.  */
    ap_field = build_decl (BUILTINS_LOCATION,
-                        FIELD_DECL, 
+                        FIELD_DECL,
                          get_identifier ("__ap"),
                          ptr_type_node);
    DECL_ARTIFICIAL (ap_field) = 1;
@@ -1274,7 +1472,7 @@ arm_extract_valist_ptr (tree valist)
    if (TARGET_AAPCS_BASED)
      {
        tree ap_field = TYPE_FIELDS (TREE_TYPE (valist));
-      valist = build3 (COMPONENT_REF, TREE_TYPE (ap_field), 
+      valist = build3 (COMPONENT_REF, TREE_TYPE (ap_field),
                        valist, ap_field, NULL_TREE);
      }
  
@@ -1291,7 +1489,7 @@ arm_expand_builtin_va_start (tree valist, rtx nextarg)
  
  /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
  static tree
-arm_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, 
+arm_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
                           gimple_seq *post_p)
  {
    valist = arm_extract_valist_ptr (valist);
@@ -1500,6 +1698,10 @@ arm_option_override (void)
    if (TARGET_APCS_FLOAT)
      warning (0, "passing floating point arguments in fp regs not yet supported");
  
+  if (TARGET_LITTLE_WORDS)
+    warning (OPT_Wdeprecated, "%<mwords-little-endian%> is deprecated and "
+            "will be removed in a future release");
+
    /* Initialize boolean versions of the flags, for use in the arm.md file.  */
    arm_arch3m = (insn_flags & FL_ARCH3M) != 0;
    arm_arch4 = (insn_flags & FL_ARCH4) != 0;
@@ -1748,6 +1950,28 @@ arm_option_override (void)
         fix_cm3_ldrd = 0;
      }
  
+  /* Enable -munaligned-access by default for
+     - all ARMv6 architecture-based processors
+     - ARMv7-A, ARMv7-R, and ARMv7-M architecture-based processors.
+
+     Disable -munaligned-access by default for
+     - all pre-ARMv6 architecture-based processors
+     - ARMv6-M architecture-based processors.  */
+
+  if (unaligned_access == 2)
+    {
+      if (arm_arch6 && (arm_arch_notm || arm_arch7))
+       unaligned_access = 1;
+      else
+       unaligned_access = 0;
+    }
+  else if (unaligned_access == 1
+          && !(arm_arch6 && (arm_arch_notm || arm_arch7)))
+    {
+      warning (0, "target CPU does not support unaligned accesses");
+      unaligned_access = 0;
+    }
+
    if (TARGET_THUMB1 && flag_schedule_insns)
      {
        /* Don't warn since it's on by default in -O2.  */
@@ -1783,7 +2007,8 @@ arm_option_override (void)
                            global_options_set.x_param_values);
  
    /* ARM EABI defaults to strict volatile bitfields.  */
-  if (TARGET_AAPCS_BASED && flag_strict_volatile_bitfields < 0)
+  if (TARGET_AAPCS_BASED && flag_strict_volatile_bitfields < 0
+      && abi_version_at_least(2))
      flag_strict_volatile_bitfields = 1;
  
    /* Enable sw prefetching at -O3 for CPUS that have prefetch, and we have deemed
@@ -2123,7 +2348,7 @@ use_return_insn (int iscond, rtx sibling)
        if (saved_int_regs != 0 && saved_int_regs != (1 << LR_REGNUM))
         return 0;
  
-      if (flag_pic 
+      if (flag_pic
           && arm_pic_register != INVALID_REGNUM
           && df_regs_ever_live_p (PIC_OFFSET_TABLE_REGNUM))
         return 0;
@@ -2179,7 +2404,7 @@ const_ok_for_arm (HOST_WIDE_INT i)
  
    /* Get the number of trailing zeros.  */
    lowbit = ffs((int) i) - 1;
-  
+
    /* Only even shifts are allowed in ARM mode so round down to the
       nearest even number.  */
    if (TARGET_ARM)
@@ -2218,7 +2443,7 @@ const_ok_for_arm (HOST_WIDE_INT i)
  }
  
  /* Return true if I is a valid constant for the operation CODE.  */
-static int
+int
  const_ok_for_op (HOST_WIDE_INT i, enum rtx_code code)
  {
    if (const_ok_for_arm (i))
@@ -2231,9 +2456,17 @@ const_ok_for_op (HOST_WIDE_INT i, enum rtx_code code)
        if (arm_arch_thumb2 && (i & 0xffff0000) == 0)
         return 1;
        else
-       return 0;
+       /* Otherwise, try mvn.  */
+       return const_ok_for_arm (ARM_SIGN_EXTEND (~i));
  
      case PLUS:
+      /* See if we can use addw or subw.  */
+      if (TARGET_THUMB2
+         && ((i & 0xfffff000) == 0
+             || ((-i) & 0xfffff000) == 0))
+       return 1;
+      /* else fall through.  */
+
      case COMPARE:
      case EQ:
      case NE:
@@ -2349,68 +2582,41 @@ arm_split_constant (enum rtx_code code, enum machine_mode mode, rtx insn,
                            1);
  }
  
-/* Return the number of instructions required to synthesize the given
-   constant, if we start emitting them from bit-position I.  */
-static int
-count_insns_for_constant (HOST_WIDE_INT remainder, int i)
-{
-  HOST_WIDE_INT temp1;
-  int step_size = TARGET_ARM ? 2 : 1;
-  int num_insns = 0;
-
-  gcc_assert (TARGET_ARM || i == 0);
-
-  do
-    {
-      int end;
-
-      if (i <= 0)
-       i += 32;
-      if (remainder & (((1 << step_size) - 1) << (i - step_size)))
-       {
-         end = i - 8;
-         if (end < 0)
-           end += 32;
-         temp1 = remainder & ((0x0ff << end)
-                                   | ((i < end) ? (0xff >> (32 - end)) : 0));
-         remainder &= ~temp1;
-         num_insns++;
-         i -= 8 - step_size;
-       }
-      i -= step_size;
-    } while (remainder);
-  return num_insns;
-}
-
+/* Return a sequence of integers, in RETURN_SEQUENCE that fit into
+   ARM/THUMB2 immediates, and add up to VAL.
+   The function return value gives the number of insns required.  */
  static int
-find_best_start (unsigned HOST_WIDE_INT remainder)
+optimal_immediate_sequence (enum rtx_code code, unsigned HOST_WIDE_INT val,
+                           struct four_ints *return_sequence)
  {
    int best_consecutive_zeros = 0;
    int i;
    int best_start = 0;
+  int insns1, insns2;
+  struct four_ints tmp_sequence;
  
    /* If we aren't targetting ARM, the best place to start is always at
-     the bottom.  */
-  if (! TARGET_ARM)
-    return 0;
-
-  for (i = 0; i < 32; i += 2)
+     the bottom, otherwise look more closely.  */
+  if (TARGET_ARM)
      {
-      int consecutive_zeros = 0;
-
-      if (!(remainder & (3 << i)))
+      for (i = 0; i < 32; i += 2)
         {
-         while ((i < 32) && !(remainder & (3 << i)))
-           {
-             consecutive_zeros += 2;
-             i += 2;
-           }
-         if (consecutive_zeros > best_consecutive_zeros)
+         int consecutive_zeros = 0;
+
+         if (!(val & (3 << i)))
             {
-             best_consecutive_zeros = consecutive_zeros;
-             best_start = i - consecutive_zeros;
+             while ((i < 32) && !(val & (3 << i)))
+               {
+                 consecutive_zeros += 2;
+                 i += 2;
+               }
+             if (consecutive_zeros > best_consecutive_zeros)
+               {
+                 best_consecutive_zeros = consecutive_zeros;
+                 best_start = i - consecutive_zeros;
+               }
+             i -= 2;
             }
-         i -= 2;
         }
      }
  
@@ -2437,13 +2643,161 @@ find_best_start (unsigned HOST_WIDE_INT remainder)
       the constant starting from `best_start', and also starting from
       zero (i.e. with bit 31 first to be output).  If `best_start' doesn't
       yield a shorter sequence, we may as well use zero.  */
+  insns1 = optimal_immediate_sequence_1 (code, val, return_sequence, best_start);
    if (best_start != 0
-      && ((((unsigned HOST_WIDE_INT) 1) << best_start) < remainder)
-      && (count_insns_for_constant (remainder, 0) <=
-         count_insns_for_constant (remainder, best_start)))
-    best_start = 0;
+      && ((((unsigned HOST_WIDE_INT) 1) << best_start) < val))
+    {
+      insns2 = optimal_immediate_sequence_1 (code, val, &tmp_sequence, 0);
+      if (insns2 <= insns1)
+       {
+         *return_sequence = tmp_sequence;
+         insns1 = insns2;
+       }
+    }
+
+  return insns1;
+}
+
+/* As for optimal_immediate_sequence, but starting at bit-position I.  */
+static int
+optimal_immediate_sequence_1 (enum rtx_code code, unsigned HOST_WIDE_INT val,
+                            struct four_ints *return_sequence, int i)
+{
+  int remainder = val & 0xffffffff;
+  int insns = 0;
+
+  /* Try and find a way of doing the job in either two or three
+     instructions.
+
+     In ARM mode we can use 8-bit constants, rotated to any 2-bit aligned
+     location.  We start at position I.  This may be the MSB, or
+     optimal_immediate_sequence may have positioned it at the largest block
+     of zeros that are aligned on a 2-bit boundary. We then fill up the temps,
+     wrapping around to the top of the word when we drop off the bottom.
+     In the worst case this code should produce no more than four insns.
+
+     In Thumb2 mode, we can use 32/16-bit replicated constants, and 8-bit
+     constants, shifted to any arbitrary location.  We should always start
+     at the MSB.  */
+  do
+    {
+      int end;
+      unsigned int b1, b2, b3, b4;
+      unsigned HOST_WIDE_INT result;
+      int loc;
+
+      gcc_assert (insns < 4);
+
+      if (i <= 0)
+       i += 32;
+
+      /* First, find the next normal 12/8-bit shifted/rotated immediate.  */
+      if (remainder & ((TARGET_ARM ? (3 << (i - 2)) : (1 << (i - 1)))))
+       {
+         loc = i;
+         if (i <= 12 && TARGET_THUMB2 && code == PLUS)
+           /* We can use addw/subw for the last 12 bits.  */
+           result = remainder;
+         else
+           {
+             /* Use an 8-bit shifted/rotated immediate.  */
+             end = i - 8;
+             if (end < 0)
+               end += 32;
+             result = remainder & ((0x0ff << end)
+                                  | ((i < end) ? (0xff >> (32 - end))
+                                               : 0));
+             i -= 8;
+           }
+       }
+      else
+       {
+         /* Arm allows rotates by a multiple of two. Thumb-2 allows
+            arbitrary shifts.  */
+         i -= TARGET_ARM ? 2 : 1;
+         continue;
+       }
+
+      /* Next, see if we can do a better job with a thumb2 replicated
+        constant.
+
+         We do it this way around to catch the cases like 0x01F001E0 where
+        two 8-bit immediates would work, but a replicated constant would
+        make it worse.
+
+         TODO: 16-bit constants that don't clear all the bits, but still win.
+         TODO: Arithmetic splitting for set/add/sub, rather than bitwise.  */
+      if (TARGET_THUMB2)
+       {
+         b1 = (remainder & 0xff000000) >> 24;
+         b2 = (remainder & 0x00ff0000) >> 16;
+         b3 = (remainder & 0x0000ff00) >> 8;
+         b4 = remainder & 0xff;
+
+         if (loc > 24)
+           {
+             /* The 8-bit immediate already found clears b1 (and maybe b2),
+                but must leave b3 and b4 alone.  */
+
+             /* First try to find a 32-bit replicated constant that clears
+                almost everything.  We can assume that we can't do it in one,
+                or else we wouldn't be here.  */
+             unsigned int tmp = b1 & b2 & b3 & b4;
+             unsigned int tmp2 = tmp + (tmp << 8) + (tmp << 16)
+                                 + (tmp << 24);
+             unsigned int matching_bytes = (tmp == b1) + (tmp == b2)
+                                           + (tmp == b3) + (tmp == b4);
+             if (tmp
+                 && (matching_bytes >= 3
+                     || (matching_bytes == 2
+                         && const_ok_for_op (remainder & ~tmp2, code))))
+               {
+                 /* At least 3 of the bytes match, and the fourth has at
+                    least as many bits set, or two of the bytes match
+                    and it will only require one more insn to finish.  */
+                 result = tmp2;
+                 i = tmp != b1 ? 32
+                     : tmp != b2 ? 24
+                     : tmp != b3 ? 16
+                     : 8;
+               }
+
+             /* Second, try to find a 16-bit replicated constant that can
+                leave three of the bytes clear.  If b2 or b4 is already
+                zero, then we can.  If the 8-bit from above would not
+                clear b2 anyway, then we still win.  */
+             else if (b1 == b3 && (!b2 || !b4
+                              || (remainder & 0x00ff0000 & ~result)))
+               {
+                 result = remainder & 0xff00ff00;
+                 i = 24;
+               }
+           }
+         else if (loc > 16)
+           {
+             /* The 8-bit immediate already found clears b2 (and maybe b3)
+                and we don't get here unless b1 is already clear, but it will
+                leave b4 unchanged.  */
+
+             /* If we can clear b2 and b4 at once, then we win, since the
+                8-bits couldn't possibly reach that far.  */
+             if (b2 == b4)
+               {
+                 result = remainder & 0x00ff00ff;
+                 i = 16;
+               }
+           }
+       }
+
+      return_sequence->i[insns++] = result;
+      remainder &= ~result;
+
+      if (code == SET || code == MINUS)
+       code = PLUS;
+    }
+  while (remainder);
  
-  return best_start;
+  return insns;
  }
  
  /* Emit an instruction with the indicated PATTERN.  If COND is
@@ -2460,7 +2814,6 @@ emit_constant_insn (rtx cond, rtx pattern)
  
  /* As above, but extra parameter GENERATE which, if clear, suppresses
     RTL generation.  */
-/* ??? This needs more work for thumb2.  */
  
  static int
  arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond,
@@ -2471,15 +2824,15 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond,
    int can_negate = 0;
    int final_invert = 0;
    int i;
-  int num_bits_set = 0;
    int set_sign_bit_copies = 0;
    int clear_sign_bit_copies = 0;
    int clear_zero_bit_copies = 0;
    int set_zero_bit_copies = 0;
-  int insns = 0;
+  int insns = 0, neg_insns, inv_insns;
    unsigned HOST_WIDE_INT temp1, temp2;
    unsigned HOST_WIDE_INT remainder = val & 0xffffffff;
-  int step_size = TARGET_ARM ? 2 : 1;
+  struct four_ints *immediates;
+  struct four_ints pos_immediates, neg_immediates, inv_immediates;
  
    /* Find out which operations are safe for a given CODE.  Also do a quick
       check for degenerate cases; these can occur when DImode operations
@@ -2488,7 +2841,6 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond,
      {
      case SET:
        can_invert = 1;
-      can_negate = 1;
        break;
  
      case PLUS:
@@ -2556,6 +2908,7 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond,
                                              gen_rtx_NOT (mode, source)));
           return 1;
         }
+      final_invert = 1;
        break;
  
      case MINUS:
@@ -2578,7 +2931,6 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond,
                                                             source)));
           return 1;
         }
-      can_negate = 1;
  
        break;
  
@@ -2990,120 +3342,97 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond,
        break;
      }
  
-  for (i = 0; i < 32; i++)
-    if (remainder & (1 << i))
-      num_bits_set++;
-
-  if ((code == AND) || (can_invert && num_bits_set > 16))
-    remainder ^= 0xffffffff;
-  else if (code == PLUS && num_bits_set > 16)
-    remainder = (-remainder) & 0xffffffff;
-
-  /* For XOR, if more than half the bits are set and there's a sequence
-     of more than 8 consecutive ones in the pattern then we can XOR by the
-     inverted constant and then invert the final result; this may save an
-     instruction and might also lead to the final mvn being merged with
-     some other operation.  */
-  else if (code == XOR && num_bits_set > 16
-          && (count_insns_for_constant (remainder ^ 0xffffffff,
-                                        find_best_start
-                                        (remainder ^ 0xffffffff))
-              < count_insns_for_constant (remainder,
-                                          find_best_start (remainder))))
-    {
-      remainder ^= 0xffffffff;
-      final_invert = 1;
+  /* Calculate what the instruction sequences would be if we generated the
+     constant normally, negated, or inverted.  */
+  if (code == AND)
+    /* AND cannot be split into multiple insns, so invert and use BIC.  */
+    insns = 99;
+  else
+    insns = optimal_immediate_sequence (code, remainder, &pos_immediates);
+
+  if (can_negate)
+    neg_insns = optimal_immediate_sequence (code, (-remainder) & 0xffffffff,
+                                           &neg_immediates);
+  else
+    neg_insns = 99;
+
+  if (can_invert || final_invert)
+    inv_insns = optimal_immediate_sequence (code, remainder ^ 0xffffffff,
+                                           &inv_immediates);
+  else
+    inv_insns = 99;
+
+  immediates = &pos_immediates;
+
+  /* Is the negated immediate sequence more efficient?  */
+  if (neg_insns < insns && neg_insns <= inv_insns)
+    {
+      insns = neg_insns;
+      immediates = &neg_immediates;
+    }
+  else
+    can_negate = 0;
+
+  /* Is the inverted immediate sequence more efficient?
+     We must allow for an extra NOT instruction for XOR operations, although
+     there is some chance that the final 'mvn' will get optimized later.  */
+  if ((inv_insns + 1) < insns || (!final_invert && inv_insns < insns))
+    {
+      insns = inv_insns;
+      immediates = &inv_immediates;
      }
    else
      {
        can_invert = 0;
-      can_negate = 0;
+      final_invert = 0;
      }
  
-  /* Now try and find a way of doing the job in either two or three
-     instructions.
-     We start by looking for the largest block of zeros that are aligned on
-     a 2-bit boundary, we then fill up the temps, wrapping around to the
-     top of the word when we drop off the bottom.
-     In the worst case this code should produce no more than four insns.
-     Thumb-2 constants are shifted, not rotated, so the MSB is always the
-     best place to start.  */
+  /* Now output the chosen sequence as instructions.  */
+  if (generate)
+    {
+      for (i = 0; i < insns; i++)
+       {
+         rtx new_src, temp1_rtx;
  
-  /* ??? Use thumb2 replicated constants when the high and low halfwords are
-     the same.  */
-  {
-    /* Now start emitting the insns.  */
-    i = find_best_start (remainder);
-    do
-      {
-       int end;
+         temp1 = immediates->i[i];
  
-       if (i <= 0)
-         i += 32;
-       if (remainder & (3 << (i - 2)))
-         {
-           end = i - 8;
-           if (end < 0)
-             end += 32;
-           temp1 = remainder & ((0x0ff << end)
-                                | ((i < end) ? (0xff >> (32 - end)) : 0));
-           remainder &= ~temp1;
-
-           if (generate)
-             {
-               rtx new_src, temp1_rtx;
+         if (code == SET || code == MINUS)
+           new_src = (subtargets ? gen_reg_rtx (mode) : target);
+         else if ((final_invert || i < (insns - 1)) && subtargets)
+           new_src = gen_reg_rtx (mode);
+         else
+           new_src = target;
  
-               if (code == SET || code == MINUS)
-                 {
-                   new_src = (subtargets ? gen_reg_rtx (mode) : target);
-                   if (can_invert && code != MINUS)
-                     temp1 = ~temp1;
-                 }
-               else
-                 {
-                   if ((final_invert || remainder) && subtargets)
-                     new_src = gen_reg_rtx (mode);
-                   else
-                     new_src = target;
-                   if (can_invert)
-                     temp1 = ~temp1;
-                   else if (can_negate)
-                     temp1 = -temp1;
-                 }
+         if (can_invert)
+           temp1 = ~temp1;
+         else if (can_negate)
+           temp1 = -temp1;
  
-               temp1 = trunc_int_for_mode (temp1, mode);
-               temp1_rtx = GEN_INT (temp1);
+         temp1 = trunc_int_for_mode (temp1, mode);
+         temp1_rtx = GEN_INT (temp1);
  
-               if (code == SET)
-                 ;
-               else if (code == MINUS)
-                 temp1_rtx = gen_rtx_MINUS (mode, temp1_rtx, source);
-               else
-                 temp1_rtx = gen_rtx_fmt_ee (code, mode, source, temp1_rtx);
+         if (code == SET)
+           ;
+         else if (code == MINUS)
+           temp1_rtx = gen_rtx_MINUS (mode, temp1_rtx, source);
+         else
+           temp1_rtx = gen_rtx_fmt_ee (code, mode, source, temp1_rtx);
  
-               emit_constant_insn (cond,
-                                   gen_rtx_SET (VOIDmode, new_src,
-                                                temp1_rtx));
-               source = new_src;
-             }
+         emit_constant_insn (cond,
+                             gen_rtx_SET (VOIDmode, new_src,
+                                          temp1_rtx));
+         source = new_src;
  
-           if (code == SET)
-             {
-               can_invert = 0;
-               code = PLUS;
-             }
-           else if (code == MINUS)
+         if (code == SET)
+           {
+             can_negate = can_invert;
+             can_invert = 0;
               code = PLUS;
-
-           insns++;
-           i -= 8 - step_size;
-         }
-       /* Arm allows rotates by a multiple of two. Thumb-2 allows arbitrary
-          shifts.  */
-       i -= step_size;
-      }
-    while (remainder);
-  }
+           }
+         else if (code == MINUS)
+           code = PLUS;
+       }
+    }
  
    if (final_invert)
      {
@@ -3188,6 +3517,19 @@ arm_canonicalize_comparison (enum rtx_code code, rtx *op0, rtx *op1)
        return code;
      }
  
+  /* If *op0 is (zero_extend:SI (subreg:QI (reg:SI) 0)) and comparing
+     with const0_rtx, change it to (and:SI (reg:SI) (const_int 255)),
+     to facilitate possible combining with a cmp into 'ands'.  */
+  if (mode == SImode
+      && GET_CODE (*op0) == ZERO_EXTEND
+      && GET_CODE (XEXP (*op0, 0)) == SUBREG
+      && GET_MODE (XEXP (*op0, 0)) == QImode
+      && GET_MODE (SUBREG_REG (XEXP (*op0, 0))) == SImode
+      && subreg_lowpart_p (XEXP (*op0, 0))
+      && *op1 == const0_rtx)
+    *op0 = gen_rtx_AND (SImode, SUBREG_REG (XEXP (*op0, 0)),
+                       GEN_INT (255));
+
    /* Comparisons smaller than DImode.  Only adjust comparisons against
       an out-of-range constant.  */
    if (GET_CODE (*op1) != CONST_INT
@@ -3282,7 +3624,7 @@ arm_function_value(const_tree type, const_tree func,
         }
      }
  
-  return LIBCALL_VALUE (mode);
+  return arm_libcall_value_1 (mode);
  }
  
  static int
@@ -3323,7 +3665,7 @@ arm_libcall_uses_aapcs_base (const_rtx libcall)
                    convert_optab_libfunc (sfloat_optab, SFmode, DImode));
        add_libcall (libcall_htab,
                    convert_optab_libfunc (sfloat_optab, DFmode, DImode));
-      
+
        add_libcall (libcall_htab,
                    convert_optab_libfunc (ufloat_optab, SFmode, SImode));
        add_libcall (libcall_htab,
@@ -3345,12 +3687,59 @@ arm_libcall_uses_aapcs_base (const_rtx libcall)
                    convert_optab_libfunc (sfix_optab, DImode, SFmode));
        add_libcall (libcall_htab,
                    convert_optab_libfunc (ufix_optab, DImode, SFmode));
+
+      /* Values from double-precision helper functions are returned in core
+        registers if the selected core only supports single-precision
+        arithmetic, even if we are using the hard-float ABI.  The same is
+        true for single-precision helpers, but we will never be using the
+        hard-float ABI on a CPU which doesn't support single-precision
+        operations in hardware.  */
+      add_libcall (libcall_htab, optab_libfunc (add_optab, DFmode));
+      add_libcall (libcall_htab, optab_libfunc (sdiv_optab, DFmode));
+      add_libcall (libcall_htab, optab_libfunc (smul_optab, DFmode));
+      add_libcall (libcall_htab, optab_libfunc (neg_optab, DFmode));
+      add_libcall (libcall_htab, optab_libfunc (sub_optab, DFmode));
+      add_libcall (libcall_htab, optab_libfunc (eq_optab, DFmode));
+      add_libcall (libcall_htab, optab_libfunc (lt_optab, DFmode));
+      add_libcall (libcall_htab, optab_libfunc (le_optab, DFmode));
+      add_libcall (libcall_htab, optab_libfunc (ge_optab, DFmode));
+      add_libcall (libcall_htab, optab_libfunc (gt_optab, DFmode));
+      add_libcall (libcall_htab, optab_libfunc (unord_optab, DFmode));
+      add_libcall (libcall_htab, convert_optab_libfunc (sext_optab, DFmode,
+                                                       SFmode));
+      add_libcall (libcall_htab, convert_optab_libfunc (trunc_optab, SFmode,
+                                                       DFmode));
      }
  
    return libcall && htab_find (libcall_htab, libcall) != NULL;
  }
  
-rtx
+static rtx
+arm_libcall_value_1 (enum machine_mode mode)
+{
+  if (TARGET_AAPCS_BASED)
+    return aapcs_libcall_value (mode);
+  else if (TARGET_32BIT
+          && TARGET_HARD_FLOAT_ABI
+          && TARGET_FPA
+          && GET_MODE_CLASS (mode) == MODE_FLOAT)
+    return gen_rtx_REG (mode, FIRST_FPA_REGNUM);
+  else if (TARGET_32BIT
+          && TARGET_HARD_FLOAT_ABI
+          && TARGET_MAVERICK
+          && GET_MODE_CLASS (mode) == MODE_FLOAT)
+    return gen_rtx_REG (mode, FIRST_CIRRUS_FP_REGNUM);
+  else if (TARGET_IWMMXT_ABI
+          && arm_vector_mode_supported_p (mode))
+    return gen_rtx_REG (mode, FIRST_IWMMXT_REGNUM);
+  else
+    return gen_rtx_REG (mode, ARG_REGISTER (1));
+}
+
+/* Define how to find the value returned by a library function
+   assuming the value has mode MODE.  */
+
+static rtx
  arm_libcall_value (enum machine_mode mode, const_rtx libcall)
  {
    if (TARGET_AAPCS_BASED && arm_pcs_default != ARM_PCS_AAPCS
@@ -3363,7 +3752,33 @@ arm_libcall_value (enum machine_mode mode, const_rtx libcall)
  
      }
  
-  return LIBCALL_VALUE (mode);
+  return arm_libcall_value_1 (mode);
+}
+
+/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */
+
+static bool
+arm_function_value_regno_p (const unsigned int regno)
+{
+  if (regno == ARG_REGISTER (1)
+      || (TARGET_32BIT
+         && TARGET_AAPCS_BASED
+         && TARGET_VFP
+         && TARGET_HARD_FLOAT
+         && regno == FIRST_VFP_REGNUM)
+      || (TARGET_32BIT
+         && TARGET_HARD_FLOAT_ABI
+         && TARGET_MAVERICK
+         && regno == FIRST_CIRRUS_FP_REGNUM)
+      || (TARGET_IWMMXT_ABI
+         && regno == FIRST_IWMMXT_REGNUM)
+      || (TARGET_32BIT
+         && TARGET_HARD_FLOAT_ABI
+         && TARGET_FPA
+         && regno == FIRST_FPA_REGNUM))
+    return true;
+
+  return false;
  }
  
  /* Determine the amount of memory needed to store the possible return
@@ -3621,7 +4036,7 @@ arm_get_pcs_model (const_tree type, const_tree decl)
          (no argument is ever a candidate for a co-processor
          register).  */
        bool base_rules = stdarg_p (type);
-      
+
        if (user_convention)
         {
           if (user_pcs > ARM_PCS_AAPCS_LOCAL)
@@ -3656,7 +4071,7 @@ arm_get_pcs_model (const_tree type, const_tree decl)
  static void
  aapcs_vfp_cum_init (CUMULATIVE_ARGS *pcum  ATTRIBUTE_UNUSED,
                     const_tree fntype ATTRIBUTE_UNUSED,
-                   rtx libcall ATTRIBUTE_UNUSED, 
+                   rtx libcall ATTRIBUTE_UNUSED,
                     const_tree fndecl ATTRIBUTE_UNUSED)
  {
    /* Record the unallocated VFP registers.  */
@@ -3761,7 +4176,7 @@ aapcs_vfp_sub_candidate (const_tree type, enum machine_mode *modep)
  
         return count;
        }
-      
+
      case RECORD_TYPE:
        {
         int count = 0;
@@ -3909,7 +4324,7 @@ aapcs_vfp_is_return_candidate (enum arm_pcs pcs_variant,
  }
  
  static bool
-aapcs_vfp_is_call_candidate (CUMULATIVE_ARGS *pcum, enum machine_mode mode, 
+aapcs_vfp_is_call_candidate (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
                              const_tree type)
  {
    if (!use_vfp_abi (pcum->pcs_variant, false))
@@ -3927,7 +4342,7 @@ aapcs_vfp_allocate (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
    int shift = GET_MODE_SIZE (pcum->aapcs_vfp_rmode) / GET_MODE_SIZE (SFmode);
    unsigned mask = (1 << (shift * pcum->aapcs_vfp_rcount)) - 1;
    int regno;
-  
+
    for (regno = 0; regno < NUM_VFP_ARG_REGS; regno += shift)
      if (((pcum->aapcs_vfp_regs_free >> regno) & mask) == mask)
        {
@@ -3954,10 +4369,10 @@ aapcs_vfp_allocate (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
             par = gen_rtx_PARALLEL (mode, rtvec_alloc (rcount));
             for (i = 0; i < rcount; i++)
               {
-               rtx tmp = gen_rtx_REG (rmode, 
+               rtx tmp = gen_rtx_REG (rmode,
                                        FIRST_VFP_REGNUM + regno + i * rshift);
                 tmp = gen_rtx_EXPR_LIST
-                 (VOIDmode, tmp, 
+                 (VOIDmode, tmp,
                    GEN_INT (i * GET_MODE_SIZE (rmode)));
                 XVECEXP (par, 0, i) = tmp;
               }
@@ -3977,7 +4392,7 @@ aapcs_vfp_allocate_return_reg (enum arm_pcs pcs_variant ATTRIBUTE_UNUSED,
                                const_tree type ATTRIBUTE_UNUSED)
  {
    if (!use_vfp_abi (pcs_variant, false))
-    return false;
+    return NULL;
  
    if (mode == BLKmode || (mode == TImode && !TARGET_NEON))
      {
@@ -3986,7 +4401,7 @@ aapcs_vfp_allocate_return_reg (enum arm_pcs pcs_variant ATTRIBUTE_UNUSED,
        int i;
        rtx par;
        int shift;
-      
+
        aapcs_vfp_is_call_or_return_candidate (pcs_variant, mode, type,
                                              &ag_mode, &count);
  
@@ -4005,7 +4420,7 @@ aapcs_vfp_allocate_return_reg (enum arm_pcs pcs_variant ATTRIBUTE_UNUSED,
        for (i = 0; i < count; i++)
         {
           rtx tmp = gen_rtx_REG (ag_mode, FIRST_VFP_REGNUM + i * shift);
-         tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, 
+         tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
                                    GEN_INT (i * GET_MODE_SIZE (ag_mode)));
           XVECEXP (par, 0, i) = tmp;
         }
@@ -4042,7 +4457,7 @@ aapcs_vfp_advance (CUMULATIVE_ARGS *pcum  ATTRIBUTE_UNUSED,
     and stops after the first match.  If that entry then fails to put
     the argument into a co-processor register, the argument will go on
     the stack.  */
-static struct 
+static struct
  {
    /* Initialize co-processor related state in CUMULATIVE_ARGS structure.  */
    void (*cum_init) (CUMULATIVE_ARGS *, const_tree, rtx, const_tree);
@@ -4078,7 +4493,7 @@ static struct
  #undef AAPCS_CP
  
  static int
-aapcs_select_call_coproc (CUMULATIVE_ARGS *pcum, enum machine_mode mode, 
+aapcs_select_call_coproc (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
                           const_tree type)
  {
    int i;
@@ -4118,7 +4533,7 @@ aapcs_select_return_coproc (const_tree type, const_tree fntype)
        int i;
  
        for (i = 0; i < ARM_NUM_COPROC_SLOTS; i++)
-       if (aapcs_cp_arg_layout[i].is_return_candidate (pcs_variant, 
+       if (aapcs_cp_arg_layout[i].is_return_candidate (pcs_variant,
                                                         TYPE_MODE (type),
                                                         type))
           return i;
@@ -4181,9 +4596,13 @@ aapcs_allocate_return_reg (enum machine_mode mode, const_tree type,
    return gen_rtx_REG (mode, R0_REGNUM);
  }
  
-rtx
+static rtx
  aapcs_libcall_value (enum machine_mode mode)
  {
+  if (BYTES_BIG_ENDIAN && ALL_FIXED_POINT_MODE_P (mode)
+      && GET_MODE_SIZE (mode) <= 4)
+    mode = SImode;
+
    return aapcs_allocate_return_reg (mode, NULL_TREE, NULL_TREE);
  }
  
@@ -4206,7 +4625,7 @@ aapcs_layout_arg (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
       anonymous argument which is on the stack.  */
    if (!named)
      return;
-  
+
    /* Is this a potential co-processor register candidate?  */
    if (pcum->pcs_variant != ARM_PCS_AAPCS)
      {
@@ -4306,7 +4725,7 @@ arm_init_cumulative_args (CUMULATIVE_ARGS *pcum, tree fntype,
      {
        if (arm_libcall_uses_aapcs_base (libname))
         pcum->pcs_variant = ARM_PCS_AAPCS;
- 
+
        pcum->aapcs_ncrn = pcum->aapcs_next_ncrn = 0;
        pcum->aapcs_reg = NULL_RTX;
        pcum->aapcs_partial = 0;
@@ -4383,9 +4802,10 @@ arm_needs_doubleword_align (enum machine_mode mode, const_tree type)
     indeed make it pass in the stack if necessary).  */
  
  static rtx
-arm_function_arg (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
+arm_function_arg (cumulative_args_t pcum_v, enum machine_mode mode,
                   const_tree type, bool named)
  {
+  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
    int nregs;
  
    /* Handle the special case quickly.  Pick an arbitrary value for op2 of
@@ -4443,9 +4863,10 @@ arm_function_arg_boundary (enum machine_mode mode, const_tree type)
  }
  
  static int
-arm_arg_partial_bytes (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
+arm_arg_partial_bytes (cumulative_args_t pcum_v, enum machine_mode mode,
                        tree type, bool named)
  {
+  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
    int nregs = pcum->nregs;
  
    if (pcum->pcs_variant <= ARM_PCS_AAPCS_LOCAL)
@@ -4470,9 +4891,11 @@ arm_arg_partial_bytes (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
     (TYPE is null for libcalls where that information may not be available.)  */
  
  static void
-arm_function_arg_advance (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
+arm_function_arg_advance (cumulative_args_t pcum_v, enum machine_mode mode,
                           const_tree type, bool named)
  {
+  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
+
    if (pcum->pcs_variant <= ARM_PCS_AAPCS_LOCAL)
      {
        aapcs_layout_arg (pcum, mode, type, named);
@@ -4506,7 +4929,7 @@ arm_function_arg_advance (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
     extension to the ARM ABI.  */
  
  static bool
-arm_pass_by_reference (CUMULATIVE_ARGS *cum ATTRIBUTE_UNUSED,
+arm_pass_by_reference (cumulative_args_t cum ATTRIBUTE_UNUSED,
                        enum machine_mode mode ATTRIBUTE_UNUSED,
                        const_tree type, bool named ATTRIBUTE_UNUSED)
  {
@@ -4845,6 +5268,14 @@ arm_function_ok_for_sibcall (tree decl, tree exp)
    if (IS_STACKALIGN (func_type))
      return false;
  
+  /* The AAPCS says that, on bare-metal, calls to unresolved weak
+     references should become a NOP.  Don't convert such calls into
+     sibling calls.  */
+  if (TARGET_AAPCS_BASED
+      && arm_abi == ARM_ABI_AAPCS
+      && DECL_WEAK (decl))
+    return false;
+
    /* Everything else is ok.  */
    return true;
  }
@@ -5147,11 +5578,7 @@ arm_load_pic_register (unsigned long saved_regs ATTRIBUTE_UNUSED)
  
        if (TARGET_32BIT)
         {
-         emit_insn (gen_pic_load_addr_32bit (pic_reg, pic_rtx));
-         if (TARGET_ARM)
-           emit_insn (gen_pic_add_dot_plus_eight (pic_reg, pic_reg, labelno));
-         else
-           emit_insn (gen_pic_add_dot_plus_four (pic_reg, pic_reg, labelno));
+         emit_insn (gen_pic_load_addr_unified (pic_reg, pic_rtx, labelno));
         }
        else /* TARGET_THUMB1 */
         {
@@ -5164,10 +5591,10 @@ arm_load_pic_register (unsigned long saved_regs ATTRIBUTE_UNUSED)
                                      thumb_find_work_register (saved_regs));
               emit_insn (gen_pic_load_addr_thumb1 (pic_tmp, pic_rtx));
               emit_insn (gen_movsi (pic_offset_table_rtx, pic_tmp));
+             emit_insn (gen_pic_add_dot_plus_four (pic_reg, pic_reg, labelno));
             }
           else
-           emit_insn (gen_pic_load_addr_thumb1 (pic_reg, pic_rtx));
-         emit_insn (gen_pic_add_dot_plus_four (pic_reg, pic_reg, labelno));
+           emit_insn (gen_pic_load_addr_unified (pic_reg, pic_rtx, labelno));
         }
      }
  
@@ -5197,20 +5624,7 @@ arm_pic_static_addr (rtx orig, rtx reg)
                                 UNSPEC_SYMBOL_OFFSET);
    offset_rtx = gen_rtx_CONST (Pmode, offset_rtx);
  
-  if (TARGET_32BIT)
-    {
-      emit_insn (gen_pic_load_addr_32bit (reg, offset_rtx));
-      if (TARGET_ARM)
-        insn = emit_insn (gen_pic_add_dot_plus_eight (reg, reg, labelno));
-      else
-        insn = emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno));
-    }
-  else /* TARGET_THUMB1 */
-    {
-      emit_insn (gen_pic_load_addr_thumb1 (reg, offset_rtx));
-      insn = emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno));
-    }
-
+  insn = emit_insn (gen_pic_load_addr_unified (reg, offset_rtx, labelno));
    return insn;
  }
  
@@ -5253,7 +5667,7 @@ static bool
  will_be_in_index_register (const_rtx x)
  {
    /* arm.md: calculate_pic_address will split this into a register.  */
-  return GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_PIC_SYM;
+  return GET_CODE (x) == UNSPEC && (XINT (x, 1) == UNSPEC_PIC_SYM);
  }
  
  /* Return nonzero if X is a valid ARM state address operand.  */
@@ -5349,7 +5763,7 @@ thumb2_legitimate_address_p (enum machine_mode mode, rtx x, int strict_p)
  {
    bool use_ldrd;
    enum rtx_code code = GET_CODE (x);
-  
+
    if (arm_address_register_rtx_p (x, strict_p))
      return 1;
  
@@ -5377,7 +5791,7 @@ thumb2_legitimate_address_p (enum machine_mode mode, rtx x, int strict_p)
        offset = INTVAL(addend);
        if (GET_MODE_SIZE (mode) <= 4)
         return (offset > -256 && offset < 256);
-      
+
        return (use_ldrd && offset > -1024 && offset < 1024
               && (offset & 3) == 0);
      }
@@ -5533,14 +5947,14 @@ static bool
  thumb2_index_mul_operand (rtx op)
  {
    HOST_WIDE_INT val;
-  
+
    if (GET_CODE(op) != CONST_INT)
      return false;
  
    val = INTVAL(op);
    return (val == 1 || val == 2 || val == 4 || val == 8);
  }
-  
+
  /* Return nonzero if INDEX is a valid Thumb-2 address index operand.  */
  static int
  thumb2_legitimate_index_p (enum machine_mode mode, rtx index, int strict_p)
@@ -5871,6 +6285,7 @@ arm_call_tls_get_addr (rtx x, rtx reg, rtx *valuep, int reloc)
  {
    rtx insns, label, labelno, sum;
  
+  gcc_assert (reloc != TLS_DESCSEQ);
    start_sequence ();
  
    labelno = GEN_INT (pic_labelno++);
@@ -5885,12 +6300,11 @@ arm_call_tls_get_addr (rtx x, rtx reg, rtx *valuep, int reloc)
  
    if (TARGET_ARM)
      emit_insn (gen_pic_add_dot_plus_eight (reg, reg, labelno));
-  else if (TARGET_THUMB2)
-    emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno));
-  else /* TARGET_THUMB1 */
+  else
      emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno));
  
-  *valuep = emit_library_call_value (get_tls_get_addr (), NULL_RTX, LCT_PURE, /* LCT_CONST?  */
+  *valuep = emit_library_call_value (get_tls_get_addr (), NULL_RTX,
+                                    LCT_PURE, /* LCT_CONST?  */
                                      Pmode, 1, reg, Pmode);
  
    insns = get_insns ();
@@ -5899,6 +6313,29 @@ arm_call_tls_get_addr (rtx x, rtx reg, rtx *valuep, int reloc)
    return insns;
  }
  
+static rtx
+arm_tls_descseq_addr (rtx x, rtx reg)
+{
+  rtx labelno = GEN_INT (pic_labelno++);
+  rtx label = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, labelno), UNSPEC_PIC_LABEL);
+  rtx sum = gen_rtx_UNSPEC (Pmode,
+                           gen_rtvec (4, x, GEN_INT (TLS_DESCSEQ),
+                                      gen_rtx_CONST (VOIDmode, label),
+                                      GEN_INT (!TARGET_ARM)),
+                           UNSPEC_TLS);
+  rtx reg0 = load_tls_operand (sum, gen_rtx_REG (SImode, 0));
+
+  emit_insn (gen_tlscall (x, labelno));
+  if (!reg)
+    reg = gen_reg_rtx (SImode);
+  else
+    gcc_assert (REGNO (reg) != 0);
+
+  emit_move_insn (reg, reg0);
+
+  return reg;
+}
+
  rtx
  legitimize_tls_address (rtx x, rtx reg)
  {
@@ -5908,26 +6345,51 @@ legitimize_tls_address (rtx x, rtx reg)
    switch (model)
      {
      case TLS_MODEL_GLOBAL_DYNAMIC:
-      insns = arm_call_tls_get_addr (x, reg, &ret, TLS_GD32);
-      dest = gen_reg_rtx (Pmode);
-      emit_libcall_block (insns, dest, ret, x);
+      if (TARGET_GNU2_TLS)
+       {
+         reg = arm_tls_descseq_addr (x, reg);
+
+         tp = arm_load_tp (NULL_RTX);
+
+         dest = gen_rtx_PLUS (Pmode, tp, reg);
+       }
+      else
+       {
+         /* Original scheme */
+         insns = arm_call_tls_get_addr (x, reg, &ret, TLS_GD32);
+         dest = gen_reg_rtx (Pmode);
+         emit_libcall_block (insns, dest, ret, x);
+       }
        return dest;
  
      case TLS_MODEL_LOCAL_DYNAMIC:
-      insns = arm_call_tls_get_addr (x, reg, &ret, TLS_LDM32);
+      if (TARGET_GNU2_TLS)
+       {
+         reg = arm_tls_descseq_addr (x, reg);
  
-      /* Attach a unique REG_EQUIV, to allow the RTL optimizers to
-        share the LDM result with other LD model accesses.  */
-      eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const1_rtx),
-                           UNSPEC_TLS);
-      dest = gen_reg_rtx (Pmode);
-      emit_libcall_block (insns, dest, ret, eqv);
+         tp = arm_load_tp (NULL_RTX);
+
+         dest = gen_rtx_PLUS (Pmode, tp, reg);
+       }
+      else
+       {
+         insns = arm_call_tls_get_addr (x, reg, &ret, TLS_LDM32);
+
+         /* Attach a unique REG_EQUIV, to allow the RTL optimizers to
+            share the LDM result with other LD model accesses.  */
+         eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const1_rtx),
+                               UNSPEC_TLS);
+         dest = gen_reg_rtx (Pmode);
+         emit_libcall_block (insns, dest, ret, eqv);
  
-      /* Load the addend.  */
-      addend = gen_rtx_UNSPEC (Pmode, gen_rtvec (2, x, GEN_INT (TLS_LDO32)),
-                              UNSPEC_TLS);
-      addend = force_reg (SImode, gen_rtx_CONST (SImode, addend));
-      return gen_rtx_PLUS (Pmode, dest, addend);
+         /* Load the addend.  */
+         addend = gen_rtx_UNSPEC (Pmode, gen_rtvec (2, x,
+                                                    GEN_INT (TLS_LDO32)),
+                                  UNSPEC_TLS);
+         addend = force_reg (SImode, gen_rtx_CONST (SImode, addend));
+         dest = gen_rtx_PLUS (Pmode, dest, addend);
+       }
+      return dest;
  
      case TLS_MODEL_INITIAL_EXEC:
        labelno = GEN_INT (pic_labelno++);
@@ -6166,9 +6628,26 @@ arm_legitimize_reload_address (rtx *p,
                                int opnum, int type,
                                int ind_levels ATTRIBUTE_UNUSED)
  {
+  /* We must recognize output that we have already generated ourselves.  */
+  if (GET_CODE (*p) == PLUS
+      && GET_CODE (XEXP (*p, 0)) == PLUS
+      && GET_CODE (XEXP (XEXP (*p, 0), 0)) == REG
+      && GET_CODE (XEXP (XEXP (*p, 0), 1)) == CONST_INT
+      && GET_CODE (XEXP (*p, 1)) == CONST_INT)
+    {
+      push_reload (XEXP (*p, 0), NULL_RTX, &XEXP (*p, 0), NULL,
+                  MODE_BASE_REG_CLASS (mode), GET_MODE (*p),
+                  VOIDmode, 0, 0, opnum, (enum reload_type) type);
+      return true;
+    }
+
    if (GET_CODE (*p) == PLUS
        && GET_CODE (XEXP (*p, 0)) == REG
        && ARM_REGNO_OK_FOR_BASE_P (REGNO (XEXP (*p, 0)))
+      /* If the base register is equivalent to a constant, let the generic
+        code handle it.  Otherwise we will run into problems if a future
+        reload pass decides to rematerialize the constant.  */
+      && !reg_equiv_constant (ORIGINAL_REGNO (XEXP (*p, 0)))
        && GET_CODE (XEXP (*p, 1)) == CONST_INT)
      {
        HOST_WIDE_INT val = INTVAL (XEXP (*p, 1));
@@ -6632,7 +7111,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
        if (GET_CODE (XEXP (x, 1)) == REG)
         *total = COSTS_N_INSNS (1); /* Need to subtract from 32 */
        else if (GET_CODE (XEXP (x, 1)) != CONST_INT)
-       *total = rtx_cost (XEXP (x, 1), code, speed);
+       *total = rtx_cost (XEXP (x, 1), code, 1, speed);
  
        /* Fall through */
      case ROTATERT:
@@ -6644,7 +7123,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
  
        /* Fall through */
      case ASHIFT: case LSHIFTRT: case ASHIFTRT:
-      *total += rtx_cost (XEXP (x, 0), code, speed);
+      *total += rtx_cost (XEXP (x, 0), code, 0, speed);
        if (mode == DImode)
         {
           *total += COSTS_N_INSNS (3);
@@ -6667,14 +7146,14 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
           if (GET_CODE (XEXP (x, 0)) == CONST_INT
               && const_ok_for_arm (INTVAL (XEXP (x, 0))))
             {
-             *total += rtx_cost (XEXP (x, 1), code, speed);
+             *total += rtx_cost (XEXP (x, 1), code, 1, speed);
               return true;
             }
  
           if (GET_CODE (XEXP (x, 1)) == CONST_INT
               && const_ok_for_arm (INTVAL (XEXP (x, 1))))
             {
-             *total += rtx_cost (XEXP (x, 0), code, speed);
+             *total += rtx_cost (XEXP (x, 0), code, 0, speed);
               return true;
             }
  
@@ -6691,14 +7170,14 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
               if (GET_CODE (XEXP (x, 0)) == CONST_DOUBLE
                   && arm_const_double_rtx (XEXP (x, 0)))
                 {
-                 *total += rtx_cost (XEXP (x, 1), code, speed);
+                 *total += rtx_cost (XEXP (x, 1), code, 1, speed);
                   return true;
                 }
  
               if (GET_CODE (XEXP (x, 1)) == CONST_DOUBLE
                   && arm_const_double_rtx (XEXP (x, 1)))
                 {
-                 *total += rtx_cost (XEXP (x, 0), code, speed);
+                 *total += rtx_cost (XEXP (x, 0), code, 0, speed);
                   return true;
                 }
  
@@ -6712,7 +7191,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
        if (GET_CODE (XEXP (x, 0)) == CONST_INT
           && const_ok_for_arm (INTVAL (XEXP (x, 0))))
         {
-         *total += rtx_cost (XEXP (x, 1), code, speed);
+         *total += rtx_cost (XEXP (x, 1), code, 1, speed);
           return true;
         }
  
@@ -6721,8 +7200,8 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
           || subcode == LSHIFTRT
           || subcode == ROTATE || subcode == ROTATERT)
         {
-         *total += rtx_cost (XEXP (x, 0), code, speed);
-         *total += rtx_cost (XEXP (XEXP (x, 1), 0), subcode, speed);
+         *total += rtx_cost (XEXP (x, 0), code, 0, speed);
+         *total += rtx_cost (XEXP (XEXP (x, 1), 0), subcode, 0, speed);
           return true;
         }
  
@@ -6730,23 +7209,23 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
        if (GET_CODE (XEXP (x, 0)) == MULT
           && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
         {
-         *total += rtx_cost (XEXP (XEXP (x, 0), 0), code, speed);
-         *total += rtx_cost (XEXP (x, 1), code, speed);
+         *total += rtx_cost (XEXP (XEXP (x, 0), 0), code, 0, speed);
+         *total += rtx_cost (XEXP (x, 1), code, 1, speed);
           return true;
         }
  
        if (subcode == MULT
           && power_of_two_operand (XEXP (XEXP (x, 1), 1), SImode))
         {
-         *total += rtx_cost (XEXP (x, 0), code, speed);
-         *total += rtx_cost (XEXP (XEXP (x, 1), 0), subcode, speed);
+         *total += rtx_cost (XEXP (x, 0), code, 0, speed);
+         *total += rtx_cost (XEXP (XEXP (x, 1), 0), subcode, 0, speed);
           return true;
         }
  
        if (GET_RTX_CLASS (GET_CODE (XEXP (x, 1))) == RTX_COMPARE
           || GET_RTX_CLASS (GET_CODE (XEXP (x, 1))) == RTX_COMM_COMPARE)
         {
-         *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, speed);
+         *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, 0, speed);
           if (GET_CODE (XEXP (XEXP (x, 1), 0)) == REG
               && REGNO (XEXP (XEXP (x, 1), 0)) != CC_REGNUM)
             *total += COSTS_N_INSNS (1);
@@ -6763,8 +7242,8 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
         {
           *total = COSTS_N_INSNS (1);
           *total += rtx_cost (XEXP (XEXP (x, 0), 0), GET_CODE (XEXP (x, 0)),
-                             speed);
-         *total += rtx_cost (XEXP (x, 1), code, speed);
+                             0, speed);
+         *total += rtx_cost (XEXP (x, 1), code, 1, speed);
           return true;
         }
  
@@ -6788,7 +7267,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
               if (GET_CODE (XEXP (x, 1)) == CONST_DOUBLE
                   && arm_const_double_rtx (XEXP (x, 1)))
                 {
-                 *total += rtx_cost (XEXP (x, 0), code, speed);
+                 *total += rtx_cost (XEXP (x, 0), code, 0, speed);
                   return true;
                 }
  
@@ -6802,7 +7281,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
        if (GET_RTX_CLASS (GET_CODE (XEXP (x, 0))) == RTX_COMPARE
           || GET_RTX_CLASS (GET_CODE (XEXP (x, 0))) == RTX_COMM_COMPARE)
         {
-         *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 1), code, speed);
+         *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 1), code, 1, speed);
           if (GET_CODE (XEXP (XEXP (x, 0), 0)) == REG
               && REGNO (XEXP (XEXP (x, 0), 0)) != CC_REGNUM)
             *total += COSTS_N_INSNS (1);
@@ -6829,7 +7308,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
           if (GET_CODE (XEXP (x, 1)) == CONST_INT
               && const_ok_for_op (INTVAL (XEXP (x, 1)), code))
             {
-             *total += rtx_cost (XEXP (x, 0), code, speed);
+             *total += rtx_cost (XEXP (x, 0), code, 0, speed);
               return true;
             }
  
@@ -6840,7 +7319,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
        if (GET_CODE (XEXP (x, 1)) == CONST_INT
           && const_ok_for_op (INTVAL (XEXP (x, 1)), code))
         {
-         *total += rtx_cost (XEXP (x, 0), code, speed);
+         *total += rtx_cost (XEXP (x, 0), code, 0, speed);
           return true;
         }
        subcode = GET_CODE (XEXP (x, 0));
@@ -6848,16 +7327,16 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
           || subcode == LSHIFTRT
           || subcode == ROTATE || subcode == ROTATERT)
         {
-         *total += rtx_cost (XEXP (x, 1), code, speed);
-         *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, speed);
+         *total += rtx_cost (XEXP (x, 1), code, 1, speed);
+         *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, 0, speed);
           return true;
         }
  
        if (subcode == MULT
           && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
         {
-         *total += rtx_cost (XEXP (x, 1), code, speed);
-         *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, speed);
+         *total += rtx_cost (XEXP (x, 1), code, 1, speed);
+         *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, 0, speed);
           return true;
         }
  
@@ -6883,7 +7362,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
           && (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
               || GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND))
         {
-         *total = rtx_cost (XEXP (XEXP (x, 0), 0), LSHIFTRT, speed);
+         *total = rtx_cost (XEXP (XEXP (x, 0), 0), LSHIFTRT, 0, speed);
           return true;
         }
        *total = COSTS_N_INSNS (2); /* Plus the cost of the MULT */
@@ -6915,11 +7394,11 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
               || (subcode == MULT
                   && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode)))
             {
-             *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, speed);
+             *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, 0, speed);
               /* Register shifts cost an extra cycle.  */
               if (GET_CODE (XEXP (XEXP (x, 0), 1)) != CONST_INT)
                 *total += COSTS_N_INSNS (1) + rtx_cost (XEXP (XEXP (x, 0), 1),
-                                                       subcode, speed);
+                                                       subcode, 1, speed);
               return true;
             }
         }
@@ -6940,14 +7419,14 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
             && GET_CODE (XEXP (operand, 0)) == REG
             && REGNO (XEXP (operand, 0)) == CC_REGNUM))
         *total += COSTS_N_INSNS (1);
-      *total += (rtx_cost (XEXP (x, 1), code, speed)
-                + rtx_cost (XEXP (x, 2), code, speed));
+      *total += (rtx_cost (XEXP (x, 1), code, 1, speed)
+                + rtx_cost (XEXP (x, 2), code, 2, speed));
        return true;
  
      case NE:
        if (mode == SImode && XEXP (x, 1) == const0_rtx)
         {
-         *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, speed);
+         *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, 0, speed);
           return true;
         }
        goto scc_insn;
@@ -6956,7 +7435,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
        if ((GET_CODE (XEXP (x, 0)) != REG || REGNO (XEXP (x, 0)) != CC_REGNUM)
           && mode == SImode && XEXP (x, 1) == const0_rtx)
         {
-         *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, speed);
+         *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, 0, speed);
           return true;
         }
        goto scc_insn;
@@ -6965,7 +7444,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
        if ((GET_CODE (XEXP (x, 0)) != REG || REGNO (XEXP (x, 0)) != CC_REGNUM)
           && mode == SImode && XEXP (x, 1) == const0_rtx)
         {
-         *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, speed);
+         *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, 0, speed);
           return true;
         }
        goto scc_insn;
@@ -7006,7 +7485,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
        if (GET_CODE (XEXP (x, 1)) == CONST_INT
           && const_ok_for_op (INTVAL (XEXP (x, 1)), code))
         {
-         *total += rtx_cost (XEXP (x, 0), code, speed);
+         *total += rtx_cost (XEXP (x, 0), code, 0, speed);
           return true;
         }
  
@@ -7015,29 +7494,29 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
           || subcode == LSHIFTRT
           || subcode == ROTATE || subcode == ROTATERT)
         {
-         *total += rtx_cost (XEXP (x, 1), code, speed);
-         *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, speed);
+         *total += rtx_cost (XEXP (x, 1), code, 1, speed);
+         *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, 0, speed);
           return true;
         }
  
        if (subcode == MULT
           && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
         {
-         *total += rtx_cost (XEXP (x, 1), code, speed);
-         *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, speed);
+         *total += rtx_cost (XEXP (x, 1), code, 1, speed);
+         *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, 0, speed);
           return true;
         }
-      
+
        return false;
  
      case UMIN:
      case UMAX:
      case SMIN:
      case SMAX:
-      *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, speed);
+      *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, 0, speed);
        if (GET_CODE (XEXP (x, 1)) != CONST_INT
           || !const_ok_for_arm (INTVAL (XEXP (x, 1))))
-       *total += rtx_cost (XEXP (x, 1), code, speed);
+       *total += rtx_cost (XEXP (x, 1), code, 1, speed);
        return true;
  
      case ABS:
@@ -7114,7 +7593,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
  
      case ZERO_EXTRACT:
      case SIGN_EXTRACT:
-      *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, speed);
+      *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, 0, speed);
        return true;
  
      case CONST_INT:
@@ -7139,7 +7618,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
  
      case LO_SUM:
        *total = COSTS_N_INSNS (1);
-      *total += rtx_cost (XEXP (x, 0), code, speed);
+      *total += rtx_cost (XEXP (x, 0), code, 0, speed);
        return true;
  
      case CONST_DOUBLE:
@@ -7150,6 +7629,18 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
         *total = COSTS_N_INSNS (4);
        return true;
  
+    case SET:
+      return false;
+      
+    case UNSPEC:
+      /* We cost this as high as our memory costs to allow this to
+        be hoisted from loops.  */
+      if (XINT (x, 1) == UNSPEC_PIC_UNIFIED)
+       {
+         *total = COSTS_N_INSNS (2 + ARM_NUM_REGS (mode));
+       }
+      return true;
+
      default:
        *total = COSTS_N_INSNS (4);
        return false;
@@ -7325,7 +7816,7 @@ arm_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
      case ROTATE:
        if (mode == SImode && GET_CODE (XEXP (x, 1)) == REG)
         {
-         *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, false);
+         *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, 0, false);
           return true;
         }
        /* Fall through */
@@ -7335,15 +7826,15 @@ arm_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
      case ASHIFTRT:
        if (mode == DImode && GET_CODE (XEXP (x, 1)) == CONST_INT)
         {
-         *total = COSTS_N_INSNS (3) + rtx_cost (XEXP (x, 0), code, false);
+         *total = COSTS_N_INSNS (3) + rtx_cost (XEXP (x, 0), code, 0, false);
           return true;
         }
        else if (mode == SImode)
         {
-         *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, false);
+         *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, 0, false);
           /* Slightly disparage register shifts, but not by much.  */
           if (GET_CODE (XEXP (x, 1)) != CONST_INT)
-           *total += 1 + rtx_cost (XEXP (x, 1), code, false);
+           *total += 1 + rtx_cost (XEXP (x, 1), code, 1, false);
           return true;
         }
  
@@ -7395,8 +7886,8 @@ arm_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
           && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))
         {
           *total = COSTS_N_INSNS (TARGET_THUMB2 ? 2 : 1);
-         *total += rtx_cost (XEXP (XEXP (x, 0), 0), code, false);
-         *total += rtx_cost (XEXP (x, 1), code, false);
+         *total += rtx_cost (XEXP (XEXP (x, 0), 0), code, 0, false);
+         *total += rtx_cost (XEXP (x, 1), code, 1, false);
           return true;
         }
  
@@ -7497,6 +7988,9 @@ arm_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
        *total = COSTS_N_INSNS (1) + 1;
        return true;
  
+    case SET:
+      return false;
+
      default:
        if (mode != VOIDmode)
         *total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
@@ -7508,8 +8002,8 @@ arm_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
  
  /* RTX costs when optimizing for size.  */
  static bool
-arm_rtx_costs (rtx x, int code, int outer_code, int *total,
-              bool speed)
+arm_rtx_costs (rtx x, int code, int outer_code, int opno ATTRIBUTE_UNUSED,
+              int *total, bool speed)
  {
    if (!speed)
      return arm_size_rtx_costs (x, (enum rtx_code) code,
@@ -7562,7 +8056,7 @@ arm_slowmul_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
             }
  
           *total = COSTS_N_INSNS (cost);
-         *total += rtx_cost (XEXP (x, 0), code, speed);
+         *total += rtx_cost (XEXP (x, 0), code, 0, speed);
           return true;
         }
  
@@ -7931,9 +8425,9 @@ cortex_a9_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost)
           {
             if (GET_CODE (PATTERN (insn)) == SET)
               {
-               if (GET_MODE_CLASS 
+               if (GET_MODE_CLASS
                     (GET_MODE (SET_DEST (PATTERN (insn)))) == MODE_FLOAT
-                 || GET_MODE_CLASS 
+                 || GET_MODE_CLASS
                     (GET_MODE (SET_SRC (PATTERN (insn)))) == MODE_FLOAT)
                   {
                     enum attr_type attr_type_insn = get_attr_type (insn);
@@ -7953,7 +8447,7 @@ cortex_a9_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost)
                       {
                         /* FMACS is a special case where the dependant
                            instruction can be issued 3 cycles before
-                          the normal latency in case of an output 
+                          the normal latency in case of an output
                            dependency.  */
                         if ((attr_type_insn == TYPE_FMACS
                              || attr_type_insn == TYPE_FMACD)
@@ -8017,12 +8511,69 @@ fa726te_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost)
    return true;
  }
  
+/* Implement TARGET_REGISTER_MOVE_COST.
+
+   Moves between FPA_REGS and GENERAL_REGS are two memory insns.  Moves
+   between VFP_REGS and GENERAL_REGS are a single insn, but one typically
+   more expensive than a single memory access; its cost is kept below two
+   memory accesses so that floating point to integer conversion does not
+   go through memory.  Without TARGET_32BIT, only HI_REGS moves cost extra.  */
+
+int
+arm_register_move_cost (enum machine_mode mode ATTRIBUTE_UNUSED,
+                       reg_class_t from, reg_class_t to)
+{
+  if (TARGET_32BIT)
+    {
+      if ((from == FPA_REGS && to != FPA_REGS)
+         || (from != FPA_REGS && to == FPA_REGS))
+       return 20;
+      else if ((IS_VFP_CLASS (from) && !IS_VFP_CLASS (to))
+              || (!IS_VFP_CLASS (from) && IS_VFP_CLASS (to)))
+       return 15;
+      else if ((from == IWMMXT_REGS && to != IWMMXT_REGS)
+              || (from != IWMMXT_REGS && to == IWMMXT_REGS))
+       return 4;
+      else if (from == IWMMXT_GR_REGS || to == IWMMXT_GR_REGS)
+       return 20;
+      else if ((from == CIRRUS_REGS && to != CIRRUS_REGS)
+              || (from != CIRRUS_REGS && to == CIRRUS_REGS))
+       return 20;
+      else
+       return 2;
+    }
+  else
+    {
+      if (from == HI_REGS || to == HI_REGS)
+       return 4;
+      else
+       return 2;
+    }
+}
+
+/* Implement TARGET_MEMORY_MOVE_COST.  Loads and stores cost the same.  */
+
+int
+arm_memory_move_cost (enum machine_mode mode, reg_class_t rclass,
+                     bool in ATTRIBUTE_UNUSED)
+{
+  if (TARGET_32BIT)
+    return 10;
+  else
+    {
+      if (GET_MODE_SIZE (mode) < 4)
+       return 8;
+      else  /* Twice the mode size, doubled again outside LO_REGS.  */
+       return ((2 * GET_MODE_SIZE (mode)) * (rclass == LO_REGS ? 1 : 2));
+    }
+}
+
  /* This function implements the target macro TARGET_SCHED_ADJUST_COST.
     It corrects the value of COST based on the relationship between
     INSN and DEP through the dependence LINK.  It returns the new
     value. There is a per-core adjust_cost hook to adjust scheduler costs
-   and the per-core hook can choose to completely override the generic 
-   adjust_cost function. Only put bits of code into arm_adjust_cost that 
+   and the per-core hook can choose to completely override the generic
+   adjust_cost function. Only put bits of code into arm_adjust_cost that
     are common across all cores.  */
  static int
  arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost)
@@ -8066,7 +8617,7 @@ arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost)
          constant pool are cached, and that others will miss.  This is a
          hack.  */
  
-      if ((GET_CODE (src_mem) == SYMBOL_REF 
+      if ((GET_CODE (src_mem) == SYMBOL_REF
            && CONSTANT_POOL_ADDRESS_P (src_mem))
           || reg_mentioned_p (stack_pointer_rtx, src_mem)
           || reg_mentioned_p (frame_pointer_rtx, src_mem)
@@ -8549,6 +9100,66 @@ neon_immediate_valid_for_logic (rtx op, enum machine_mode mode, int inverse,
    return 1;
  }
  
+/* Return TRUE if rtx OP is legal for use in a VSHR or VSHL instruction.  If
+   the immediate is valid, write a constant suitable for using as an operand
+   to VSHR/VSHL to *MODCONST and the corresponding element width to
+   *ELEMENTWIDTH (either may be NULL).  ISLEFTSHIFT selects left or right
+   shift, because the two accept different immediate ranges.  */
+
+int
+neon_immediate_valid_for_shift (rtx op, enum machine_mode mode,
+                               rtx *modconst, int *elementwidth,
+                               bool isleftshift)
+{
+  unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
+  unsigned int n_elts = CONST_VECTOR_NUNITS (op), i;
+  unsigned HOST_WIDE_INT last_elt = 0;
+  unsigned HOST_WIDE_INT maxshift;
+
+  /* Check that every element is the same CONST_INT shift count.  */
+  for (i = 0; i < n_elts; i++)
+    {
+      rtx el = CONST_VECTOR_ELT (op, i);
+      unsigned HOST_WIDE_INT elpart;
+
+      if (GET_CODE (el) == CONST_INT)
+        elpart = INTVAL (el);
+      else if (GET_CODE (el) == CONST_DOUBLE)
+        return 0;
+      else
+        gcc_unreachable ();
+
+      if (i != 0 && elpart != last_elt)
+        return 0;
+
+      last_elt = elpart;
+    }
+
+  /* The element width in bits bounds the shift count.  */
+  maxshift = innersize * 8;
+
+  if (isleftshift)
+    {
+      /* Left shift immediate value can be from 0 to <size>-1.  */
+      if (last_elt >= maxshift)
+        return 0;
+    }
+  else
+    {
+      /* Right shift immediate value can be from 1 to <size>.  */
+      if (last_elt == 0 || last_elt > maxshift)
+       return 0;
+    }
+
+  if (elementwidth)
+    *elementwidth = innersize * 8;
+
+  if (modconst)
+    *modconst = CONST_VECTOR_ELT (op, 0);
+
+  return 1;
+}
+
  /* Return a string suitable for output of Neon immediate logic operation
     MNEM.  */
  
@@ -8571,6 +9182,28 @@ neon_output_logic_immediate (const char *mnem, rtx *op2, enum machine_mode mode,
    return templ;
  }
  
+/* Return a string suitable for output of Neon immediate shift operation
+   (VSHR or VSHL) MNEM.  The string lives in a static buffer.  */
+
+char *
+neon_output_shift_immediate (const char *mnem, char sign, rtx *op2,
+                            enum machine_mode mode, int quad,
+                            bool isleftshift)
+{
+  int width, is_valid;
+  static char templ[40];
+  /* On success the call stores element 0 of the vector back into *OP2.  */
+  is_valid = neon_immediate_valid_for_shift (*op2, mode, op2, &width, isleftshift);
+  gcc_assert (is_valid != 0);
+
+  if (quad)
+    sprintf (templ, "%s.%c%d\t%%q0, %%q1, %%2", mnem, sign, width);
+  else
+    sprintf (templ, "%s.%c%d\t%%P0, %%P1, %%2", mnem, sign, width);
+
+  return templ;
+}
+
  /* Output a sequence of pairwise operations to implement a reduction.
     NOTE: We do "too much work" here, because pairwise operations work on two
     registers-worth of operands in one go. Unfortunately we can't exploit those
@@ -9099,8 +9732,9 @@ arm_return_in_msb (const_tree valtype)
  {
    return (TARGET_AAPCS_BASED
            && BYTES_BIG_ENDIAN
-          && (AGGREGATE_TYPE_P (valtype)
-              || TREE_CODE (valtype) == COMPLEX_TYPE));
+         && (AGGREGATE_TYPE_P (valtype)
+             || TREE_CODE (valtype) == COMPLEX_TYPE
+             || FIXED_POINT_TYPE_P (valtype)));
  }
  
  /* Returns TRUE if INSN is an "LDR REG, ADDR" instruction.
@@ -9366,7 +10000,8 @@ static int
  arm_note_pic_base (rtx *x, void *date ATTRIBUTE_UNUSED)
  {
    if (GET_CODE (*x) == UNSPEC
-      && XINT (*x, 1) == UNSPEC_PIC_BASE)
+      && (XINT (*x, 1) == UNSPEC_PIC_BASE
+         || XINT (*x, 1) == UNSPEC_PIC_UNIFIED))
      return 1;
    return 0;
  }
@@ -9374,6 +10009,11 @@ arm_note_pic_base (rtx *x, void *date ATTRIBUTE_UNUSED)
  static bool
  arm_cannot_copy_insn_p (rtx insn)
  {
+  /* The tls call insn cannot be copied, as it is paired with a data
+     word.  */
+  if (recog_memoized (insn) == CODE_FOR_tlscall)
+    return true;
+
    return for_each_rtx (&PATTERN (insn), arm_note_pic_base, NULL);
  }
  
@@ -9752,6 +10392,9 @@ store_multiple_sequence (rtx *operands, int nops, int nops_total,
    rtx base_reg_rtx = NULL;
    int i, stm_case;
  
+  /* Write back of base register is currently only supported for Thumb 1.  */
+  int base_writeback = TARGET_THUMB1;
+
    /* Can only handle up to MAX_LDM_STM_OPS insns at present, though could be
       easily extended if required.  */
    gcc_assert (nops >= 2 && nops <= MAX_LDM_STM_OPS);
@@ -9809,7 +10452,9 @@ store_multiple_sequence (rtx *operands, int nops, int nops_total,
           /* If it isn't an integer register, then we can't do this.  */
           if (unsorted_regs[i] < 0
               || (TARGET_THUMB1 && unsorted_regs[i] > LAST_LO_REGNUM)
-             || (TARGET_THUMB2 && unsorted_regs[i] == base_reg)
+             /* The effects are unpredictable if the base register is
+                both updated and stored.  */
+             || (base_writeback && unsorted_regs[i] == base_reg)
               || (TARGET_THUMB2 && unsorted_regs[i] == SP_REGNUM)
               || unsorted_regs[i] > 14)
             return 0;
@@ -10271,108 +10916,442 @@ gen_const_stm_seq (rtx *operands, int nops)
    return true;
  }
  
-int
-arm_gen_movmemqi (rtx *operands)
-{
-  HOST_WIDE_INT in_words_to_go, out_words_to_go, last_bytes;
-  HOST_WIDE_INT srcoffset, dstoffset;
-  int i;
-  rtx src, dst, srcbase, dstbase;
-  rtx part_bytes_reg = NULL;
-  rtx mem;
+/* Copy a block of memory using plain ldr/str/ldrh/strh instructions, to permit
+   unaligned copies on processors which support unaligned semantics for those
+   instructions.  INTERLEAVE_FACTOR can be used to attempt to hide load latency
+   (using more registers) by doing e.g. load/load/store/store for a factor of 2.
+   An interleave factor of 1 (the minimum) will perform no interleaving. 
+   Load/store multiple are used for aligned addresses where possible.  */
  
-  if (GET_CODE (operands[2]) != CONST_INT
-      || GET_CODE (operands[3]) != CONST_INT
-      || INTVAL (operands[2]) > 64
-      || INTVAL (operands[3]) & 3)
-    return 0;
+static void
+arm_block_move_unaligned_straight (rtx dstbase, rtx srcbase,
+                                  HOST_WIDE_INT length,
+                                  unsigned int interleave_factor)
+{
+  rtx *regs = XALLOCAVEC (rtx, interleave_factor);
+  int *regnos = XALLOCAVEC (int, interleave_factor);
+  HOST_WIDE_INT block_size_bytes = interleave_factor * UNITS_PER_WORD;
+  HOST_WIDE_INT i, j;
+  HOST_WIDE_INT remaining = length, words;
+  rtx halfword_tmp = NULL, byte_tmp = NULL;
+  rtx dst, src;
+  bool src_aligned = MEM_ALIGN (srcbase) >= BITS_PER_WORD;
+  bool dst_aligned = MEM_ALIGN (dstbase) >= BITS_PER_WORD;
+  HOST_WIDE_INT srcoffset, dstoffset;
+  HOST_WIDE_INT src_autoinc, dst_autoinc;
+  rtx mem, addr;
+  
+  gcc_assert (1 <= interleave_factor && interleave_factor <= 4);
+  
+  /* Use hard registers if we have aligned source or destination so we can use
+     load/store multiple with contiguous registers.  */
+  if (dst_aligned || src_aligned)
+    for (i = 0; i < interleave_factor; i++)
+      regs[i] = gen_rtx_REG (SImode, i);
+  else
+    for (i = 0; i < interleave_factor; i++)
+      regs[i] = gen_reg_rtx (SImode);
  
-  dstbase = operands[0];
-  srcbase = operands[1];
+  dst = copy_addr_to_reg (XEXP (dstbase, 0));
+  src = copy_addr_to_reg (XEXP (srcbase, 0));
  
-  dst = copy_to_mode_reg (SImode, XEXP (dstbase, 0));
-  src = copy_to_mode_reg (SImode, XEXP (srcbase, 0));
+  srcoffset = dstoffset = 0;
+  
+  /* Calls to arm_gen_load_multiple and arm_gen_store_multiple update SRC/DST.
+     For copying the last bytes we want to subtract this offset again.  */
+  src_autoinc = dst_autoinc = 0;
  
-  in_words_to_go = ARM_NUM_INTS (INTVAL (operands[2]));
-  out_words_to_go = INTVAL (operands[2]) / 4;
-  last_bytes = INTVAL (operands[2]) & 3;
-  dstoffset = srcoffset = 0;
+  for (i = 0; i < interleave_factor; i++)
+    regnos[i] = i;
  
-  if (out_words_to_go != in_words_to_go && ((in_words_to_go - 1) & 3) != 0)
-    part_bytes_reg = gen_rtx_REG (SImode, (in_words_to_go - 1) & 3);
+  /* Copy BLOCK_SIZE_BYTES chunks.  */
  
-  for (i = 0; in_words_to_go >= 2; i+=4)
+  for (i = 0; i + block_size_bytes <= length; i += block_size_bytes)
      {
-      if (in_words_to_go > 4)
-       emit_insn (arm_gen_load_multiple (arm_regs_in_sequence, 4, src,
-                                         TRUE, srcbase, &srcoffset));
+      /* Load words.  */
+      if (src_aligned && interleave_factor > 1)
+       {
+         emit_insn (arm_gen_load_multiple (regnos, interleave_factor, src,
+                                           TRUE, srcbase, &srcoffset));
+         src_autoinc += UNITS_PER_WORD * interleave_factor;
+       }
        else
-       emit_insn (arm_gen_load_multiple (arm_regs_in_sequence, in_words_to_go,
-                                         src, FALSE, srcbase,
-                                         &srcoffset));
+       {
+         for (j = 0; j < interleave_factor; j++)
+           {
+             addr = plus_constant (src, srcoffset + j * UNITS_PER_WORD
+                                        - src_autoinc);
+             mem = adjust_automodify_address (srcbase, SImode, addr,
+                                              srcoffset + j * UNITS_PER_WORD);
+             emit_insn (gen_unaligned_loadsi (regs[j], mem));
+           }
+         srcoffset += block_size_bytes;
+       }
  
-      if (out_words_to_go)
+      /* Store words.  */
+      if (dst_aligned && interleave_factor > 1)
         {
-         if (out_words_to_go > 4)
-           emit_insn (arm_gen_store_multiple (arm_regs_in_sequence, 4, dst,
-                                              TRUE, dstbase, &dstoffset));
-         else if (out_words_to_go != 1)
-           emit_insn (arm_gen_store_multiple (arm_regs_in_sequence,
-                                              out_words_to_go, dst,
-                                              (last_bytes == 0
-                                               ? FALSE : TRUE),
-                                              dstbase, &dstoffset));
-         else
+         emit_insn (arm_gen_store_multiple (regnos, interleave_factor, dst,
+                                            TRUE, dstbase, &dstoffset));
+         dst_autoinc += UNITS_PER_WORD * interleave_factor;
+       }
+      else
+       {
+         for (j = 0; j < interleave_factor; j++)
             {
-             mem = adjust_automodify_address (dstbase, SImode, dst, dstoffset);
-             emit_move_insn (mem, gen_rtx_REG (SImode, 0));
-             if (last_bytes != 0)
-               {
-                 emit_insn (gen_addsi3 (dst, dst, GEN_INT (4)));
-                 dstoffset += 4;
-               }
+             addr = plus_constant (dst, dstoffset + j * UNITS_PER_WORD
+                                        - dst_autoinc);
+             mem = adjust_automodify_address (dstbase, SImode, addr,
+                                              dstoffset + j * UNITS_PER_WORD);
+             emit_insn (gen_unaligned_storesi (mem, regs[j]));
             }
+         dstoffset += block_size_bytes;
         }
  
-      in_words_to_go -= in_words_to_go < 4 ? in_words_to_go : 4;
-      out_words_to_go -= out_words_to_go < 4 ? out_words_to_go : 4;
+      remaining -= block_size_bytes;
      }
+  
+  /* Copy any whole words left (note these aren't interleaved with any
+     subsequent halfword/byte load/stores in the interests of simplicity).  */
+  
+  words = remaining / UNITS_PER_WORD;
  
-  /* OUT_WORDS_TO_GO will be zero here if there are byte stores to do.  */
-  if (out_words_to_go)
+  gcc_assert (words < interleave_factor);
+  
+  if (src_aligned && words > 1)
      {
-      rtx sreg;
-
-      mem = adjust_automodify_address (srcbase, SImode, src, srcoffset);
-      sreg = copy_to_reg (mem);
-
-      mem = adjust_automodify_address (dstbase, SImode, dst, dstoffset);
-      emit_move_insn (mem, sreg);
-      in_words_to_go--;
+      emit_insn (arm_gen_load_multiple (regnos, words, src, TRUE, srcbase,
+                                       &srcoffset));
+      src_autoinc += UNITS_PER_WORD * words;
+    }
+  else
+    {
+      for (j = 0; j < words; j++)
+       {
+         addr = plus_constant (src,
+                               srcoffset + j * UNITS_PER_WORD - src_autoinc);
+         mem = adjust_automodify_address (srcbase, SImode, addr,
+                                          srcoffset + j * UNITS_PER_WORD);
+         emit_insn (gen_unaligned_loadsi (regs[j], mem));
+       }
+      srcoffset += words * UNITS_PER_WORD;
+    }
  
-      gcc_assert (!in_words_to_go);    /* Sanity check */
+  if (dst_aligned && words > 1)
+    {
+      emit_insn (arm_gen_store_multiple (regnos, words, dst, TRUE, dstbase,
+                                        &dstoffset));
+      dst_autoinc += words * UNITS_PER_WORD;
+    }
+  else
+    {
+      for (j = 0; j < words; j++)
+       {
+         addr = plus_constant (dst,
+                               dstoffset + j * UNITS_PER_WORD - dst_autoinc);
+         mem = adjust_automodify_address (dstbase, SImode, addr,
+                                          dstoffset + j * UNITS_PER_WORD);
+         emit_insn (gen_unaligned_storesi (mem, regs[j]));
+       }
+      dstoffset += words * UNITS_PER_WORD;
      }
  
-  if (in_words_to_go)
+  remaining -= words * UNITS_PER_WORD;
+  
+  gcc_assert (remaining < 4);
+  
+  /* Copy a halfword if necessary.  */
+  
+  if (remaining >= 2)
      {
-      gcc_assert (in_words_to_go > 0);
+      halfword_tmp = gen_reg_rtx (SImode);
  
-      mem = adjust_automodify_address (srcbase, SImode, src, srcoffset);
-      part_bytes_reg = copy_to_mode_reg (SImode, mem);
-    }
+      addr = plus_constant (src, srcoffset - src_autoinc);
+      mem = adjust_automodify_address (srcbase, HImode, addr, srcoffset);
+      emit_insn (gen_unaligned_loadhiu (halfword_tmp, mem));
  
-  gcc_assert (!last_bytes || part_bytes_reg);
+      /* Either write out immediately, or delay until we've loaded the last
+        byte, depending on interleave factor.  */
+      if (interleave_factor == 1)
+       {
+         addr = plus_constant (dst, dstoffset - dst_autoinc);
+         mem = adjust_automodify_address (dstbase, HImode, addr, dstoffset);
+         emit_insn (gen_unaligned_storehi (mem,
+                      gen_lowpart (HImode, halfword_tmp)));
+         halfword_tmp = NULL;
+         dstoffset += 2;
+       }
  
-  if (BYTES_BIG_ENDIAN && last_bytes)
+      remaining -= 2;
+      srcoffset += 2;
+    }
+  
+  gcc_assert (remaining < 2);
+  
+  /* Copy last byte.  */
+  
+  if ((remaining & 1) != 0)
      {
-      rtx tmp = gen_reg_rtx (SImode);
+      byte_tmp = gen_reg_rtx (SImode);
  
-      /* The bytes we want are in the top end of the word.  */
-      emit_insn (gen_lshrsi3 (tmp, part_bytes_reg,
-                             GEN_INT (8 * (4 - last_bytes))));
-      part_bytes_reg = tmp;
+      addr = plus_constant (src, srcoffset - src_autoinc);
+      mem = adjust_automodify_address (srcbase, QImode, addr, srcoffset);
+      emit_move_insn (gen_lowpart (QImode, byte_tmp), mem);
  
-      while (last_bytes)
+      if (interleave_factor == 1)
+       {
+         addr = plus_constant (dst, dstoffset - dst_autoinc);
+         mem = adjust_automodify_address (dstbase, QImode, addr, dstoffset);
+         emit_move_insn (mem, gen_lowpart (QImode, byte_tmp));
+         byte_tmp = NULL;
+         dstoffset++;
+       }
+
+      remaining--;
+      srcoffset++;
+    }
+  
+  /* Store last halfword if we haven't done so already.  */
+  
+  if (halfword_tmp)
+    {
+      addr = plus_constant (dst, dstoffset - dst_autoinc);
+      mem = adjust_automodify_address (dstbase, HImode, addr, dstoffset);
+      emit_insn (gen_unaligned_storehi (mem,
+                  gen_lowpart (HImode, halfword_tmp)));
+      dstoffset += 2;
+    }
+
+  /* Likewise for last byte.  */
+
+  if (byte_tmp)
+    {
+      addr = plus_constant (dst, dstoffset - dst_autoinc);
+      mem = adjust_automodify_address (dstbase, QImode, addr, dstoffset);
+      emit_move_insn (mem, gen_lowpart (QImode, byte_tmp));
+      dstoffset++;
+    }
+  
+  gcc_assert (remaining == 0 && srcoffset == dstoffset);
+}
+
+/* From mips_adjust_block_mem:
+
+   Helper function for doing a loop-based block operation on memory
+   reference MEM.  Each iteration of the loop will operate on LENGTH
+   bytes of MEM.
+
+   Create a new base register for use within the loop and point it to
+   the start of MEM.  Create a new memory reference that uses this
+   register.  Store them in *LOOP_REG and *LOOP_MEM respectively.
+
+   *LOOP_MEM is a BLKmode reference whose recorded alignment is the
+   smaller of MEM's own alignment and LENGTH bytes (expressed in bits).  */
+
+static void
+arm_adjust_block_mem (rtx mem, HOST_WIDE_INT length, rtx *loop_reg,
+                     rtx *loop_mem)
+{
+  *loop_reg = copy_addr_to_reg (XEXP (mem, 0));
+  
+  /* Although the new mem does not refer to a known location,
+     it does keep up to LENGTH bytes of alignment.  MEM_ALIGN is compared
+     against LENGTH * BITS_PER_UNIT, so both sides of the MIN are in
+     bits.  */
+  *loop_mem = change_address (mem, BLKmode, *loop_reg);
+  set_mem_align (*loop_mem, MIN (MEM_ALIGN (mem), length * BITS_PER_UNIT));
+}
+
+/* From mips_block_move_loop:
+
+   Move LENGTH bytes from SRC to DEST using a loop that moves BYTES_PER_ITER
+   bytes at a time.  LENGTH must be at least BYTES_PER_ITER.  Assume that
+   the memory regions do not overlap.
+
+   INTERLEAVE_FACTOR is passed unchanged to
+   arm_block_move_unaligned_straight, which emits both the loop body and
+   the copy of the LENGTH % BYTES_PER_ITER bytes left over after the loop.
+   The loop test is emitted after the body, so the body runs at least
+   once.  */
+
+static void
+arm_block_move_unaligned_loop (rtx dest, rtx src, HOST_WIDE_INT length,
+                              unsigned int interleave_factor,
+                              HOST_WIDE_INT bytes_per_iter)
+{
+  rtx label, src_reg, dest_reg, final_src, test;
+  HOST_WIDE_INT leftover;
+  
+  /* Split LENGTH into a multiple of BYTES_PER_ITER (handled by the loop)
+     and a remainder LEFTOVER (handled after the loop).  */
+  leftover = length % bytes_per_iter;
+  length -= leftover;
+  
+  /* Create registers and memory references for use within the loop.
+     From here on SRC and DEST refer to memory addressed through SRC_REG
+     and DEST_REG.  */
+  arm_adjust_block_mem (src, bytes_per_iter, &src_reg, &src);
+  arm_adjust_block_mem (dest, bytes_per_iter, &dest_reg, &dest);
+  
+  /* Calculate the value that SRC_REG should have after the last iteration of
+     the loop.  LENGTH no longer includes LEFTOVER at this point.  */
+  final_src = expand_simple_binop (Pmode, PLUS, src_reg, GEN_INT (length),
+                                  0, 0, OPTAB_WIDEN);
+
+  /* Emit the start of the loop.  */
+  label = gen_label_rtx ();
+  emit_label (label);
+  
+  /* Emit the loop body.  */
+  arm_block_move_unaligned_straight (dest, src, bytes_per_iter,
+                                    interleave_factor);
+
+  /* Move on to the next block.  */
+  emit_move_insn (src_reg, plus_constant (src_reg, bytes_per_iter));
+  emit_move_insn (dest_reg, plus_constant (dest_reg, bytes_per_iter));
+  
+  /* Emit the loop condition: branch back to LABEL while SRC_REG differs
+     from FINAL_SRC.  */
+  test = gen_rtx_NE (VOIDmode, src_reg, final_src);
+  emit_jump_insn (gen_cbranchsi4 (test, src_reg, final_src, label));
+  
+  /* Mop up any left-over bytes.  SRC and DEST are addressed through
+     SRC_REG and DEST_REG, which the loop has already advanced past the
+     bytes copied so far.  */
+  if (leftover)
+    arm_block_move_unaligned_straight (dest, src, leftover, interleave_factor);
+}
+
+/* Emit a block move when either the source or destination is unaligned (not
+   aligned to a four-byte boundary).  This may need further tuning depending on
+   core type, optimize_size setting, etc.
+
+   OPERANDS[0] is the destination memory, OPERANDS[1] the source memory and
+   OPERANDS[2] the length, which is read with INTVAL.  Always returns 1.
+
+   When optimizing for size, lengths above 12 bytes use a loop and shorter
+   ones straight-line code, with an interleave factor of 2 and 8 bytes per
+   iteration if either operand is word-aligned, else 1 and 4.  Otherwise
+   lengths above 32 bytes use a loop of 16 bytes per iteration and shorter
+   ones straight-line code, both with an interleave factor of 4.  */
+
+static int
+arm_movmemqi_unaligned (rtx *operands)
+{
+  HOST_WIDE_INT length = INTVAL (operands[2]);
+  
+  if (optimize_size)
+    {
+      bool src_aligned = MEM_ALIGN (operands[1]) >= BITS_PER_WORD;
+      bool dst_aligned = MEM_ALIGN (operands[0]) >= BITS_PER_WORD;
+      /* Inlined memcpy using ldr/str/ldrh/strh can be quite big: try to limit
+        size of code if optimizing for size.  We'll use ldm/stm if src_aligned
+        or dst_aligned though: allow more interleaving in those cases since the
+        resulting code can be smaller.  */
+      unsigned int interleave_factor = (src_aligned || dst_aligned) ? 2 : 1;
+      HOST_WIDE_INT bytes_per_iter = (src_aligned || dst_aligned) ? 8 : 4;
+      
+      if (length > 12)
+       arm_block_move_unaligned_loop (operands[0], operands[1], length,
+                                      interleave_factor, bytes_per_iter);
+      else
+       arm_block_move_unaligned_straight (operands[0], operands[1], length,
+                                          interleave_factor);
+    }
+  else
+    {
+      /* Note that the loop created by arm_block_move_unaligned_loop may be
+        subject to loop unrolling, which makes tuning this condition a little
+        redundant.  */
+      if (length > 32)
+       arm_block_move_unaligned_loop (operands[0], operands[1], length, 4, 16);
+      else
+       arm_block_move_unaligned_straight (operands[0], operands[1], length, 4);
+    }
+  
+  return 1;
+}
+
+int
+arm_gen_movmemqi (rtx *operands)
+{
+  HOST_WIDE_INT in_words_to_go, out_words_to_go, last_bytes;
+  HOST_WIDE_INT srcoffset, dstoffset;
+  int i;
+  rtx src, dst, srcbase, dstbase;
+  rtx part_bytes_reg = NULL;
+  rtx mem;
+
+  if (GET_CODE (operands[2]) != CONST_INT
+      || GET_CODE (operands[3]) != CONST_INT
+      || INTVAL (operands[2]) > 64)
+    return 0;
+
+  if (unaligned_access && (INTVAL (operands[3]) & 3) != 0)
+    return arm_movmemqi_unaligned (operands);
+
+  if (INTVAL (operands[3]) & 3)
+    return 0;
+
+  dstbase = operands[0];
+  srcbase = operands[1];
+
+  dst = copy_to_mode_reg (SImode, XEXP (dstbase, 0));
+  src = copy_to_mode_reg (SImode, XEXP (srcbase, 0));
+
+  in_words_to_go = ARM_NUM_INTS (INTVAL (operands[2]));
+  out_words_to_go = INTVAL (operands[2]) / 4;
+  last_bytes = INTVAL (operands[2]) & 3;
+  dstoffset = srcoffset = 0;
+
+  if (out_words_to_go != in_words_to_go && ((in_words_to_go - 1) & 3) != 0)
+    part_bytes_reg = gen_rtx_REG (SImode, (in_words_to_go - 1) & 3);
+
+  for (i = 0; in_words_to_go >= 2; i+=4)
+    {
+      if (in_words_to_go > 4)
+       emit_insn (arm_gen_load_multiple (arm_regs_in_sequence, 4, src,
+                                         TRUE, srcbase, &srcoffset));
+      else
+       emit_insn (arm_gen_load_multiple (arm_regs_in_sequence, in_words_to_go,
+                                         src, FALSE, srcbase,
+                                         &srcoffset));
+
+      if (out_words_to_go)
+       {
+         if (out_words_to_go > 4)
+           emit_insn (arm_gen_store_multiple (arm_regs_in_sequence, 4, dst,
+                                              TRUE, dstbase, &dstoffset));
+         else if (out_words_to_go != 1)
+           emit_insn (arm_gen_store_multiple (arm_regs_in_sequence,
+                                              out_words_to_go, dst,
+                                              (last_bytes == 0
+                                               ? FALSE : TRUE),
+                                              dstbase, &dstoffset));
+         else
+           {
+             mem = adjust_automodify_address (dstbase, SImode, dst, dstoffset);
+             emit_move_insn (mem, gen_rtx_REG (SImode, 0));
+             if (last_bytes != 0)
+               {
+                 emit_insn (gen_addsi3 (dst, dst, GEN_INT (4)));
+                 dstoffset += 4;
+               }
+           }
+       }
+
+      in_words_to_go -= in_words_to_go < 4 ? in_words_to_go : 4;
+      out_words_to_go -= out_words_to_go < 4 ? out_words_to_go : 4;
+    }
+
+  /* OUT_WORDS_TO_GO will be zero here if there are byte stores to do.  */
+  if (out_words_to_go)
+    {
+      rtx sreg;
+
+      mem = adjust_automodify_address (srcbase, SImode, src, srcoffset);
+      sreg = copy_to_reg (mem);
+
+      mem = adjust_automodify_address (dstbase, SImode, dst, dstoffset);
+      emit_move_insn (mem, sreg);
+      in_words_to_go--;
+
+      gcc_assert (!in_words_to_go);    /* Sanity check */
+    }
+
+  if (in_words_to_go)
+    {
+      gcc_assert (in_words_to_go > 0);
+
+      mem = adjust_automodify_address (srcbase, SImode, src, srcoffset);
+      part_bytes_reg = copy_to_mode_reg (SImode, mem);
+    }
+
+  gcc_assert (!last_bytes || part_bytes_reg);
+
+  if (BYTES_BIG_ENDIAN && last_bytes)
+    {
+      rtx tmp = gen_reg_rtx (SImode);
+
+      /* The bytes we want are in the top end of the word.  */
+      emit_insn (gen_lshrsi3 (tmp, part_bytes_reg,
+                             GEN_INT (8 * (4 - last_bytes))));
+      part_bytes_reg = tmp;
+
+      while (last_bytes)
         {
           mem = adjust_automodify_address (dstbase, QImode,
                                            plus_constant (dst, last_bytes - 1),
@@ -10602,7 +11581,7 @@ arm_select_cc_mode (enum rtx_code op, rtx x, rtx y)
  
    /* A compare with a shifted operand.  Because of canonicalization, the
       comparison will have to be swapped when we emit the assembler.  */
-  if (GET_MODE (y) == SImode 
+  if (GET_MODE (y) == SImode
        && (REG_P (y) || (GET_CODE (y) == SUBREG))
        && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
           || GET_CODE (x) == LSHIFTRT || GET_CODE (x) == ROTATE
@@ -10611,7 +11590,7 @@ arm_select_cc_mode (enum rtx_code op, rtx x, rtx y)
  
    /* This operation is performed swapped, but since we only rely on the Z
       flag we don't need an additional mode.  */
-  if (GET_MODE (y) == SImode 
+  if (GET_MODE (y) == SImode
        && (REG_P (y) || (GET_CODE (y) == SUBREG))
        && GET_CODE (x) == NEG
        && (op ==        EQ || op == NE))
@@ -10712,7 +11691,7 @@ arm_select_cc_mode (enum rtx_code op, rtx x, rtx y)
             return CC_Zmode;
  
           /* We can do an equality test in three Thumb instructions.  */
-         if (!TARGET_ARM)
+         if (!TARGET_32BIT)
             return CC_Zmode;
  
           /* FALLTHROUGH */
@@ -10724,7 +11703,7 @@ arm_select_cc_mode (enum rtx_code op, rtx x, rtx y)
           /* DImode unsigned comparisons can be implemented by cmp +
              cmpeq without a scratch register.  Not worth doing in
              Thumb-2.  */
-         if (TARGET_ARM)
+         if (TARGET_32BIT)
             return CC_CZmode;
  
           /* FALLTHROUGH */
@@ -10751,7 +11730,7 @@ arm_select_cc_mode (enum rtx_code op, rtx x, rtx y)
     return the rtx for register 0 in the proper mode.  FP means this is a
     floating point compare: I don't think that it is needed on the arm.  */
  rtx
-arm_gen_compare_reg (enum rtx_code code, rtx x, rtx y)
+arm_gen_compare_reg (enum rtx_code code, rtx x, rtx y, rtx scratch)
  {
    enum machine_mode mode;
    rtx cc_reg;
@@ -10776,11 +11755,18 @@ arm_gen_compare_reg (enum rtx_code code, rtx x, rtx y)
          CC_CZmode is cheaper.  */
        if (mode == CC_Zmode && y != const0_rtx)
         {
+         gcc_assert (!reload_completed);
           x = expand_binop (DImode, xor_optab, x, y, NULL_RTX, 0, OPTAB_WIDEN);
           y = const0_rtx;
         }
+
        /* A scratch register is required.  */
-      clobber = gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (SImode));
+      if (reload_completed)
+       gcc_assert (scratch != NULL && GET_MODE (scratch) == SImode);
+      else
+       scratch = gen_rtx_SCRATCH (SImode);
+
+      clobber = gen_rtx_CLOBBER (VOIDmode, scratch);
        set = gen_rtx_SET (VOIDmode, cc_reg, gen_rtx_COMPARE (mode, x, y));
        emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clobber)));
      }
@@ -11095,7 +12081,7 @@ arm_must_pass_in_stack (enum machine_mode mode, const_tree type)
     aggregate types are placed in the lowest memory address.  */
  
  bool
-arm_pad_arg_upward (enum machine_mode mode, const_tree type)
+arm_pad_arg_upward (enum machine_mode mode ATTRIBUTE_UNUSED, const_tree type)
  {
    if (!TARGET_AAPCS_BASED)
      return DEFAULT_FUNCTION_ARG_PADDING(mode, type) == upward;
@@ -11108,21 +12094,33 @@ arm_pad_arg_upward (enum machine_mode mode, const_tree type)
  
  
  /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
-   For non-AAPCS, return !BYTES_BIG_ENDIAN if the least significant
-   byte of the register has useful data, and return the opposite if the
-   most significant byte does.
-   For AAPCS, small aggregates and small complex types are always padded
-   upwards.  */
+   Return !BYTES_BIG_ENDIAN if the least significant byte of the
+   register has useful data, and return the opposite if the most
+   significant byte does.  */
  
  bool
-arm_pad_reg_upward (enum machine_mode mode ATTRIBUTE_UNUSED,
+arm_pad_reg_upward (enum machine_mode mode,
                      tree type, int first ATTRIBUTE_UNUSED)
  {
-  if (TARGET_AAPCS_BASED
-      && BYTES_BIG_ENDIAN
-      && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE)
-      && int_size_in_bytes (type) <= 4)
-    return true;
+  if (TARGET_AAPCS_BASED && BYTES_BIG_ENDIAN)
+    {
+      /* For AAPCS, small aggregates, small fixed-point types,
+        and small complex types are always padded upwards.  */
+      if (type)
+       {
+         if ((AGGREGATE_TYPE_P (type)
+              || TREE_CODE (type) == COMPLEX_TYPE
+              || FIXED_POINT_TYPE_P (type))
+             && int_size_in_bytes (type) <= 4)
+           return true;
+       }
+      else
+       {
+         if ((COMPLEX_MODE_P (mode) || ALL_FIXED_POINT_MODE_P (mode))
+             && GET_MODE_SIZE (mode) <= 4)
+           return true;
+       }
+    }
  
    /* Otherwise, use default padding.  */
    return !BYTES_BIG_ENDIAN;
@@ -11319,8 +12317,7 @@ is_jump_table (rtx insn)
  {
    rtx table;
  
-  if (GET_CODE (insn) == JUMP_INSN
-      && JUMP_LABEL (insn) != NULL
+  if (jump_to_label_p (insn)
        && ((table = next_real_insn (JUMP_LABEL (insn)))
           == next_real_insn (insn))
        && table != NULL
@@ -11373,6 +12370,19 @@ get_jump_table_size (rtx insn)
    return 0;
  }
  
+/* Return the maximum amount of padding that will be inserted before
+   label LABEL.
+
+   The alignment is computed as 1 << label_to_alignment (LABEL).  The
+   result is that alignment minus MIN_INSN_SIZE (2 when TARGET_THUMB,
+   otherwise 4), or 0 when the alignment does not exceed MIN_INSN_SIZE.  */
+
+static HOST_WIDE_INT
+get_label_padding (rtx label)
+{
+  HOST_WIDE_INT align, min_insn_size;
+
+  align = 1 << label_to_alignment (label);
+  min_insn_size = TARGET_THUMB ? 2 : 4;
+  return align > min_insn_size ? align - min_insn_size : 0;
+}
+
  /* Move a minipool fix MP from its current location to before MAX_MP.
     If MAX_MP is NULL, then MP doesn't need moving, but the addressing
     constraints may need updating.  */
@@ -11919,8 +12929,12 @@ create_fix_barrier (Mfix *fix, HOST_WIDE_INT max_address)
          within range.  */
        gcc_assert (GET_CODE (from) != BARRIER);
  
-      /* Count the length of this insn.  */
-      count += get_attr_length (from);
+      /* Count the length of this insn.  This must stay in sync with the
+        code that pushes minipool fixes.  */
+      if (LABEL_P (from))
+       count += get_label_padding (from);
+      else
+       count += get_attr_length (from);
  
        /* If there is a jump table, add its length.  */
        tmp = is_jump_table (from);
@@ -11933,7 +12947,7 @@ create_fix_barrier (Mfix *fix, HOST_WIDE_INT max_address)
              still put the pool after the table.  */
           new_cost = arm_barrier_cost (from);
  
-         if (count < max_count 
+         if (count < max_count
               && (!selected || new_cost <= selected_cost))
             {
               selected = tmp;
@@ -12315,7 +13329,7 @@ arm_reorg (void)
  
    if (TARGET_THUMB2)
      thumb2_reorg ();
-  
+
    minipool_fix_head = minipool_fix_tail = NULL;
  
    /* The first insn must always be a note, or the code below won't
@@ -12350,6 +13364,11 @@ arm_reorg (void)
               insn = table;
             }
         }
+      else if (LABEL_P (insn))
+       /* Add the worst-case padding due to alignment.  We don't add
+          the _current_ padding because the minipool insertions
+          themselves might change it.  */
+       address += get_label_padding (insn);
      }
  
    fix = minipool_fix_head;
@@ -12597,7 +13616,7 @@ vfp_output_fstmd (rtx * operands)
    int base;
    int i;
  
-  strcpy (pattern, "fstmfdd\t%m0!, {%P1");
+  strcpy (pattern, "fstmfdd%?\t%m0!, {%P1");
    p = strlen (pattern);
  
    gcc_assert (GET_CODE (operands[1]) == REG);
@@ -12916,11 +13935,23 @@ output_mov_double_arm_from_fpa (rtx *operands)
  /* Output a move between double words.  It must be REG<-MEM
     or MEM<-REG.  */
  const char *
-output_move_double (rtx *operands)
+output_move_double (rtx *operands, bool emit, int *count)
  {
    enum rtx_code code0 = GET_CODE (operands[0]);
    enum rtx_code code1 = GET_CODE (operands[1]);
    rtx otherops[3];
+  if (count)
+    *count = 1;
+
+  /* The only case when this might happen is when
+     you are looking at the length of a DImode instruction
+     that has an invalid constant in it.  */
+  if (code0 == REG && code1 != MEM)
+    {
+      gcc_assert (!emit);
+      *count = 2;
+      return "";
+    }
  
    if (code0 == REG)
      {
@@ -12933,35 +13964,47 @@ output_move_double (rtx *operands)
        switch (GET_CODE (XEXP (operands[1], 0)))
         {
         case REG:
-         if (TARGET_LDRD
-             && !(fix_cm3_ldrd && reg0 == REGNO(XEXP (operands[1], 0))))
-           output_asm_insn ("ldr%(d%)\t%0, [%m1]", operands);
-         else
-           output_asm_insn ("ldm%(ia%)\t%m1, %M0", operands);
+
+         if (emit)
+           {
+             if (TARGET_LDRD
+                 && !(fix_cm3_ldrd && reg0 == REGNO(XEXP (operands[1], 0))))
+               output_asm_insn ("ldr%(d%)\t%0, [%m1]", operands);
+             else
+               output_asm_insn ("ldm%(ia%)\t%m1, %M0", operands);
+           }
           break;
  
         case PRE_INC:
           gcc_assert (TARGET_LDRD);
-         output_asm_insn ("ldr%(d%)\t%0, [%m1, #8]!", operands);
+         if (emit)
+           output_asm_insn ("ldr%(d%)\t%0, [%m1, #8]!", operands);
           break;
  
         case PRE_DEC:
-         if (TARGET_LDRD)
-           output_asm_insn ("ldr%(d%)\t%0, [%m1, #-8]!", operands);
-         else
-           output_asm_insn ("ldm%(db%)\t%m1!, %M0", operands);
+         if (emit)
+           {
+             if (TARGET_LDRD)
+               output_asm_insn ("ldr%(d%)\t%0, [%m1, #-8]!", operands);
+             else
+               output_asm_insn ("ldm%(db%)\t%m1!, %M0", operands);
+           }
           break;
  
         case POST_INC:
-         if (TARGET_LDRD)
-           output_asm_insn ("ldr%(d%)\t%0, [%m1], #8", operands);
-         else
-           output_asm_insn ("ldm%(ia%)\t%m1!, %M0", operands);
+         if (emit)
+           {
+             if (TARGET_LDRD)
+               output_asm_insn ("ldr%(d%)\t%0, [%m1], #8", operands);
+             else
+               output_asm_insn ("ldm%(ia%)\t%m1!, %M0", operands);
+           }
           break;
  
         case POST_DEC:
           gcc_assert (TARGET_LDRD);
-         output_asm_insn ("ldr%(d%)\t%0, [%m1], #-8", operands);
+         if (emit)
+           output_asm_insn ("ldr%(d%)\t%0, [%m1], #-8", operands);
           break;
  
         case PRE_MODIFY:
@@ -12979,8 +14022,13 @@ output_move_double (rtx *operands)
               if (reg_overlap_mentioned_p (otherops[0], otherops[2]))
                 {
                   /* Registers overlap so split out the increment.  */
-                 output_asm_insn ("add%?\t%1, %1, %2", otherops);
-                 output_asm_insn ("ldr%(d%)\t%0, [%1] @split", otherops);
+                 if (emit)
+                   {
+                     output_asm_insn ("add%?\t%1, %1, %2", otherops);
+                     output_asm_insn ("ldr%(d%)\t%0, [%1] @split", otherops);
+                   }
+                 if (count)
+                   *count = 2;
                 }
               else
                 {
@@ -12991,11 +14039,20 @@ output_move_double (rtx *operands)
                       || GET_CODE (otherops[2]) != CONST_INT
                       || (INTVAL (otherops[2]) > -256
                           && INTVAL (otherops[2]) < 256))
-                   output_asm_insn ("ldr%(d%)\t%0, [%1, %2]!", otherops);
+                   {
+                     if (emit)
+                       output_asm_insn ("ldr%(d%)\t%0, [%1, %2]!", otherops);
+                   }
                   else
                     {
-                     output_asm_insn ("ldr%?\t%0, [%1, %2]!", otherops);
-                     output_asm_insn ("ldr%?\t%H0, [%1, #4]", otherops);
+                     if (emit)
+                       {
+                         output_asm_insn ("ldr%?\t%0, [%1, %2]!", otherops);
+                         output_asm_insn ("ldr%?\t%H0, [%1, #4]", otherops);
+                       }
+                     if (count)
+                       *count = 2;
+
                     }
                 }
             }
@@ -13008,11 +14065,19 @@ output_move_double (rtx *operands)
                   || GET_CODE (otherops[2]) != CONST_INT
                   || (INTVAL (otherops[2]) > -256
                       && INTVAL (otherops[2]) < 256))
-               output_asm_insn ("ldr%(d%)\t%0, [%1], %2", otherops);
+               {
+                 if (emit)
+                   output_asm_insn ("ldr%(d%)\t%0, [%1], %2", otherops);
+               }
               else
                 {
-                 output_asm_insn ("ldr%?\t%H0, [%1, #4]", otherops);
-                 output_asm_insn ("ldr%?\t%0, [%1], %2", otherops);
+                 if (emit)
+                   {
+                     output_asm_insn ("ldr%?\t%H0, [%1, #4]", otherops);
+                     output_asm_insn ("ldr%?\t%0, [%1], %2", otherops);
+                   }
+                 if (count)
+                   *count = 2;
                 }
             }
           break;
@@ -13025,12 +14090,19 @@ output_move_double (rtx *operands)
           /* Use the second register of the pair to avoid problematic
              overlap.  */
           otherops[1] = operands[1];
-         output_asm_insn ("adr%?\t%0, %1", otherops);
+         if (emit)
+           output_asm_insn ("adr%?\t%0, %1", otherops);
           operands[1] = otherops[0];
-         if (TARGET_LDRD)
-           output_asm_insn ("ldr%(d%)\t%0, [%1]", operands);
-         else
-           output_asm_insn ("ldm%(ia%)\t%1, %M0", operands);
+         if (emit)
+           {
+             if (TARGET_LDRD)
+               output_asm_insn ("ldr%(d%)\t%0, [%1]", operands);
+             else
+               output_asm_insn ("ldm%(ia%)\t%1, %M0", operands);
+           }
+
+         if (count)
+           *count = 2;
           break;
  
           /* ??? This needs checking for thumb2.  */
@@ -13049,17 +14121,20 @@ output_move_double (rtx *operands)
                       switch ((int) INTVAL (otherops[2]))
                         {
                         case -8:
-                         output_asm_insn ("ldm%(db%)\t%1, %M0", otherops);
+                         if (emit)
+                           output_asm_insn ("ldm%(db%)\t%1, %M0", otherops);
                           return "";
                         case -4:
                           if (TARGET_THUMB2)
                             break;
-                         output_asm_insn ("ldm%(da%)\t%1, %M0", otherops);
+                         if (emit)
+                           output_asm_insn ("ldm%(da%)\t%1, %M0", otherops);
                           return "";
                         case 4:
                           if (TARGET_THUMB2)
                             break;
-                         output_asm_insn ("ldm%(ib%)\t%1, %M0", otherops);
+                         if (emit)
+                           output_asm_insn ("ldm%(ib%)\t%1, %M0", otherops);
                           return "";
                         }
                     }
@@ -13087,29 +14162,44 @@ output_move_double (rtx *operands)
                       if (reg_overlap_mentioned_p (operands[0], otherops[2])
                           || (fix_cm3_ldrd && reg0 == REGNO (otherops[1])))
                         {
-                         output_asm_insn ("add%?\t%0, %1, %2", otherops);
-                         output_asm_insn ("ldr%(d%)\t%0, [%1]", operands);
+                         if (emit)
+                           {
+                             output_asm_insn ("add%?\t%0, %1, %2", otherops);
+                             output_asm_insn ("ldr%(d%)\t%0, [%1]", operands);
+                           }
+                         if (count)
+                           *count = 2;
                         }
                       else
                         {
                           otherops[0] = operands[0];
-                         output_asm_insn ("ldr%(d%)\t%0, [%1, %2]", otherops);
+                         if (emit)
+                           output_asm_insn ("ldr%(d%)\t%0, [%1, %2]", otherops);
                         }
                       return "";
                     }
  
                   if (GET_CODE (otherops[2]) == CONST_INT)
                     {
-                     if (!(const_ok_for_arm (INTVAL (otherops[2]))))
-                       output_asm_insn ("sub%?\t%0, %1, #%n2", otherops);
-                     else
-                       output_asm_insn ("add%?\t%0, %1, %2", otherops);
+                     if (emit)
+                       {
+                         if (!(const_ok_for_arm (INTVAL (otherops[2]))))
+                           output_asm_insn ("sub%?\t%0, %1, #%n2", otherops);
+                         else
+                           output_asm_insn ("add%?\t%0, %1, %2", otherops);
+                       }
                     }
                   else
-                   output_asm_insn ("add%?\t%0, %1, %2", otherops);
+                   {
+                     if (emit)
+                       output_asm_insn ("add%?\t%0, %1, %2", otherops);
+                   }
                 }
               else
-               output_asm_insn ("sub%?\t%0, %1, %2", otherops);
+               {
+                 if (emit)
+                   output_asm_insn ("sub%?\t%0, %1, %2", otherops);
+               }
  
               if (TARGET_LDRD)
                 return "ldr%(d%)\t%0, [%1]";
@@ -13122,13 +14212,24 @@ output_move_double (rtx *operands)
               /* Take care of overlapping base/data reg.  */
               if (reg_mentioned_p (operands[0], operands[1]))
                 {
-                 output_asm_insn ("ldr%?\t%0, %1", otherops);
-                 output_asm_insn ("ldr%?\t%0, %1", operands);
+                 if (emit)
+                   {
+                     output_asm_insn ("ldr%?\t%0, %1", otherops);
+                     output_asm_insn ("ldr%?\t%0, %1", operands);
+                   }
+                 if (count)
+                   *count = 2;
+
                 }
               else
                 {
-                 output_asm_insn ("ldr%?\t%0, %1", operands);
-                 output_asm_insn ("ldr%?\t%0, %1", otherops);
+                 if (emit)
+                   {
+                     output_asm_insn ("ldr%?\t%0, %1", operands);
+                     output_asm_insn ("ldr%?\t%0, %1", otherops);
+                   }
+                 if (count)
+                   *count = 2;
                 }
             }
         }
@@ -13142,34 +14243,45 @@ output_move_double (rtx *operands)
        switch (GET_CODE (XEXP (operands[0], 0)))
          {
         case REG:
-         if (TARGET_LDRD)
-           output_asm_insn ("str%(d%)\t%1, [%m0]", operands);
-         else
-           output_asm_insn ("stm%(ia%)\t%m0, %M1", operands);
+         if (emit)
+           {
+             if (TARGET_LDRD)
+               output_asm_insn ("str%(d%)\t%1, [%m0]", operands);
+             else
+               output_asm_insn ("stm%(ia%)\t%m0, %M1", operands);
+           }
           break;
  
          case PRE_INC:
           gcc_assert (TARGET_LDRD);
-         output_asm_insn ("str%(d%)\t%1, [%m0, #8]!", operands);
+         if (emit)
+           output_asm_insn ("str%(d%)\t%1, [%m0, #8]!", operands);
           break;
  
          case PRE_DEC:
-         if (TARGET_LDRD)
-           output_asm_insn ("str%(d%)\t%1, [%m0, #-8]!", operands);
-         else
-           output_asm_insn ("stm%(db%)\t%m0!, %M1", operands);
+         if (emit)
+           {
+             if (TARGET_LDRD)
+               output_asm_insn ("str%(d%)\t%1, [%m0, #-8]!", operands);
+             else
+               output_asm_insn ("stm%(db%)\t%m0!, %M1", operands);
+           }
           break;
  
          case POST_INC:
-         if (TARGET_LDRD)
-           output_asm_insn ("str%(d%)\t%1, [%m0], #8", operands);
-         else
-           output_asm_insn ("stm%(ia%)\t%m0!, %M1", operands);
+         if (emit)
+           {
+             if (TARGET_LDRD)
+               output_asm_insn ("str%(d%)\t%1, [%m0], #8", operands);
+             else
+               output_asm_insn ("stm%(ia%)\t%m0!, %M1", operands);
+           }
           break;
  
          case POST_DEC:
           gcc_assert (TARGET_LDRD);
-         output_asm_insn ("str%(d%)\t%1, [%m0], #-8", operands);
+         if (emit)
+           output_asm_insn ("str%(d%)\t%1, [%m0], #-8", operands);
           break;
  
         case PRE_MODIFY:
@@ -13187,19 +14299,35 @@ output_move_double (rtx *operands)
             {
               if (GET_CODE (XEXP (operands[0], 0)) == PRE_MODIFY)
                 {
-                 output_asm_insn ("str%?\t%0, [%1, %2]!", otherops);
-                 output_asm_insn ("str%?\t%H0, [%1, #4]", otherops);
+                 if (emit)
+                   {
+                     output_asm_insn ("str%?\t%0, [%1, %2]!", otherops);
+                     output_asm_insn ("str%?\t%H0, [%1, #4]", otherops);
+                   }
+                 if (count)
+                   *count = 2;
                 }
               else
                 {
-                 output_asm_insn ("str%?\t%H0, [%1, #4]", otherops);
-                 output_asm_insn ("str%?\t%0, [%1], %2", otherops);
+                 if (emit)
+                   {
+                     output_asm_insn ("str%?\t%H0, [%1, #4]", otherops);
+                     output_asm_insn ("str%?\t%0, [%1], %2", otherops);
+                   }
+                 if (count)
+                   *count = 2;
                 }
             }
           else if (GET_CODE (XEXP (operands[0], 0)) == PRE_MODIFY)
-           output_asm_insn ("str%(d%)\t%0, [%1, %2]!", otherops);
+           {
+             if (emit)
+               output_asm_insn ("str%(d%)\t%0, [%1, %2]!", otherops);
+           }
           else
-           output_asm_insn ("str%(d%)\t%0, [%1], %2", otherops);
+           {
+             if (emit)
+               output_asm_insn ("str%(d%)\t%0, [%1], %2", otherops);
+           }
           break;
  
         case PLUS:
@@ -13209,19 +14337,22 @@ output_move_double (rtx *operands)
               switch ((int) INTVAL (XEXP (XEXP (operands[0], 0), 1)))
                 {
                 case -8:
-                 output_asm_insn ("stm%(db%)\t%m0, %M1", operands);
+                 if (emit)
+                   output_asm_insn ("stm%(db%)\t%m0, %M1", operands);
                   return "";
  
                 case -4:
                   if (TARGET_THUMB2)
                     break;
-                 output_asm_insn ("stm%(da%)\t%m0, %M1", operands);
+                 if (emit)
+                   output_asm_insn ("stm%(da%)\t%m0, %M1", operands);
                   return "";
  
                 case 4:
                   if (TARGET_THUMB2)
                     break;
-                 output_asm_insn ("stm%(ib%)\t%m0, %M1", operands);
+                 if (emit)
+                   output_asm_insn ("stm%(ib%)\t%m0, %M1", operands);
                   return "";
                 }
             }
@@ -13234,7 +14365,8 @@ output_move_double (rtx *operands)
             {
               otherops[0] = operands[1];
               otherops[1] = XEXP (XEXP (operands[0], 0), 0);
-             output_asm_insn ("str%(d%)\t%0, [%1, %2]", otherops);
+             if (emit)
+               output_asm_insn ("str%(d%)\t%0, [%1, %2]", otherops);
               return "";
             }
           /* Fall through */
@@ -13242,8 +14374,13 @@ output_move_double (rtx *operands)
          default:
           otherops[0] = adjust_address (operands[0], SImode, 4);
           otherops[1] = operands[1];
-         output_asm_insn ("str%?\t%1, %0", operands);
-         output_asm_insn ("str%?\t%H1, %0", otherops);
+         if (emit)
+           {
+             output_asm_insn ("str%?\t%1, %0", operands);
+             output_asm_insn ("str%?\t%H1, %0", otherops);
+           }
+         if (count)
+           *count = 2;
         }
      }
  
@@ -13453,7 +14590,7 @@ output_move_neon (rtx *operands)
        ops[0] = XEXP (addr, 0);
        ops[1] = reg;
        break;
-    
+
      case POST_MODIFY:
        /* FIXME: Not currently enabled in neon_vector_mem_operand.  */
        gcc_unreachable ();
@@ -13945,7 +15082,7 @@ arm_compute_save_reg0_reg12_mask (void)
  }
  
  
-/* Compute the number of bytes used to store the static chain register on the 
+/* Compute the number of bytes used to store the static chain register on the
     stack, above the stack frame. We need to know this accurately to get the
     alignment of the rest of the stack frame correct. */
  
@@ -14284,7 +15421,7 @@ output_return_instruction (rtx operand, int really_return, int reverse)
                      then try to pop r3 instead.  */
                   if (stack_adjust)
                     live_regs_mask |= 1 << 3;
-                 
+
                   if (TARGET_UNIFIED_ASM)
                     sprintf (instr, "ldmfd%s\t%%|sp, {", conditional);
                   else
@@ -14425,11 +15562,9 @@ arm_output_function_prologue (FILE *f, HOST_WIDE_INT frame_size)
  {
    unsigned long func_type;
  
+  /* ??? Do we want to print some of the below anyway?  */
    if (TARGET_THUMB1)
-    {
-      thumb1_output_function_prologue (f, frame_size);
-      return;
-    }
+    return;
  
    /* Sanity check.  */
    gcc_assert (!arm_ccfsm_state && !arm_target_insn);
@@ -14500,7 +15635,7 @@ arm_output_epilogue (rtx sibling)
  
    /* If we have already generated the return instruction
       then it is futile to generate anything else.  */
-  if (use_return_insn (FALSE, sibling) && 
+  if (use_return_insn (FALSE, sibling) &&
        (cfun->machine->return_used_this_function != 0))
      return "";
  
@@ -14708,7 +15843,7 @@ arm_output_epilogue (rtx sibling)
         {
           operands[0] = stack_pointer_rtx;
           operands[1] = hard_frame_pointer_rtx;
-         
+
           operands[2] = GEN_INT (offsets->frame - offsets->saved_regs);
           output_add_immediate (operands);
         }
@@ -14735,6 +15870,7 @@ arm_output_epilogue (rtx sibling)
                   && !crtl->calls_eh_return
                   && bit_count(saved_regs_mask) * 4 == count
                   && !IS_INTERRUPT (func_type)
+                 && !IS_STACKALIGN (func_type)
                   && !crtl->tail_call_emit)
                 {
                   unsigned long mask;
@@ -14755,7 +15891,7 @@ arm_output_epilogue (rtx sibling)
                   }
                 }
             }
-         
+
           if (amount)
             {
               operands[1] = operands[0];
@@ -15387,7 +16523,7 @@ arm_get_frame_offsets (void)
         {
           int reg = -1;
  
-         /* If it is safe to use r3, then do so.  This sometimes 
+         /* If it is safe to use r3, then do so.  This sometimes
              generates better code on Thumb-2 by avoiding the need to
              use 32-bit push/pop instructions.  */
           if (! any_sibcall_uses_r3 ()
@@ -15687,9 +16823,8 @@ arm_expand_prologue (void)
  
    if (IS_STACKALIGN (func_type))
      {
-      rtx dwarf;
-      rtx r0;
-      rtx r1;
+      rtx r0, r1;
+
        /* Handle a word-aligned stack pointer.  We generate the following:
  
           mov r0, sp
@@ -15705,15 +16840,18 @@ arm_expand_prologue (void)
  
        r0 = gen_rtx_REG (SImode, 0);
        r1 = gen_rtx_REG (SImode, 1);
-      /* Use a real rtvec rather than NULL_RTVEC so the rest of the
-        compiler won't choke.  */
-      dwarf = gen_rtx_UNSPEC (SImode, rtvec_alloc (0), UNSPEC_STACK_ALIGN);
-      dwarf = gen_rtx_SET (VOIDmode, r0, dwarf);
-      insn = gen_movsi (r0, stack_pointer_rtx);
+
+      insn = emit_insn (gen_movsi (r0, stack_pointer_rtx));
        RTX_FRAME_RELATED_P (insn) = 1;
-      add_reg_note (insn, REG_FRAME_RELATED_EXPR, dwarf);
-      emit_insn (insn);
+      add_reg_note (insn, REG_CFA_REGISTER, NULL);
+
        emit_insn (gen_andsi3 (r1, r0, GEN_INT (~(HOST_WIDE_INT)7)));
+
+      /* ??? The CFA changes here, which may cause GDB to conclude that it
+        has entered a different function.  That said, the unwind info is
+        correct, individually, before and after this instruction because
+        we've described the save of SP, which will override the default
+        handling of SP as restoring from the CFA.  */
        emit_insn (gen_movsi (stack_pointer_rtx, r1));
      }
  
@@ -15727,7 +16865,7 @@ arm_expand_prologue (void)
           /* Interrupt functions must not corrupt any registers.
              Creating a frame pointer however, corrupts the IP
              register, so we must push it first.  */
-         insn = emit_multi_reg_push (1 << IP_REGNUM);
+         emit_multi_reg_push (1 << IP_REGNUM);
  
           /* Do not set RTX_FRAME_RELATED_P on this insn.
              The dwarf stack unwinding code only wants to see one
@@ -15831,7 +16969,7 @@ arm_expand_prologue (void)
        && TARGET_ARM)
      {
        rtx lr = gen_rtx_REG (SImode, LR_REGNUM);
-      
+
        emit_set_insn (lr, plus_constant (lr, -4));
      }
  
@@ -16042,7 +17180,7 @@ arm_print_operand (FILE *stream, rtx x, int code)
        if (TARGET_UNIFIED_ASM)
         arm_print_condition (stream);
        break;
-  
+
      case '.':
        /* The current condition code for a condition code setting instruction.
          Preceded by 's' in unified syntax, otherwise followed by 's'.  */
@@ -16096,8 +17234,17 @@ arm_print_operand (FILE *stream, rtx x, int code)
           output_addr_const (stream, x);
           break;
  
+       case CONST:
+         if (GET_CODE (XEXP (x, 0)) == PLUS
+             && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
+           {
+             output_addr_const (stream, x);
+             break;
+           }
+         /* Fall through.  */
+
         default:
-         gcc_unreachable ();
+         output_operand_lossage ("Unsupported operand for code '%c'", code);
         }
        return;
  
@@ -16559,18 +17706,18 @@ arm_print_operand (FILE *stream, rtx x, int code)
            instruction (for some alignments) as an aid to the memory subsystem
            of the target.  */
         align = MEM_ALIGN (x) >> 3;
-       memsize = INTVAL (MEM_SIZE (x));
-       
+       memsize = MEM_SIZE (x);
+
         /* Only certain alignment specifiers are supported by the hardware.  */
         if (memsize == 16 && (align % 32) == 0)
           align_bits = 256;
-       else if ((memsize == 8 || memsize == 16) && (align % 16) == 0)
+       else if (memsize == 16 && (align % 16) == 0)
           align_bits = 128;
-       else if ((align % 8) == 0)
+       else if (memsize >= 8 && (align % 8) == 0)
           align_bits = 64;
         else
           align_bits = 0;
-       
+
         if (align_bits != 0)
           asm_fprintf (stream, ":%d", align_bits);
  
@@ -16616,6 +17763,11 @@ arm_print_operand (FILE *stream, rtx x, int code)
        }
        return;
  
+    case 'v':
+       gcc_assert (GET_CODE (x) == CONST_DOUBLE);
+       fprintf (stream, "#%d", vfp3_const_double_for_fract_bits (x));
+       return;
+
      /* Register specifier for vld1.16/vst1.16.  Translate the S register
         number into a D register number and element index.  */
      case 'z':
@@ -16640,7 +17792,7 @@ arm_print_operand (FILE *stream, rtx x, int code)
         fprintf (stream, "d%d[%d]", regno/2, ((regno % 2) ? 2 : 0));
        }
        return;
-      
+
      default:
        if (x == 0)
         {
@@ -16679,7 +17831,7 @@ arm_print_operand (FILE *stream, rtx x, int code)
               fputs (":lower16:", stream);
               x = XEXP (x, 0);
             }
-           
+
           output_addr_const (stream, x);
           break;
         }
@@ -16893,8 +18045,8 @@ arm_elf_asm_cdtor (rtx symbol, int priority, bool is_ctor)
  
    if (!TARGET_AAPCS_BASED)
      {
-      (is_ctor ? 
-       default_named_section_asm_out_constructor 
+      (is_ctor ?
+       default_named_section_asm_out_constructor
         : default_named_section_asm_out_destructor) (symbol, priority);
        return;
      }
@@ -16903,7 +18055,7 @@ arm_elf_asm_cdtor (rtx symbol, int priority, bool is_ctor)
    if (priority != DEFAULT_INIT_PRIORITY)
      {
        char buf[18];
-      sprintf (buf, "%s.%.5u", 
+      sprintf (buf, "%s.%.5u",
                is_ctor ? ".init_array" : ".fini_array",
                priority);
        s = get_section (buf, SECTION_WRITE, NULL_TREE);
@@ -16975,10 +18127,11 @@ arm_elf_asm_destructor (rtx symbol, int priority)
     decremented/zeroed by arm_asm_output_opcode as the insns are output.  */
  
  /* Returns the index of the ARM condition code string in
-   `arm_condition_codes'.  COMPARISON should be an rtx like
-   `(eq (...) (...))'.  */
-static enum arm_cond_code
-get_arm_condition_code (rtx comparison)
+   `arm_condition_codes', or ARM_NV if the comparison is invalid.
+   COMPARISON should be an rtx like `(eq (...) (...))'.  */
+
+enum arm_cond_code
+maybe_get_arm_condition_code (rtx comparison)
  {
    enum machine_mode mode = GET_MODE (XEXP (comparison, 0));
    enum arm_cond_code code;
@@ -17002,11 +18155,11 @@ get_arm_condition_code (rtx comparison)
      case CC_DLTUmode: code = ARM_CC;
  
      dominance:
-      gcc_assert (comp_code == EQ || comp_code == NE);
-
        if (comp_code == EQ)
         return ARM_INVERSE_CONDITION_CODE (code);
-      return code;
+      if (comp_code == NE)
+       return code;
+      return ARM_NV;
  
      case CC_NOOVmode:
        switch (comp_code)
@@ -17015,7 +18168,7 @@ get_arm_condition_code (rtx comparison)
         case EQ: return ARM_EQ;
         case GE: return ARM_PL;
         case LT: return ARM_MI;
-       default: gcc_unreachable ();
+       default: return ARM_NV;
         }
  
      case CC_Zmode:
@@ -17023,7 +18176,7 @@ get_arm_condition_code (rtx comparison)
         {
         case NE: return ARM_NE;
         case EQ: return ARM_EQ;
-       default: gcc_unreachable ();
+       default: return ARM_NV;
         }
  
      case CC_Nmode:
@@ -17031,7 +18184,7 @@ get_arm_condition_code (rtx comparison)
         {
         case NE: return ARM_MI;
         case EQ: return ARM_PL;
-       default: gcc_unreachable ();
+       default: return ARM_NV;
         }
  
      case CCFPEmode:
@@ -17056,7 +18209,7 @@ get_arm_condition_code (rtx comparison)
           /* UNEQ and LTGT do not have a representation.  */
         case UNEQ: /* Fall through.  */
         case LTGT: /* Fall through.  */
-       default: gcc_unreachable ();
+       default: return ARM_NV;
         }
  
      case CC_SWPmode:
@@ -17072,7 +18225,7 @@ get_arm_condition_code (rtx comparison)
         case GTU: return ARM_CC;
         case LEU: return ARM_CS;
         case LTU: return ARM_HI;
-       default: gcc_unreachable ();
+       default: return ARM_NV;
         }
  
      case CC_Cmode:
@@ -17080,7 +18233,7 @@ get_arm_condition_code (rtx comparison)
         {
         case LTU: return ARM_CS;
         case GEU: return ARM_CC;
-       default: gcc_unreachable ();
+       default: return ARM_NV;
         }
  
      case CC_CZmode:
@@ -17092,7 +18245,7 @@ get_arm_condition_code (rtx comparison)
         case GTU: return ARM_HI;
         case LEU: return ARM_LS;
         case LTU: return ARM_CC;
-       default: gcc_unreachable ();
+       default: return ARM_NV;
         }
  
      case CC_NCVmode:
@@ -17102,7 +18255,7 @@ get_arm_condition_code (rtx comparison)
         case LT: return ARM_LT;
         case GEU: return ARM_CS;
         case LTU: return ARM_CC;
-       default: gcc_unreachable ();
+       default: return ARM_NV;
         }
  
      case CCmode:
@@ -17118,13 +18271,22 @@ get_arm_condition_code (rtx comparison)
         case GTU: return ARM_HI;
         case LEU: return ARM_LS;
         case LTU: return ARM_CC;
-       default: gcc_unreachable ();
+       default: return ARM_NV;
         }
  
      default: gcc_unreachable ();
      }
  }
  
+/* Like maybe_get_arm_condition_code, but assert the result is not ARM_NV.  */
+static enum arm_cond_code
+get_arm_condition_code (rtx comparison)
+{
+  enum arm_cond_code code = maybe_get_arm_condition_code (comparison);
+  gcc_assert (code != ARM_NV);
+  return code;
+}
+
  /* Tell arm_asm_output_opcode to output IT blocks for conditionally executed
     instructions.  */
  void
@@ -17216,6 +18378,7 @@ arm_final_prescan_insn (rtx insn)
  
    /* If we start with a return insn, we only succeed if we find another one.  */
    int seeking_return = 0;
+  enum rtx_code return_code = UNKNOWN;
  
    /* START_INSN will hold the insn from where we start looking.  This is the
       first insn after the following code_label if REVERSE is true.  */
@@ -17254,7 +18417,7 @@ arm_final_prescan_insn (rtx insn)
           else
             return;
         }
-      else if (GET_CODE (body) == RETURN)
+      else if (ANY_RETURN_P (body))
          {
           start_insn = next_nonnote_insn (start_insn);
           if (GET_CODE (start_insn) == BARRIER)
@@ -17265,6 +18428,7 @@ arm_final_prescan_insn (rtx insn)
             {
               reverse = TRUE;
               seeking_return = 1;
+             return_code = GET_CODE (body);
             }
           else
             return;
@@ -17305,11 +18469,15 @@ arm_final_prescan_insn (rtx insn)
           label = XEXP (XEXP (SET_SRC (body), 2), 0);
           then_not_else = FALSE;
         }
-      else if (GET_CODE (XEXP (SET_SRC (body), 1)) == RETURN)
-       seeking_return = 1;
-      else if (GET_CODE (XEXP (SET_SRC (body), 2)) == RETURN)
+      else if (ANY_RETURN_P (XEXP (SET_SRC (body), 1)))
+       {
+         seeking_return = 1;
+         return_code = GET_CODE (XEXP (SET_SRC (body), 1));
+       }
+      else if (ANY_RETURN_P (XEXP (SET_SRC (body), 2)))
          {
           seeking_return = 1;
+         return_code = GET_CODE (XEXP (SET_SRC (body), 2));
           then_not_else = FALSE;
          }
        else
@@ -17406,12 +18574,11 @@ arm_final_prescan_insn (rtx insn)
                 }
               /* Fail if a conditional return is undesirable (e.g. on a
                  StrongARM), but still allow this if optimizing for size.  */
-             else if (GET_CODE (scanbody) == RETURN
+             else if (GET_CODE (scanbody) == return_code
                        && !use_return_insn (TRUE, NULL)
                        && !optimize_size)
                 fail = TRUE;
-             else if (GET_CODE (scanbody) == RETURN
-                      && seeking_return)
+             else if (GET_CODE (scanbody) == return_code)
                 {
                   arm_ccfsm_state = 2;
                   succeed = TRUE;
@@ -17580,7 +18747,7 @@ arm_hard_regno_mode_ok (unsigned int regno, enum machine_mode mode)
        if (IS_IWMMXT_REGNUM (regno))
         return VALID_IWMMXT_REG_MODE (mode);
      }
-  
+
    /* We allow almost any value to be stored in the general registers.
       Restrict doubleword quantities to even register pairs so that we can
       use ldrd.  Do not allow very large Neon structure opaque modes in
@@ -17602,6 +18769,29 @@ arm_hard_regno_mode_ok (unsigned int regno, enum machine_mode mode)
           && regno <= LAST_FPA_REGNUM);
  }
  
+/* Implement MODES_TIEABLE_P.  Return true if MODE1 and MODE2 can be tied.  */
+
+bool
+arm_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
+{
+  if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
+    return true;
+
+  /* We specifically want to allow elements of "structure" modes to
+     be tieable to the structure.  This more general condition (any
+     two NEON D-reg, Q-reg or structure modes) allows rarer cases too.  */
+  if (TARGET_NEON
+      && (VALID_NEON_DREG_MODE (mode1)
+         || VALID_NEON_QREG_MODE (mode1)
+         || VALID_NEON_STRUCT_MODE (mode1))
+      && (VALID_NEON_DREG_MODE (mode2)
+         || VALID_NEON_QREG_MODE (mode2)
+         || VALID_NEON_STRUCT_MODE (mode2)))
+    return true;
+
+  return false;
+}
+
  /* For efficiency and historical reasons LO_REGS, HI_REGS and CC_REGS are
     not used in arm mode.  */
  
@@ -18742,7 +19932,7 @@ arm_init_neon_builtins (void)
         }                                                               \
      }                                                                  \
    while (0)
-  
+
  struct builtin_description
  {
    const unsigned int       mask;
@@ -18758,7 +19948,7 @@ static const struct builtin_description bdesc_2arg[] =
  #define IWMMXT_BUILTIN(code, string, builtin) \
    { FL_IWMMXT, CODE_FOR_##code, "__builtin_arm_" string, \
      ARM_BUILTIN_##builtin, UNKNOWN, 0 },
-  
+
    IWMMXT_BUILTIN (addv8qi3, "waddb", WADDB)
    IWMMXT_BUILTIN (addv4hi3, "waddh", WADDH)
    IWMMXT_BUILTIN (addv2si3, "waddw", WADDW)
@@ -18817,10 +20007,10 @@ static const struct builtin_description bdesc_2arg[] =
    IWMMXT_BUILTIN (iwmmxt_wunpckihw, "wunpckihw", WUNPCKIHW)
    IWMMXT_BUILTIN (iwmmxt_wmadds, "wmadds", WMADDS)
    IWMMXT_BUILTIN (iwmmxt_wmaddu, "wmaddu", WMADDU)
-  
+
  #define IWMMXT_BUILTIN2(code, builtin) \
    { FL_IWMMXT, CODE_FOR_##code, NULL, ARM_BUILTIN_##builtin, UNKNOWN, 0 },
-  
+
    IWMMXT_BUILTIN2 (iwmmxt_wpackhss, WPACKHSS)
    IWMMXT_BUILTIN2 (iwmmxt_wpackwss, WPACKWSS)
    IWMMXT_BUILTIN2 (iwmmxt_wpackdss, WPACKDSS)
@@ -18854,7 +20044,7 @@ static const struct builtin_description bdesc_2arg[] =
    IWMMXT_BUILTIN2 (iwmmxt_wmacuz,   WMACUZ)
    IWMMXT_BUILTIN2 (iwmmxt_wmacsz,   WMACSZ)
  };
-  
+
  static const struct builtin_description bdesc_1arg[] =
  {
    IWMMXT_BUILTIN (iwmmxt_tmovmskb, "tmovmskb", TMOVMSKB)
@@ -18876,7 +20066,7 @@ static const struct builtin_description bdesc_1arg[] =
    IWMMXT_BUILTIN (iwmmxt_wunpckelsh, "wunpckelsh", WUNPCKELSH)
    IWMMXT_BUILTIN (iwmmxt_wunpckelsw, "wunpckelsw", WUNPCKELSW)
  };
-  
+
  /* Set up all the iWMMXt builtins.  This is not called if
     TARGET_IWMMXT is zero.  */
  
@@ -19000,7 +20190,7 @@ arm_init_iwmmxt_builtins (void)
      = build_function_type_list (long_long_unsigned_type_node,
                                 V4HI_type_node,V4HI_type_node,
                                 NULL_TREE);
-  
+
    /* Normal vector binops.  */
    tree v8qi_ftype_v8qi_v8qi
      = build_function_type_list (V8QI_type_node,
@@ -19016,7 +20206,7 @@ arm_init_iwmmxt_builtins (void)
                                 long_long_unsigned_type_node,
                                 long_long_unsigned_type_node,
                                 NULL_TREE);
-  
+
    /* Add all builtins that are more or less simple operations on two
       operands.  */
    for (i = 0, d = bdesc_2arg; i < ARRAY_SIZE (bdesc_2arg); i++, d++)
@@ -19258,6 +20448,8 @@ arm_scalar_mode_supported_p (enum machine_mode mode)
  {
    if (mode == HFmode)
      return (arm_fp16_format != ARM_FP16_FORMAT_NONE);
+  else if (ALL_FIXED_POINT_MODE_P (mode))
+    return true;
    else
      return default_scalar_mode_supported_p (mode);
  }
@@ -19693,39 +20885,85 @@ neon_emit_pair_result_insn (enum machine_mode mode,
    emit_move_insn (mem, tmp2);
  }
  
-/* Set up operands for a register copy from src to dest, taking care not to
-   clobber registers in the process.
-   FIXME: This has rather high polynomial complexity (O(n^3)?) but shouldn't
-   be called with a large N, so that should be OK.  */
+/* Set up OPERANDS for a register copy from SRC to DEST, taking care
+   not to early-clobber SRC registers in the process.
  
+   We assume that the operands described by SRC and DEST represent a
+   decomposed copy of OPERANDS[1] into OPERANDS[0].  COUNT is the
+   number of components into which the copy has been decomposed.  */
  void
  neon_disambiguate_copy (rtx *operands, rtx *dest, rtx *src, unsigned int count)
  {
-  unsigned int copied = 0, opctr = 0;
-  unsigned int done = (1 << count) - 1;
-  unsigned int i, j;
+  unsigned int i;
  
-  while (copied != done)
+  if (!reg_overlap_mentioned_p (operands[0], operands[1])
+      || REGNO (operands[0]) < REGNO (operands[1]))
      {
        for (i = 0; i < count; i++)
-        {
-          int good = 1;
+       {
+         operands[2 * i] = dest[i];
+         operands[2 * i + 1] = src[i];
+       }
+    }
+  else
+    {
+      for (i = 0; i < count; i++)
+       {
+         operands[2 * i] = dest[count - i - 1];
+         operands[2 * i + 1] = src[count - i - 1];
+       }
+    }
+}
  
-          for (j = 0; good && j < count; j++)
-            if (i != j && (copied & (1 << j)) == 0
-                && reg_overlap_mentioned_p (src[j], dest[i]))
-              good = 0;
+/* Move OPERANDS[1]/OPERANDS[2] into the low/high halves of OPERANDS[0].  */
  
-          if (good)
-            {
-              operands[opctr++] = dest[i];
-              operands[opctr++] = src[i];
-              copied |= 1 << i;
-            }
-        }
+void
+neon_split_vcombine (rtx operands[3])
+{
+  unsigned int dest = REGNO (operands[0]);
+  unsigned int src1 = REGNO (operands[1]);
+  unsigned int src2 = REGNO (operands[2]);
+  enum machine_mode halfmode = GET_MODE (operands[1]);
+  unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
+  rtx destlo, desthi;
+
+  if (src1 == dest && src2 == dest + halfregs)
+    {
+      /* No-op move.  Can't split to nothing; emit something.  */
+      emit_note (NOTE_INSN_DELETED);
+      return;
      }
  
-  gcc_assert (opctr == count * 2);
+  /* Preserve register attributes for variable tracking.  */
+  destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
+  desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
+                              GET_MODE_SIZE (halfmode));
+
+  /* Special case of reversed high/low parts.  Use VSWP.  */
+  if (src2 == dest && src1 == dest + halfregs)
+    {
+      rtx x = gen_rtx_SET (VOIDmode, destlo, operands[1]);
+      rtx y = gen_rtx_SET (VOIDmode, desthi, operands[2]);
+      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, x, y)));
+      return;
+    }
+
+  if (!reg_overlap_mentioned_p (operands[2], destlo))
+    {
+      /* Try to avoid unnecessary moves if part of the result
+        is in the right place already.  */
+      if (src1 != dest)
+       emit_move_insn (destlo, operands[1]);
+      if (src2 != dest + halfregs)
+       emit_move_insn (desthi, operands[2]);
+    }
+  else
+    {
+      if (src2 != dest + halfregs)
+       emit_move_insn (desthi, operands[2]);
+      if (src1 != dest)
+       emit_move_insn (destlo, operands[1]);
+    }
  }
  
  /* Expand an expression EXP that calls a built-in function,
@@ -19966,27 +21204,79 @@ arm_expand_builtin (tree exp,
  inline static int
  number_of_first_bit_set (unsigned mask)
  {
-  int bit;
+  return ctz_hwi (mask);
+}
+
+/* Like emit_multi_reg_push, but allowing for a different set of registers
+   to be described as saved.  MASK is the nonzero set of registers to be
+   saved; REAL_REGS is the set of registers to be described as saved.  If
+   REAL_REGS is 0, only describe the stack adjustment.  Return the insn.  */
+
+static rtx
+thumb1_emit_multi_reg_push (unsigned long mask, unsigned long real_regs)
+{
+  unsigned long regno;
+  rtx par[10], tmp, reg, insn;
+  int i, j;
  
-  for (bit = 0;
-       (mask & (1 << bit)) == 0;
-       ++bit)
-    continue;
+  /* Build the parallel of the registers actually being stored.  */
+  for (i = 0; mask; ++i, mask &= mask - 1)
+    {
+      regno = ctz_hwi (mask);
+      reg = gen_rtx_REG (SImode, regno);
+
+      if (i == 0)
+       tmp = gen_rtx_UNSPEC (BLKmode, gen_rtvec (1, reg), UNSPEC_PUSH_MULT);
+      else
+       tmp = gen_rtx_USE (VOIDmode, reg);
+
+      par[i] = tmp;
+    }
+
+  tmp = plus_constant (stack_pointer_rtx, -4 * i);
+  tmp = gen_rtx_PRE_MODIFY (Pmode, stack_pointer_rtx, tmp);
+  tmp = gen_frame_mem (BLKmode, tmp);
+  tmp = gen_rtx_SET (VOIDmode, tmp, par[0]);
+  par[0] = tmp;
+
+  tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (i, par));
+  insn = emit_insn (tmp);
+
+  /* Always build the stack adjustment note for unwind info.  */
+  tmp = plus_constant (stack_pointer_rtx, -4 * i);
+  tmp = gen_rtx_SET (VOIDmode, stack_pointer_rtx, tmp);
+  par[0] = tmp;
+
+  /* Build the parallel of the registers recorded as saved for unwind.  */
+  for (j = 0; real_regs; ++j, real_regs &= real_regs - 1)
+    {
+      regno = ctz_hwi (real_regs);
+      reg = gen_rtx_REG (SImode, regno);
+
+      tmp = plus_constant (stack_pointer_rtx, j * 4);
+      tmp = gen_frame_mem (SImode, tmp);
+      tmp = gen_rtx_SET (VOIDmode, tmp, reg);
+      RTX_FRAME_RELATED_P (tmp) = 1;
+      par[j + 1] = tmp;
+    }
+
+  if (j == 0)
+    tmp = par[0];
+  else
+    {
+      RTX_FRAME_RELATED_P (par[0]) = 1;
+      tmp = gen_rtx_SEQUENCE (VOIDmode, gen_rtvec_v (j + 1, par));
+    }
+
+  add_reg_note (insn, REG_FRAME_RELATED_EXPR, tmp);
  
-  return bit;
+  return insn;
  }
  
-/* Emit code to push or pop registers to or from the stack.  F is the
-   assembly file.  MASK is the registers to push or pop.  PUSH is
-   nonzero if we should push, and zero if we should pop.  For debugging
-   output, if pushing, adjust CFA_OFFSET by the amount of space added
-   to the stack.  REAL_REGS should have the same number of bits set as
-   MASK, and will be used instead (in the same order) to describe which
-   registers were saved - this is used to mark the save slots when we
-   push high registers after moving them to low registers.  */
+/* Emit code to pop registers from the stack.  F is the assembly
+   file.  MASK is the registers to pop.  */
  static void
-thumb_pushpop (FILE *f, unsigned long mask, int push, int *cfa_offset,
-              unsigned long real_regs)
+thumb_pop (FILE *f, unsigned long mask)
  {
    int regno;
    int lo_mask = mask & 0xFF;
@@ -19994,7 +21284,7 @@ thumb_pushpop (FILE *f, unsigned long mask, int push, int *cfa_offset,
  
    gcc_assert (mask);
  
-  if (lo_mask == 0 && !push && (mask & (1 << PC_REGNUM)))
+  if (lo_mask == 0 && (mask & (1 << PC_REGNUM)))
      {
        /* Special case.  Do not generate a POP PC statement here, do it in
          thumb_exit() */
@@ -20002,22 +21292,7 @@ thumb_pushpop (FILE *f, unsigned long mask, int push, int *cfa_offset,
        return;
      }
  
-  if (push && arm_except_unwind_info (&global_options) == UI_TARGET)
-    {
-      fprintf (f, "\t.save\t{");
-      for (regno = 0; regno < 15; regno++)
-       {
-         if (real_regs & (1 << regno))
-           {
-             if (real_regs & ((1 << regno) -1))
-               fprintf (f, ", ");
-             asm_fprintf (f, "%r", regno);
-           }
-       }
-      fprintf (f, "}\n");
-    }
-
-  fprintf (f, "\t%s\t{", push ? "push" : "pop");
+  fprintf (f, "\tpop\t{");
  
    /* Look at the low registers first.  */
    for (regno = 0; regno <= LAST_LO_REGNUM; regno++, lo_mask >>= 1)
@@ -20033,17 +21308,7 @@ thumb_pushpop (FILE *f, unsigned long mask, int push, int *cfa_offset,
         }
      }
  
-  if (push && (mask & (1 << LR_REGNUM)))
-    {
-      /* Catch pushing the LR.  */
-      if (mask & 0xFF)
-       fprintf (f, ", ");
-
-      asm_fprintf (f, "%r", LR_REGNUM);
-
-      pushed_words++;
-    }
-  else if (!push && (mask & (1 << PC_REGNUM)))
+  if (mask & (1 << PC_REGNUM))
      {
        /* Catch popping the PC.  */
        if (TARGET_INTERWORK || TARGET_BACKTRACE
@@ -20067,23 +21332,6 @@ thumb_pushpop (FILE *f, unsigned long mask, int push, int *cfa_offset,
      }
  
    fprintf (f, "}\n");
-
-  if (push && pushed_words && dwarf2out_do_frame ())
-    {
-      char *l = dwarf2out_cfi_label (false);
-      int pushed_mask = real_regs;
-
-      *cfa_offset += pushed_words * 4;
-      dwarf2out_def_cfa (l, SP_REGNUM, *cfa_offset);
-
-      pushed_words = 0;
-      pushed_mask = real_regs;
-      for (regno = 0; regno <= 14; regno++, pushed_mask >>= 1)
-       {
-         if (pushed_mask & 1)
-           dwarf2out_reg_save (l, regno, 4 * pushed_words++ - *cfa_offset);
-       }
-    }
  }
  
  /* Generate code to return from a thumb function.
@@ -20229,8 +21477,7 @@ thumb_exit (FILE *f, int reg_containing_return_addr)
      }
  
    /* Pop as many registers as we can.  */
-  thumb_pushpop (f, regs_available_for_popping, FALSE, NULL,
-                regs_available_for_popping);
+  thumb_pop (f, regs_available_for_popping);
  
    /* Process the registers we popped.  */
    if (reg_containing_return_addr == -1)
@@ -20311,8 +21558,7 @@ thumb_exit (FILE *f, int reg_containing_return_addr)
        int  popped_into;
        int  move_to;
  
-      thumb_pushpop (f, regs_available_for_popping, FALSE, NULL,
-                    regs_available_for_popping);
+      thumb_pop (f, regs_available_for_popping);
  
        /* We have popped either FP or SP.
          Move whichever one it is into the correct register.  */
@@ -20332,8 +21578,7 @@ thumb_exit (FILE *f, int reg_containing_return_addr)
      {
        int  popped_into;
  
-      thumb_pushpop (f, regs_available_for_popping, FALSE, NULL,
-                    regs_available_for_popping);
+      thumb_pop (f, regs_available_for_popping);
  
        popped_into = number_of_first_bit_set (regs_available_for_popping);
  
@@ -20615,7 +21860,8 @@ thumb_unexpanded_epilogue (void)
    if (extra_pop > 0)
      {
        unsigned long extra_mask = (1 << extra_pop) - 1;
-      live_regs_mask |= extra_mask << (size / UNITS_PER_WORD);
+      live_regs_mask |= extra_mask << ((size + UNITS_PER_WORD - 1) 
+                                      / UNITS_PER_WORD);
      }
  
    /* The prolog may have pushed some high registers to use as
@@ -20665,7 +21911,7 @@ thumb_unexpanded_epilogue (void)
           mask &= (2 << regno) - 1;     /* A noop if regno == 8 */
  
           /* Pop the values into the low register(s).  */
-         thumb_pushpop (asm_out_file, mask, 0, NULL, mask);
+         thumb_pop (asm_out_file, mask);
  
           /* Move the value(s) into the high registers.  */
           for (regno = 0; regno <= LAST_LO_REGNUM; regno++)
@@ -20697,12 +21943,11 @@ thumb_unexpanded_epilogue (void)
          structure was created which includes an adjusted stack
          pointer, so just pop everything.  */
        if (live_regs_mask)
-       thumb_pushpop (asm_out_file, live_regs_mask, FALSE, NULL,
-                      live_regs_mask);
+       thumb_pop (asm_out_file, live_regs_mask);
  
        /* We have either just popped the return address into the
          PC or it is was kept in LR for the entire function.
-        Note that thumb_pushpop has already called thumb_exit if the
+        Note that thumb_pop has already called thumb_exit if the
          PC was in the list.  */
        if (!had_to_push_lr)
         thumb_exit (asm_out_file, LR_REGNUM);
@@ -20711,8 +21956,7 @@ thumb_unexpanded_epilogue (void)
      {
        /* Pop everything but the return address.  */
        if (live_regs_mask)
-       thumb_pushpop (asm_out_file, live_regs_mask, FALSE, NULL,
-                      live_regs_mask);
+       thumb_pop (asm_out_file, live_regs_mask);
  
        if (had_to_push_lr)
         {
@@ -20724,8 +21968,7 @@ thumb_unexpanded_epilogue (void)
             }
  
           /* Get the return address into a temporary register.  */
-         thumb_pushpop (asm_out_file, 1 << LAST_ARG_REGNUM, 0, NULL,
-                        1 << LAST_ARG_REGNUM);
+         thumb_pop (asm_out_file, 1 << LAST_ARG_REGNUM);
  
           if (size > 12)
             {
@@ -20849,17 +22092,20 @@ thumb_compute_initial_elimination_offset (unsigned int from, unsigned int to)
      }
  }
  
-/* Generate the rest of a function's prologue.  */
+/* Generate the function's prologue.  */
+
  void
  thumb1_expand_prologue (void)
  {
-  rtx insn, dwarf;
+  rtx insn;
  
    HOST_WIDE_INT amount;
    arm_stack_offsets *offsets;
    unsigned long func_type;
    int regno;
    unsigned long live_regs_mask;
+  unsigned long l_mask;
+  unsigned high_regs_pushed = 0;
  
    func_type = arm_current_func_type ();
  
@@ -20873,72 +22119,270 @@ thumb1_expand_prologue (void)
        return;
      }
  
+  if (is_called_in_ARM_mode (current_function_decl))
+    emit_insn (gen_prologue_thumb1_interwork ());
+
    offsets = arm_get_frame_offsets ();
    live_regs_mask = offsets->saved_regs_mask;
-  /* Load the pic register before setting the frame pointer,
-     so we can use r7 as a temporary work register.  */
-  if (flag_pic && arm_pic_register != INVALID_REGNUM)
-    arm_load_pic_register (live_regs_mask);
-
-  if (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0)
-    emit_move_insn (gen_rtx_REG (Pmode, ARM_HARD_FRAME_POINTER_REGNUM),
-                   stack_pointer_rtx);
  
-  if (flag_stack_usage_info)
-    current_function_static_stack_size
-      = offsets->outgoing_args - offsets->saved_args;
+  /* Extract a mask of the ones we can give to the Thumb's push instruction.  */
+  l_mask = live_regs_mask & 0x40ff;
+  /* Then count how many other high registers will need to be pushed.  */
+  high_regs_pushed = bit_count (live_regs_mask & 0x0f00);
  
-  amount = offsets->outgoing_args - offsets->saved_regs;
-  amount -= 4 * thumb1_extra_regs_pushed (offsets, true);
-  if (amount)
+  if (crtl->args.pretend_args_size)
      {
-      if (amount < 512)
+      rtx x = GEN_INT (-crtl->args.pretend_args_size);
+
+      if (cfun->machine->uses_anonymous_args)
         {
-         insn = emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx,
-                                       GEN_INT (- amount)));
-         RTX_FRAME_RELATED_P (insn) = 1;
+         int num_pushes = ARM_NUM_INTS (crtl->args.pretend_args_size);
+         unsigned long mask;
+
+         mask = 1ul << (LAST_ARG_REGNUM + 1);
+         mask -= 1ul << (LAST_ARG_REGNUM + 1 - num_pushes);
+
+         insn = thumb1_emit_multi_reg_push (mask, 0);
         }
        else
         {
-         rtx reg;
+         insn = emit_insn (gen_addsi3 (stack_pointer_rtx,
+                                       stack_pointer_rtx, x));
+       }
+      RTX_FRAME_RELATED_P (insn) = 1;
+    }
  
-         /* The stack decrement is too big for an immediate value in a single
-            insn.  In theory we could issue multiple subtracts, but after
-            three of them it becomes more space efficient to place the full
-            value in the constant pool and load into a register.  (Also the
-            ARM debugger really likes to see only one stack decrement per
-            function).  So instead we look for a scratch register into which
-            we can load the decrement, and then we subtract this from the
-            stack pointer.  Unfortunately on the thumb the only available
-            scratch registers are the argument registers, and we cannot use
-            these as they may hold arguments to the function.  Instead we
-            attempt to locate a call preserved register which is used by this
-            function.  If we can find one, then we know that it will have
-            been pushed at the start of the prologue and so we can corrupt
-            it now.  */
-         for (regno = LAST_ARG_REGNUM + 1; regno <= LAST_LO_REGNUM; regno++)
-           if (live_regs_mask & (1 << regno))
-             break;
+  if (TARGET_BACKTRACE)
+    {
+      HOST_WIDE_INT offset = 0;
+      unsigned work_register;
+      rtx work_reg, x, arm_hfp_rtx;
  
-         gcc_assert(regno <= LAST_LO_REGNUM);
+      /* We have been asked to create a stack backtrace structure.
+         The code looks like this:
  
-         reg = gen_rtx_REG (SImode, regno);
+        0   .align 2
+        0   func:
+         0     sub   SP, #16         Reserve space for 4 registers.
+        2     push  {R7}            Push low registers.
+         4     add   R7, SP, #20     Get the stack pointer before the push.
+         6     str   R7, [SP, #8]    Store the stack pointer
+                                       (before reserving the space).
+         8     mov   R7, PC          Get hold of the start of this code + 12.
+        10     str   R7, [SP, #16]   Store it.
+        12     mov   R7, FP          Get hold of the current frame pointer.
+        14     str   R7, [SP, #4]    Store it.
+        16     mov   R7, LR          Get hold of the current return address.
+        18     str   R7, [SP, #12]   Store it.
+        20     add   R7, SP, #16     Point at the start of the
+                                       backtrace structure.
+        22     mov   FP, R7          Put this value into the frame pointer.  */
  
-         emit_insn (gen_movsi (reg, GEN_INT (- amount)));
+      work_register = thumb_find_work_register (live_regs_mask);
+      work_reg = gen_rtx_REG (SImode, work_register);
+      arm_hfp_rtx = gen_rtx_REG (SImode, ARM_HARD_FRAME_POINTER_REGNUM);
  
-         insn = emit_insn (gen_addsi3 (stack_pointer_rtx,
-                                       stack_pointer_rtx, reg));
+      insn = emit_insn (gen_addsi3 (stack_pointer_rtx,
+                                   stack_pointer_rtx, GEN_INT (-16)));
+      RTX_FRAME_RELATED_P (insn) = 1;
+
+      if (l_mask)
+       {
+         insn = thumb1_emit_multi_reg_push (l_mask, l_mask);
           RTX_FRAME_RELATED_P (insn) = 1;
-         dwarf = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
-                              plus_constant (stack_pointer_rtx,
-                                             -amount));
-         RTX_FRAME_RELATED_P (dwarf) = 1;
-         add_reg_note (insn, REG_FRAME_RELATED_EXPR, dwarf);
+
+         offset = bit_count (l_mask) * UNITS_PER_WORD;
         }
-    }
  
-  if (frame_pointer_needed)
-    thumb_set_frame_pointer (offsets);
+      x = GEN_INT (offset + 16 + crtl->args.pretend_args_size);
+      emit_insn (gen_addsi3 (work_reg, stack_pointer_rtx, x));
+
+      x = plus_constant (stack_pointer_rtx, offset + 4);
+      x = gen_frame_mem (SImode, x);
+      emit_move_insn (x, work_reg);
+
+      /* Make sure that the instruction fetching the PC is in the right place
+        to calculate "start of backtrace creation code + 12".  */
+      /* ??? The stores using the common WORK_REG ought to be enough to
+        prevent the scheduler from doing anything weird.  Failing that
+        we could always move all of the following into an UNSPEC_VOLATILE.  */
+      if (l_mask)
+       {
+         x = gen_rtx_REG (SImode, PC_REGNUM);
+         emit_move_insn (work_reg, x);
+
+         x = plus_constant (stack_pointer_rtx, offset + 12);
+         x = gen_frame_mem (SImode, x);
+         emit_move_insn (x, work_reg);
+
+         emit_move_insn (work_reg, arm_hfp_rtx);
+
+         x = plus_constant (stack_pointer_rtx, offset);
+         x = gen_frame_mem (SImode, x);
+         emit_move_insn (x, work_reg);
+       }
+      else
+       {
+         emit_move_insn (work_reg, arm_hfp_rtx);
+
+         x = plus_constant (stack_pointer_rtx, offset);
+         x = gen_frame_mem (SImode, x);
+         emit_move_insn (x, work_reg);
+
+         x = gen_rtx_REG (SImode, PC_REGNUM);
+         emit_move_insn (work_reg, x);
+
+         x = plus_constant (stack_pointer_rtx, offset + 12);
+         x = gen_frame_mem (SImode, x);
+         emit_move_insn (x, work_reg);
+       }
+
+      x = gen_rtx_REG (SImode, LR_REGNUM);
+      emit_move_insn (work_reg, x);
+
+      x = plus_constant (stack_pointer_rtx, offset + 8);
+      x = gen_frame_mem (SImode, x);
+      emit_move_insn (x, work_reg);
+
+      x = GEN_INT (offset + 12);
+      emit_insn (gen_addsi3 (work_reg, stack_pointer_rtx, x));
+
+      emit_move_insn (arm_hfp_rtx, work_reg);
+    }
+  /* Optimization:  If we are not pushing any low registers but we are going
+     to push some high registers then delay our first push.  This will just
+     be a push of LR and we can combine it with the push of the first high
+     register.  */
+  else if ((l_mask & 0xff) != 0
+          || (high_regs_pushed == 0 && l_mask))
+    {
+      unsigned long mask = l_mask;
+      mask |= (1 << thumb1_extra_regs_pushed (offsets, true)) - 1;
+      insn = thumb1_emit_multi_reg_push (mask, mask);
+      RTX_FRAME_RELATED_P (insn) = 1;
+    }
+
+  if (high_regs_pushed)
+    {
+      unsigned pushable_regs;
+      unsigned next_hi_reg;
+
+      for (next_hi_reg = 12; next_hi_reg > LAST_LO_REGNUM; next_hi_reg--)
+       if (live_regs_mask & (1 << next_hi_reg))
+         break;
+
+      pushable_regs = l_mask & 0xff;
+
+      if (pushable_regs == 0)
+       pushable_regs = 1 << thumb_find_work_register (live_regs_mask);
+
+      while (high_regs_pushed > 0)
+       {
+         unsigned long real_regs_mask = 0;
+
+         for (regno = LAST_LO_REGNUM; regno >= 0; regno --)
+           {
+             if (pushable_regs & (1 << regno))
+               {
+                 emit_move_insn (gen_rtx_REG (SImode, regno),
+                                 gen_rtx_REG (SImode, next_hi_reg));
+
+                 high_regs_pushed --;
+                 real_regs_mask |= (1 << next_hi_reg);
+
+                 if (high_regs_pushed)
+                   {
+                     for (next_hi_reg --; next_hi_reg > LAST_LO_REGNUM;
+                          next_hi_reg --)
+                       if (live_regs_mask & (1 << next_hi_reg))
+                         break;
+                   }
+                 else
+                   {
+                     pushable_regs &= ~((1 << regno) - 1);
+                     break;
+                   }
+               }
+           }
+
+         /* If we had to find a work register and we have not yet
+            saved the LR then add it to the list of regs to push.  */
+         if (l_mask == (1 << LR_REGNUM))
+           {
+             pushable_regs |= l_mask;
+             real_regs_mask |= l_mask;
+             l_mask = 0;
+           }
+
+         insn = thumb1_emit_multi_reg_push (pushable_regs, real_regs_mask);
+         RTX_FRAME_RELATED_P (insn) = 1;
+       }
+    }
+
+  /* Load the pic register before setting the frame pointer,
+     so we can use r7 as a temporary work register.  */
+  if (flag_pic && arm_pic_register != INVALID_REGNUM)
+    arm_load_pic_register (live_regs_mask);
+
+  if (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0)
+    emit_move_insn (gen_rtx_REG (Pmode, ARM_HARD_FRAME_POINTER_REGNUM),
+                   stack_pointer_rtx);
+
+  if (flag_stack_usage_info)
+    current_function_static_stack_size
+      = offsets->outgoing_args - offsets->saved_args;
+
+  amount = offsets->outgoing_args - offsets->saved_regs;
+  amount -= 4 * thumb1_extra_regs_pushed (offsets, true);
+  if (amount)
+    {
+      if (amount < 512)
+       {
+         insn = emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx,
+                                       GEN_INT (- amount)));
+         RTX_FRAME_RELATED_P (insn) = 1;
+       }
+      else
+       {
+         rtx reg, dwarf;
+
+         /* The stack decrement is too big for an immediate value in a single
+            insn.  In theory we could issue multiple subtracts, but after
+            three of them it becomes more space efficient to place the full
+            value in the constant pool and load into a register.  (Also the
+            ARM debugger really likes to see only one stack decrement per
+            function).  So instead we look for a scratch register into which
+            we can load the decrement, and then we subtract this from the
+            stack pointer.  Unfortunately on the thumb the only available
+            scratch registers are the argument registers, and we cannot use
+            these as they may hold arguments to the function.  Instead we
+            attempt to locate a call preserved register which is used by this
+            function.  If we can find one, then we know that it will have
+            been pushed at the start of the prologue and so we can corrupt
+            it now.  */
+         for (regno = LAST_ARG_REGNUM + 1; regno <= LAST_LO_REGNUM; regno++)
+           if (live_regs_mask & (1 << regno))
+             break;
+
+         gcc_assert(regno <= LAST_LO_REGNUM);
+
+         reg = gen_rtx_REG (SImode, regno);
+
+         emit_insn (gen_movsi (reg, GEN_INT (- amount)));
+
+         insn = emit_insn (gen_addsi3 (stack_pointer_rtx,
+                                       stack_pointer_rtx, reg));
+
+         dwarf = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+                              plus_constant (stack_pointer_rtx,
+                                             -amount));
+         add_reg_note (insn, REG_FRAME_RELATED_EXPR, dwarf);
+         RTX_FRAME_RELATED_P (insn) = 1;
+       }
+    }
+
+  if (frame_pointer_needed)
+    thumb_set_frame_pointer (offsets);
  
    /* If we are profiling, make sure no instructions are scheduled before
       the call to mcount.  Similarly if the user has requested no
@@ -20980,6 +22424,8 @@ thumb1_expand_epilogue (void)
    gcc_assert (amount >= 0);
    if (amount)
      {
+      emit_insn (gen_blockage ());
+
        if (amount < 512)
         emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx,
                                GEN_INT (amount)));
@@ -21010,321 +22456,113 @@ thumb1_expand_epilogue (void)
      emit_use (gen_rtx_REG (SImode, LR_REGNUM));
  }
  
-static void
-thumb1_output_function_prologue (FILE *f, HOST_WIDE_INT size ATTRIBUTE_UNUSED)
-{
-  arm_stack_offsets *offsets;
-  unsigned long live_regs_mask = 0;
-  unsigned long l_mask;
-  unsigned high_regs_pushed = 0;
-  int cfa_offset = 0;
-  int regno;
-
-  if (IS_NAKED (arm_current_func_type ()))
-    return;
+/* Implementation of insn prologue_thumb1_interwork.  This is the first
+   "instruction" of a function called in ARM mode.  Swap to thumb mode.  */
  
-  if (is_called_in_ARM_mode (current_function_decl))
-    {
-      const char * name;
+const char *
+thumb1_output_interwork (void)
+{
+  const char * name;
+  FILE *f = asm_out_file;
  
-      gcc_assert (GET_CODE (DECL_RTL (current_function_decl)) == MEM);
-      gcc_assert (GET_CODE (XEXP (DECL_RTL (current_function_decl), 0))
-                 == SYMBOL_REF);
-      name = XSTR  (XEXP (DECL_RTL (current_function_decl), 0), 0);
+  gcc_assert (GET_CODE (DECL_RTL (current_function_decl)) == MEM);
+  gcc_assert (GET_CODE (XEXP (DECL_RTL (current_function_decl), 0))
+             == SYMBOL_REF);
+  name = XSTR (XEXP (DECL_RTL (current_function_decl), 0), 0);
  
-      /* Generate code sequence to switch us into Thumb mode.  */
-      /* The .code 32 directive has already been emitted by
-        ASM_DECLARE_FUNCTION_NAME.  */
-      asm_fprintf (f, "\torr\t%r, %r, #1\n", IP_REGNUM, PC_REGNUM);
-      asm_fprintf (f, "\tbx\t%r\n", IP_REGNUM);
+  /* Generate code sequence to switch us into Thumb mode.  */
+  /* The .code 32 directive has already been emitted by
+     ASM_DECLARE_FUNCTION_NAME.  */
+  asm_fprintf (f, "\torr\t%r, %r, #1\n", IP_REGNUM, PC_REGNUM);
+  asm_fprintf (f, "\tbx\t%r\n", IP_REGNUM);
  
-      /* Generate a label, so that the debugger will notice the
-        change in instruction sets.  This label is also used by
-        the assembler to bypass the ARM code when this function
-        is called from a Thumb encoded function elsewhere in the
-        same file.  Hence the definition of STUB_NAME here must
-        agree with the definition in gas/config/tc-arm.c.  */
+  /* Generate a label, so that the debugger will notice the
+     change in instruction sets.  This label is also used by
+     the assembler to bypass the ARM code when this function
+     is called from a Thumb encoded function elsewhere in the
+     same file.  Hence the definition of STUB_NAME here must
+     agree with the definition in gas/config/tc-arm.c.  */
  
  #define STUB_NAME ".real_start_of"
  
-      fprintf (f, "\t.code\t16\n");
+  fprintf (f, "\t.code\t16\n");
  #ifdef ARM_PE
-      if (arm_dllexport_name_p (name))
-        name = arm_strip_name_encoding (name);
+  if (arm_dllexport_name_p (name))
+    name = arm_strip_name_encoding (name);
  #endif
-      asm_fprintf (f, "\t.globl %s%U%s\n", STUB_NAME, name);
-      fprintf (f, "\t.thumb_func\n");
-      asm_fprintf (f, "%s%U%s:\n", STUB_NAME, name);
-    }
+  asm_fprintf (f, "\t.globl %s%U%s\n", STUB_NAME, name);
+  fprintf (f, "\t.thumb_func\n");
+  asm_fprintf (f, "%s%U%s:\n", STUB_NAME, name);
  
-  if (crtl->args.pretend_args_size)
-    {
-      /* Output unwind directive for the stack adjustment.  */
-      if (arm_except_unwind_info (&global_options) == UI_TARGET)
-       fprintf (f, "\t.pad #%d\n",
-                crtl->args.pretend_args_size);
+  return "";
+}
  
-      if (cfun->machine->uses_anonymous_args)
-       {
-         int num_pushes;
+/* Handle the case of a double word load into a low register from
+   a computed memory address.  The computed address may involve a
+   register which is overwritten by the load.  */
+const char *
+thumb_load_double_from_address (rtx *operands)
+{
+  rtx addr;
+  rtx base;
+  rtx offset;
+  rtx arg1;
+  rtx arg2;
  
-         fprintf (f, "\tpush\t{");
+  gcc_assert (GET_CODE (operands[0]) == REG);
+  gcc_assert (GET_CODE (operands[1]) == MEM);
  
-         num_pushes = ARM_NUM_INTS (crtl->args.pretend_args_size);
+  /* Get the memory address.  */
+  addr = XEXP (operands[1], 0);
  
-         for (regno = LAST_ARG_REGNUM + 1 - num_pushes;
-              regno <= LAST_ARG_REGNUM;
-              regno++)
-           asm_fprintf (f, "%r%s", regno,
-                        regno == LAST_ARG_REGNUM ? "" : ", ");
+  /* Work out how the memory address is computed.  */
+  switch (GET_CODE (addr))
+    {
+    case REG:
+      operands[2] = adjust_address (operands[1], SImode, 4);
  
-         fprintf (f, "}\n");
+      if (REGNO (operands[0]) == REGNO (addr))
+       {
+         output_asm_insn ("ldr\t%H0, %2", operands);
+         output_asm_insn ("ldr\t%0, %1", operands);
         }
        else
-       asm_fprintf (f, "\tsub\t%r, %r, #%d\n",
-                    SP_REGNUM, SP_REGNUM,
-                    crtl->args.pretend_args_size);
-
-      /* We don't need to record the stores for unwinding (would it
-        help the debugger any if we did?), but record the change in
-        the stack pointer.  */
-      if (dwarf2out_do_frame ())
         {
-         char *l = dwarf2out_cfi_label (false);
-
-         cfa_offset = cfa_offset + crtl->args.pretend_args_size;
-         dwarf2out_def_cfa (l, SP_REGNUM, cfa_offset);
+         output_asm_insn ("ldr\t%0, %1", operands);
+         output_asm_insn ("ldr\t%H0, %2", operands);
         }
-    }
-
-  /* Get the registers we are going to push.  */
-  offsets = arm_get_frame_offsets ();
-  live_regs_mask = offsets->saved_regs_mask;
-  /* Extract a mask of the ones we can give to the Thumb's push instruction.  */
-  l_mask = live_regs_mask & 0x40ff;
-  /* Then count how many other high registers will need to be pushed.  */
-  high_regs_pushed = bit_count (live_regs_mask & 0x0f00);
-
-  if (TARGET_BACKTRACE)
-    {
-      unsigned offset;
-      unsigned work_register;
+      break;
  
-      /* We have been asked to create a stack backtrace structure.
-         The code looks like this:
+    case CONST:
+      /* Compute <address> + 4 for the high order load.  */
+      operands[2] = adjust_address (operands[1], SImode, 4);
  
-        0   .align 2
-        0   func:
-         0     sub   SP, #16         Reserve space for 4 registers.
-        2     push  {R7}            Push low registers.
-         4     add   R7, SP, #20     Get the stack pointer before the push.
-         6     str   R7, [SP, #8]    Store the stack pointer (before reserving the space).
-         8     mov   R7, PC          Get hold of the start of this code plus 12.
-        10     str   R7, [SP, #16]   Store it.
-        12     mov   R7, FP          Get hold of the current frame pointer.
-        14     str   R7, [SP, #4]    Store it.
-        16     mov   R7, LR          Get hold of the current return address.
-        18     str   R7, [SP, #12]   Store it.
-        20     add   R7, SP, #16     Point at the start of the backtrace structure.
-        22     mov   FP, R7          Put this value into the frame pointer.  */
+      output_asm_insn ("ldr\t%0, %1", operands);
+      output_asm_insn ("ldr\t%H0, %2", operands);
+      break;
  
-      work_register = thumb_find_work_register (live_regs_mask);
+    case PLUS:
+      arg1   = XEXP (addr, 0);
+      arg2   = XEXP (addr, 1);
  
-      if (arm_except_unwind_info (&global_options) == UI_TARGET)
-       asm_fprintf (f, "\t.pad #16\n");
+      if (CONSTANT_P (arg1))
+       base = arg2, offset = arg1;
+      else
+       base = arg1, offset = arg2;
  
-      asm_fprintf
-       (f, "\tsub\t%r, %r, #16\t%@ Create stack backtrace structure\n",
-        SP_REGNUM, SP_REGNUM);
+      gcc_assert (GET_CODE (base) == REG);
  
-      if (dwarf2out_do_frame ())
+      /* Catch the case of <address> = <reg> + <reg> */
+      if (GET_CODE (offset) == REG)
         {
-         char *l = dwarf2out_cfi_label (false);
+         int reg_offset = REGNO (offset);
+         int reg_base   = REGNO (base);
+         int reg_dest   = REGNO (operands[0]);
  
-         cfa_offset = cfa_offset + 16;
-         dwarf2out_def_cfa (l, SP_REGNUM, cfa_offset);
-       }
-
-      if (l_mask)
-       {
-         thumb_pushpop (f, l_mask, 1, &cfa_offset, l_mask);
-         offset = bit_count (l_mask) * UNITS_PER_WORD;
-       }
-      else
-       offset = 0;
-
-      asm_fprintf (f, "\tadd\t%r, %r, #%d\n", work_register, SP_REGNUM,
-                  offset + 16 + crtl->args.pretend_args_size);
-
-      asm_fprintf (f, "\tstr\t%r, [%r, #%d]\n", work_register, SP_REGNUM,
-                  offset + 4);
-
-      /* Make sure that the instruction fetching the PC is in the right place
-        to calculate "start of backtrace creation code + 12".  */
-      if (l_mask)
-       {
-         asm_fprintf (f, "\tmov\t%r, %r\n", work_register, PC_REGNUM);
-         asm_fprintf (f, "\tstr\t%r, [%r, #%d]\n", work_register, SP_REGNUM,
-                      offset + 12);
-         asm_fprintf (f, "\tmov\t%r, %r\n", work_register,
-                      ARM_HARD_FRAME_POINTER_REGNUM);
-         asm_fprintf (f, "\tstr\t%r, [%r, #%d]\n", work_register, SP_REGNUM,
-                      offset);
-       }
-      else
-       {
-         asm_fprintf (f, "\tmov\t%r, %r\n", work_register,
-                      ARM_HARD_FRAME_POINTER_REGNUM);
-         asm_fprintf (f, "\tstr\t%r, [%r, #%d]\n", work_register, SP_REGNUM,
-                      offset);
-         asm_fprintf (f, "\tmov\t%r, %r\n", work_register, PC_REGNUM);
-         asm_fprintf (f, "\tstr\t%r, [%r, #%d]\n", work_register, SP_REGNUM,
-                      offset + 12);
-       }
-
-      asm_fprintf (f, "\tmov\t%r, %r\n", work_register, LR_REGNUM);
-      asm_fprintf (f, "\tstr\t%r, [%r, #%d]\n", work_register, SP_REGNUM,
-                  offset + 8);
-      asm_fprintf (f, "\tadd\t%r, %r, #%d\n", work_register, SP_REGNUM,
-                  offset + 12);
-      asm_fprintf (f, "\tmov\t%r, %r\t\t%@ Backtrace structure created\n",
-                  ARM_HARD_FRAME_POINTER_REGNUM, work_register);
-    }
-  /* Optimization:  If we are not pushing any low registers but we are going
-     to push some high registers then delay our first push.  This will just
-     be a push of LR and we can combine it with the push of the first high
-     register.  */
-  else if ((l_mask & 0xff) != 0
-          || (high_regs_pushed == 0 && l_mask))
-    {
-      unsigned long mask = l_mask;
-      mask |= (1 << thumb1_extra_regs_pushed (offsets, true)) - 1;
-      thumb_pushpop (f, mask, 1, &cfa_offset, mask);
-    }
-
-  if (high_regs_pushed)
-    {
-      unsigned pushable_regs;
-      unsigned next_hi_reg;
-
-      for (next_hi_reg = 12; next_hi_reg > LAST_LO_REGNUM; next_hi_reg--)
-       if (live_regs_mask & (1 << next_hi_reg))
-         break;
-
-      pushable_regs = l_mask & 0xff;
-
-      if (pushable_regs == 0)
-       pushable_regs = 1 << thumb_find_work_register (live_regs_mask);
-
-      while (high_regs_pushed > 0)
-       {
-         unsigned long real_regs_mask = 0;
-
-         for (regno = LAST_LO_REGNUM; regno >= 0; regno --)
-           {
-             if (pushable_regs & (1 << regno))
-               {
-                 asm_fprintf (f, "\tmov\t%r, %r\n", regno, next_hi_reg);
-
-                 high_regs_pushed --;
-                 real_regs_mask |= (1 << next_hi_reg);
-
-                 if (high_regs_pushed)
-                   {
-                     for (next_hi_reg --; next_hi_reg > LAST_LO_REGNUM;
-                          next_hi_reg --)
-                       if (live_regs_mask & (1 << next_hi_reg))
-                         break;
-                   }
-                 else
-                   {
-                     pushable_regs &= ~((1 << regno) - 1);
-                     break;
-                   }
-               }
-           }
-
-         /* If we had to find a work register and we have not yet
-            saved the LR then add it to the list of regs to push.  */
-         if (l_mask == (1 << LR_REGNUM))
-           {
-             thumb_pushpop (f, pushable_regs | (1 << LR_REGNUM),
-                            1, &cfa_offset,
-                            real_regs_mask | (1 << LR_REGNUM));
-             l_mask = 0;
-           }
-         else
-           thumb_pushpop (f, pushable_regs, 1, &cfa_offset, real_regs_mask);
-       }
-    }
-}
-
-/* Handle the case of a double word load into a low register from
-   a computed memory address.  The computed address may involve a
-   register which is overwritten by the load.  */
-const char *
-thumb_load_double_from_address (rtx *operands)
-{
-  rtx addr;
-  rtx base;
-  rtx offset;
-  rtx arg1;
-  rtx arg2;
-
-  gcc_assert (GET_CODE (operands[0]) == REG);
-  gcc_assert (GET_CODE (operands[1]) == MEM);
-
-  /* Get the memory address.  */
-  addr = XEXP (operands[1], 0);
-
-  /* Work out how the memory address is computed.  */
-  switch (GET_CODE (addr))
-    {
-    case REG:
-      operands[2] = adjust_address (operands[1], SImode, 4);
-
-      if (REGNO (operands[0]) == REGNO (addr))
-       {
-         output_asm_insn ("ldr\t%H0, %2", operands);
-         output_asm_insn ("ldr\t%0, %1", operands);
-       }
-      else
-       {
-         output_asm_insn ("ldr\t%0, %1", operands);
-         output_asm_insn ("ldr\t%H0, %2", operands);
-       }
-      break;
-
-    case CONST:
-      /* Compute <address> + 4 for the high order load.  */
-      operands[2] = adjust_address (operands[1], SImode, 4);
-
-      output_asm_insn ("ldr\t%0, %1", operands);
-      output_asm_insn ("ldr\t%H0, %2", operands);
-      break;
-
-    case PLUS:
-      arg1   = XEXP (addr, 0);
-      arg2   = XEXP (addr, 1);
-
-      if (CONSTANT_P (arg1))
-       base = arg2, offset = arg1;
-      else
-       base = arg1, offset = arg2;
-
-      gcc_assert (GET_CODE (base) == REG);
-
-      /* Catch the case of <address> = <reg> + <reg> */
-      if (GET_CODE (offset) == REG)
-       {
-         int reg_offset = REGNO (offset);
-         int reg_base   = REGNO (base);
-         int reg_dest   = REGNO (operands[0]);
-
-         /* Add the base and offset registers together into the
-             higher destination register.  */
-         asm_fprintf (asm_out_file, "\tadd\t%r, %r, %r",
-                      reg_dest + 1, reg_base, reg_offset);
+         /* Add the base and offset registers together into the
+             higher destination register.  */
+         asm_fprintf (asm_out_file, "\tadd\t%r, %r, %r",
+                      reg_dest + 1, reg_base, reg_offset);
  
           /* Load the lower destination register from the address in
               the higher destination register.  */
@@ -21576,6 +22814,8 @@ arm_file_start (void)
        const char *fpu_name;
        if (arm_selected_arch)
         asm_fprintf (asm_out_file, "\t.arch %s\n", arm_selected_arch->name);
+      else if (strncmp (arm_selected_cpu->name, "generic", 7) == 0)
+       asm_fprintf (asm_out_file, "\t.arch %s\n", arm_selected_cpu->name + 8);
        else
         asm_fprintf (asm_out_file, "\t.cpu %s\n", arm_selected_cpu->name);
  
@@ -21592,9 +22832,9 @@ arm_file_start (void)
           if (arm_fpu_desc->model == ARM_FP_MODEL_VFP)
             {
               if (TARGET_HARD_FLOAT)
-               asm_fprintf (asm_out_file, "\t.eabi_attribute 27, 3\n");
+               EMIT_EABI_ATTRIBUTE (Tag_ABI_HardFP_use, 27, 3);
               if (TARGET_HARD_FLOAT_ABI)
-               asm_fprintf (asm_out_file, "\t.eabi_attribute 28, 1\n");
+               EMIT_EABI_ATTRIBUTE (Tag_ABI_VFP_args, 28, 1);
             }
         }
        asm_fprintf (asm_out_file, "\t.fpu %s\n", fpu_name);
@@ -21603,30 +22843,23 @@ arm_file_start (void)
           are used.  However we don't have any easy way of figuring this out.
          Conservatively record the setting that would have been used.  */
  
-      /* Tag_ABI_FP_rounding.  */
        if (flag_rounding_math)
-       asm_fprintf (asm_out_file, "\t.eabi_attribute 19, 1\n");
+       EMIT_EABI_ATTRIBUTE (Tag_ABI_FP_rounding, 19, 1);
+
        if (!flag_unsafe_math_optimizations)
         {
-         /* Tag_ABI_FP_denomal.  */
-         asm_fprintf (asm_out_file, "\t.eabi_attribute 20, 1\n");
-         /* Tag_ABI_FP_exceptions.  */
-         asm_fprintf (asm_out_file, "\t.eabi_attribute 21, 1\n");
+         EMIT_EABI_ATTRIBUTE (Tag_ABI_FP_denormal, 20, 1);
+         EMIT_EABI_ATTRIBUTE (Tag_ABI_FP_exceptions, 21, 1);
         }
-      /* Tag_ABI_FP_user_exceptions.  */
        if (flag_signaling_nans)
-       asm_fprintf (asm_out_file, "\t.eabi_attribute 22, 1\n");
-      /* Tag_ABI_FP_number_model.  */
-      asm_fprintf (asm_out_file, "\t.eabi_attribute 23, %d\n", 
-                  flag_finite_math_only ? 1 : 3);
-
-      /* Tag_ABI_align8_needed.  */
-      asm_fprintf (asm_out_file, "\t.eabi_attribute 24, 1\n");
-      /* Tag_ABI_align8_preserved.  */
-      asm_fprintf (asm_out_file, "\t.eabi_attribute 25, 1\n");
-      /* Tag_ABI_enum_size.  */
-      asm_fprintf (asm_out_file, "\t.eabi_attribute 26, %d\n",
-                  flag_short_enums ? 1 : 2);
+       EMIT_EABI_ATTRIBUTE (Tag_ABI_FP_user_exceptions, 22, 1);
+
+      EMIT_EABI_ATTRIBUTE (Tag_ABI_FP_number_model, 23,
+                          flag_finite_math_only ? 1 : 3);
+
+      EMIT_EABI_ATTRIBUTE (Tag_ABI_align8_needed, 24, 1);
+      EMIT_EABI_ATTRIBUTE (Tag_ABI_align8_preserved, 25, 1);
+      EMIT_EABI_ATTRIBUTE (Tag_ABI_enum_size, 26, flag_short_enums ? 1 : 2);
  
        /* Tag_ABI_optimization_goals.  */
        if (optimize_size)
@@ -21637,17 +22870,18 @@ arm_file_start (void)
         val = 1;
        else
         val = 6;
-      asm_fprintf (asm_out_file, "\t.eabi_attribute 30, %d\n", val);
+      EMIT_EABI_ATTRIBUTE (Tag_ABI_optimization_goals, 30, val);
+
+      EMIT_EABI_ATTRIBUTE (Tag_CPU_unaligned_access, 34, unaligned_access);
  
-      /* Tag_ABI_FP_16bit_format.  */
        if (arm_fp16_format)
-       asm_fprintf (asm_out_file, "\t.eabi_attribute 38, %d\n",
-                    (int)arm_fp16_format);
+       EMIT_EABI_ATTRIBUTE (Tag_ABI_FP_16bit_format, 38, (int) arm_fp16_format);
  
        if (arm_lang_output_object_attributes_hook)
         arm_lang_output_object_attributes_hook();
      }
-  default_file_start();
+
+  default_file_start ();
  }
  
  static void
@@ -21927,14 +23161,15 @@ arm_output_load_gr (rtx *operands)
     that way.  */
  
  static void
-arm_setup_incoming_varargs (CUMULATIVE_ARGS *pcum,
+arm_setup_incoming_varargs (cumulative_args_t pcum_v,
                             enum machine_mode mode,
                             tree type,
                             int *pretend_size,
                             int second_time ATTRIBUTE_UNUSED)
  {
+  CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
    int nregs;
-  
+
    cfun->machine->uses_anonymous_args = 1;
    if (pcum->pcs_variant <= ARM_PCS_AAPCS_LOCAL)
      {
@@ -21944,7 +23179,7 @@ arm_setup_incoming_varargs (CUMULATIVE_ARGS *pcum,
      }
    else
      nregs = pcum->nregs;
-  
+
    if (nregs < NUM_ARG_REGS)
      *pretend_size = (NUM_ARG_REGS - nregs) * UNITS_PER_WORD;
  }
@@ -22379,6 +23614,11 @@ arm_vector_mode_supported_p (enum machine_mode mode)
           || (mode == V8QImode)))
      return true;
  
+  if (TARGET_INT_SIMD && (mode == V4UQQmode || mode == V4QQmode
+      || mode == V2UHQmode || mode == V2HQmode || mode == V2UHAmode
+      || mode == V2HAmode))
+    return true;
+
    return false;
  }
  
@@ -22396,7 +23636,7 @@ arm_array_mode_supported_p (enum machine_mode mode,
    return false;
  }
  
-/* Use the option -mvectorize-with-neon-quad to override the use of doubleword
+/* Use the option -mvectorize-with-neon-double to override the use of quadword
     registers when autovectorizing for Neon, at least until multiple vector
     widths are supported properly by the middle-end.  */
  
@@ -22407,15 +23647,15 @@ arm_preferred_simd_mode (enum machine_mode mode)
      switch (mode)
        {
        case SFmode:
-       return TARGET_NEON_VECTORIZE_QUAD ? V4SFmode : V2SFmode;
+       return TARGET_NEON_VECTORIZE_DOUBLE ? V2SFmode : V4SFmode;
        case SImode:
-       return TARGET_NEON_VECTORIZE_QUAD ? V4SImode : V2SImode;
+       return TARGET_NEON_VECTORIZE_DOUBLE ? V2SImode : V4SImode;
        case HImode:
-       return TARGET_NEON_VECTORIZE_QUAD ? V8HImode : V4HImode;
+       return TARGET_NEON_VECTORIZE_DOUBLE ? V4HImode : V8HImode;
        case QImode:
-       return TARGET_NEON_VECTORIZE_QUAD ? V16QImode : V8QImode;
+       return TARGET_NEON_VECTORIZE_DOUBLE ? V8QImode : V16QImode;
        case DImode:
-       if (TARGET_NEON_VECTORIZE_QUAD)
+       if (!TARGET_NEON_VECTORIZE_DOUBLE)
           return V2DImode;
         break;
  
@@ -22439,7 +23679,7 @@ arm_preferred_simd_mode (enum machine_mode mode)
  }
  
  /* Implement TARGET_CLASS_LIKELY_SPILLED_P.
- 
+
     We need to define this for LO_REGS on Thumb-1.  Otherwise we can end up
     using r0-r4 for function arguments, r7 for the stack frame and don't have
     enough left over to do doubleword arithmetic.  For Thumb-2 all the
@@ -22465,7 +23705,7 @@ arm_small_register_classes_for_mode_p (enum machine_mode mode ATTRIBUTE_UNUSED)
  
  /* Implement TARGET_SHIFT_TRUNCATION_MASK.  SImode shifts use normal
     ARM insns and therefore guarantee that the shift count is modulo 256.
-   DImode shifts (those implemented by lib1funcs.asm or by optabs.c)
+   DImode shifts (those implemented by lib1funcs.S or by optabs.c)
     guarantee no particular behavior for out-of-range counts.  */
  
  static unsigned HOST_WIDE_INT
@@ -22730,13 +23970,6 @@ arm_unwind_emit_set (FILE * asm_out_file, rtx p)
           asm_fprintf (asm_out_file, "\t.movsp %r, #%d\n",
                        REGNO (e0), (int)INTVAL(XEXP (e1, 1)));
         }
-      else if (GET_CODE (e1) == UNSPEC && XINT (e1, 1) == UNSPEC_STACK_ALIGN)
-       {
-         /* Stack pointer save before alignment.  */
-         reg = REGNO (e0);
-         asm_fprintf (asm_out_file, "\t.unwind_raw 0, 0x%x @ vsp = r%d\n",
-                      reg + 0x90, reg);
-       }
        else
         abort ();
        break;
@@ -22752,7 +23985,8 @@ arm_unwind_emit_set (FILE * asm_out_file, rtx p)
  static void
  arm_unwind_emit (FILE * asm_out_file, rtx insn)
  {
-  rtx pat;
+  rtx note, pat;
+  bool handled_one = false;
  
    if (arm_except_unwind_info (&global_options) != UI_TARGET)
      return;
@@ -22762,14 +23996,56 @@ arm_unwind_emit (FILE * asm_out_file, rtx insn)
           || crtl->all_throwers_are_sibcalls))
      return;
  
-  if (GET_CODE (insn) == NOTE || !RTX_FRAME_RELATED_P (insn))
+  if (NOTE_P (insn) || !RTX_FRAME_RELATED_P (insn))
      return;
  
-  pat = find_reg_note (insn, REG_FRAME_RELATED_EXPR, NULL_RTX);
-  if (pat)
-    pat = XEXP (pat, 0);
-  else
-    pat = PATTERN (insn);
+  for (note = REG_NOTES (insn); note ; note = XEXP (note, 1))
+    {
+      pat = XEXP (note, 0);
+      switch (REG_NOTE_KIND (note))
+       {
+       case REG_FRAME_RELATED_EXPR:
+         goto found;
+
+       case REG_CFA_REGISTER:
+         if (pat == NULL)
+           {
+             pat = PATTERN (insn);
+             if (GET_CODE (pat) == PARALLEL)
+               pat = XVECEXP (pat, 0, 0);
+           }
+
+         /* Only emitted for IS_STACKALIGN re-alignment.  */
+         {
+           rtx dest, src;
+           unsigned reg;
+
+           src = SET_SRC (pat);
+           dest = SET_DEST (pat);
+
+           gcc_assert (src == stack_pointer_rtx);
+           reg = REGNO (dest);
+           asm_fprintf (asm_out_file, "\t.unwind_raw 0, 0x%x @ vsp = r%d\n",
+                        reg + 0x90, reg);
+         }
+         handled_one = true;
+         break;
+
+       case REG_CFA_DEF_CFA:
+       case REG_CFA_EXPRESSION:
+       case REG_CFA_ADJUST_CFA:
+       case REG_CFA_OFFSET:
+         /* ??? Only handling here what we actually emit.  */
+         gcc_unreachable ();
+
+       default:
+         break;
+       }
+    }
+  if (handled_one)
+    return;
+  pat = PATTERN (insn);
+ found:
  
    switch (GET_CODE (pat))
      {
@@ -22825,57 +24101,6 @@ arm_asm_init_sections (void)
  }
  #endif /* ARM_UNWIND_INFO */
  
-/* Implement TARGET_EXCEPT_UNWIND_INFO.  */
-
-static enum unwind_info_type
-arm_except_unwind_info (struct gcc_options *opts)
-{
-  /* Honor the --enable-sjlj-exceptions configure switch.  */
-#ifdef CONFIG_SJLJ_EXCEPTIONS
-  if (CONFIG_SJLJ_EXCEPTIONS)
-    return UI_SJLJ;
-#endif
-
-  /* If not using ARM EABI unwind tables... */
-  if (ARM_UNWIND_INFO)
-    {
-      /* For simplicity elsewhere in this file, indicate that all unwind
-        info is disabled if we're not emitting unwind tables.  */
-      if (!opts->x_flag_exceptions && !opts->x_flag_unwind_tables)
-       return UI_NONE;
-      else
-       return UI_TARGET;
-    }
-
-  /* ... we use sjlj exceptions for backwards compatibility.  */
-  return UI_SJLJ;
-}
-
-
-/* Handle UNSPEC DWARF call frame instructions.  These are needed for dynamic
-   stack alignment.  */
-
-static void
-arm_dwarf_handle_frame_unspec (const char *label, rtx pattern, int index)
-{
-  rtx unspec = SET_SRC (pattern);
-  gcc_assert (GET_CODE (unspec) == UNSPEC);
-
-  switch (index)
-    {
-    case UNSPEC_STACK_ALIGN:
-      /* ??? We should set the CFA = (SP & ~7).  At this point we haven't
-         put anything on the stack, so hopefully it won't matter.
-         CFA = SP will be correct after alignment.  */
-      dwarf2out_reg_save_reg (label, stack_pointer_rtx,
-                              SET_DEST (pattern));
-      break;
-    default:
-      gcc_unreachable ();
-    }
-}
-
-
  /* Output unwind directives for the start/end of a function.  */
  
  void
@@ -22928,6 +24153,9 @@ arm_emit_tls_decoration (FILE *fp, rtx x)
      case TLS_LE32:
        fputs ("(tpoff)", fp);
        break;
+    case TLS_DESCSEQ:
+      fputs ("(tlsdesc)", fp);
+      break;
      default:
        gcc_unreachable ();
      }
@@ -22937,9 +24165,11 @@ arm_emit_tls_decoration (FILE *fp, rtx x)
      case TLS_GD32:
      case TLS_LDM32:
      case TLS_IE32:
+    case TLS_DESCSEQ:
        fputs (" + (. - ", fp);
        output_addr_const (fp, XVECEXP (x, 0, 2));
-      fputs (" - ", fp);
+      /* For DESCSEQ the 3rd operand encodes thumbness, and is added, not subtracted.  */
+      fputs (reloc == TLS_DESCSEQ ? " + " : " - ", fp);
        output_addr_const (fp, XVECEXP (x, 0, 3));
        fputc (')', fp);
        break;
@@ -23017,7 +24247,7 @@ arm_output_shift(rtx * operands, int set_flags)
    const char *shift;
    HOST_WIDE_INT val;
    char c;
-  
+
    c = flag_chars[set_flags];
    if (TARGET_UNIFIED_ASM)
      {
@@ -23048,10 +24278,10 @@ thumb1_output_casesi (rtx *operands)
    switch (GET_MODE(diff_vec))
      {
      case QImode:
-      return (ADDR_DIFF_VEC_FLAGS (diff_vec).offset_unsigned ? 
+      return (ADDR_DIFF_VEC_FLAGS (diff_vec).offset_unsigned ?
               "bl\t%___gnu_thumb1_case_uqi" : "bl\t%___gnu_thumb1_case_sqi");
      case HImode:
-      return (ADDR_DIFF_VEC_FLAGS (diff_vec).offset_unsigned ? 
+      return (ADDR_DIFF_VEC_FLAGS (diff_vec).offset_unsigned ?
               "bl\t%___gnu_thumb1_case_uhi" : "bl\t%___gnu_thumb1_case_shi");
      case SImode:
        return "bl\t%___gnu_thumb1_case_si";
@@ -23101,9 +24331,13 @@ arm_issue_rate (void)
  {
    switch (arm_tune)
      {
+    case cortexa15:
+      return 3;
+
      case cortexr4:
      case cortexr4f:
      case cortexr5:
+    case genericv7a:
      case cortexa5:
      case cortexa8:
      case cortexa9:
@@ -23159,7 +24393,7 @@ arm_mangle_type (const_tree type)
  
    /* The ARM ABI documents (10th October 2008) say that "__va_list"
       has to be managled as if it is in the "std" namespace.  */
-  if (TARGET_AAPCS_BASED 
+  if (TARGET_AAPCS_BASED
        && lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
      {
        static bool warned;
@@ -23241,461 +24475,57 @@ arm_have_conditional_execution (void)
    return !TARGET_THUMB1;
  }
  
-/* Legitimize a memory reference for sync primitive implemented using
-   ldrex / strex.  We currently force the form of the reference to be
-   indirect without offset.  We do not yet support the indirect offset
-   addressing supported by some ARM targets for these
-   instructions.  */
-static rtx
-arm_legitimize_sync_memory (rtx memory)
+static unsigned int
+arm_autovectorize_vector_sizes (void)
  {
-  rtx addr = force_reg (Pmode, XEXP (memory, 0));
-  rtx legitimate_memory = gen_rtx_MEM (GET_MODE (memory), addr);
-
-  set_mem_alias_set (legitimate_memory, ALIAS_SET_MEMORY_BARRIER);
-  MEM_VOLATILE_P (legitimate_memory) = MEM_VOLATILE_P (memory);
-  return legitimate_memory;
+  return TARGET_NEON_VECTORIZE_DOUBLE ? 0 : (16 | 8);
  }
  
-/* An instruction emitter. */
-typedef void (* emit_f) (int label, const char *, rtx *);
-
-/* An instruction emitter that emits via the conventional
-   output_asm_insn.  */
-static void
-arm_emit (int label ATTRIBUTE_UNUSED, const char *pattern, rtx *operands)
+static bool
+arm_vector_alignment_reachable (const_tree type, bool is_packed)
  {
-  output_asm_insn (pattern, operands);
-}
-
-/* Count the number of emitted synchronization instructions.  */
-static unsigned arm_insn_count;
+  /* Vectors which aren't in packed structures will not be less aligned than
+     the natural alignment of their element type, so this is safe.  */
+  if (TARGET_NEON && !BYTES_BIG_ENDIAN)
+    return !is_packed;
  
-/* An emitter that counts emitted instructions but does not actually
-   emit instruction into the instruction stream.  */
-static void
-arm_count (int label,
-          const char *pattern ATTRIBUTE_UNUSED,
-          rtx *operands ATTRIBUTE_UNUSED)
-{
-  if (! label)
-    ++ arm_insn_count;
+  return default_builtin_vector_alignment_reachable (type, is_packed);
  }
  
-/* Construct a pattern using conventional output formatting and feed
-   it to output_asm_insn.  Provides a mechanism to construct the
-   output pattern on the fly.  Note the hard limit on the pattern
-   buffer size.  */
-static void ATTRIBUTE_PRINTF_4
-arm_output_asm_insn (emit_f emit, int label, rtx *operands,
-                    const char *pattern, ...)
+static bool
+arm_builtin_support_vector_misalignment (enum machine_mode mode,
+                                        const_tree type, int misalignment,
+                                        bool is_packed)
  {
-  va_list ap;
-  char buffer[256];
+  if (TARGET_NEON && !BYTES_BIG_ENDIAN)
+    {
+      HOST_WIDE_INT align = TYPE_ALIGN_UNIT (type);
  
-  va_start (ap, pattern);
-  vsprintf (buffer, pattern, ap);
-  va_end (ap);
-  emit (label, buffer, operands);
+      if (is_packed)
+        return align == 1;
+
+      /* If the misalignment is unknown, we should be able to handle the access
+        so long as it is not to a member of a packed data structure.  */
+      if (misalignment == -1)
+        return true;
+
+      /* Return true if the misalignment is a multiple of the natural alignment
+         of the vector's element type.  This is probably always going to be
+        true in practice, since we've already established that this isn't a
+        packed access.  */
+      return ((misalignment % align) == 0);
+    }
+
+  return default_builtin_support_vector_misalignment (mode, type, misalignment,
+                                                     is_packed);
  }
  
-/* Emit the memory barrier instruction, if any, provided by this
-   target to a specified emitter.  */
  static void
-arm_process_output_memory_barrier (emit_f emit, rtx *operands)
+arm_conditional_register_usage (void)
  {
-  if (TARGET_HAVE_DMB)
-    {
-      /* Note we issue a system level barrier. We should consider
-         issuing a inner shareabilty zone barrier here instead, ie.
-         "DMB ISH".  */
-      emit (0, "dmb\tsy", operands);
-      return;
-    }
-
-  if (TARGET_HAVE_DMB_MCR)
-    {
-      emit (0, "mcr\tp15, 0, r0, c7, c10, 5", operands);
-      return;
-    }
-
-  gcc_unreachable ();
-}
-
-/* Emit the memory barrier instruction, if any, provided by this
-   target.  */
-const char *
-arm_output_memory_barrier (rtx *operands)
-{
-  arm_process_output_memory_barrier (arm_emit, operands);
-  return "";
-}
-
-/* Helper to figure out the instruction suffix required on ldrex/strex
-   for operations on an object of the specified mode.  */
-static const char *
-arm_ldrex_suffix (enum machine_mode mode)
-{
-  switch (mode)
-    {
-    case QImode: return "b";
-    case HImode: return "h";
-    case SImode: return "";
-    case DImode: return "d";
-    default:
-      gcc_unreachable ();
-    }
-  return "";
-}
-
-/* Emit an ldrex{b,h,d, } instruction appropriate for the specified
-   mode.  */
-static void
-arm_output_ldrex (emit_f emit,
-                 enum machine_mode mode,
-                 rtx target,
-                 rtx memory)
-{
-  const char *suffix = arm_ldrex_suffix (mode);
-  rtx operands[2];
-
-  operands[0] = target;
-  operands[1] = memory;
-  arm_output_asm_insn (emit, 0, operands, "ldrex%s\t%%0, %%C1", suffix);
-}
-
-/* Emit a strex{b,h,d, } instruction appropriate for the specified
-   mode.  */
-static void
-arm_output_strex (emit_f emit,
-                 enum machine_mode mode,
-                 const char *cc,
-                 rtx result,
-                 rtx value,
-                 rtx memory)
-{
-  const char *suffix = arm_ldrex_suffix (mode);
-  rtx operands[3];
-
-  operands[0] = result;
-  operands[1] = value;
-  operands[2] = memory;
-  arm_output_asm_insn (emit, 0, operands, "strex%s%s\t%%0, %%1, %%C2", suffix,
-                      cc);
-}
-
-/* Helper to emit a two operand instruction.  */
-static void
-arm_output_op2 (emit_f emit, const char *mnemonic, rtx d, rtx s)
-{
-  rtx operands[2];
-
-  operands[0] = d;
-  operands[1] = s;
-  arm_output_asm_insn (emit, 0, operands, "%s\t%%0, %%1", mnemonic);
-}
-
-/* Helper to emit a three operand instruction.  */
-static void
-arm_output_op3 (emit_f emit, const char *mnemonic, rtx d, rtx a, rtx b)
-{
-  rtx operands[3];
-
-  operands[0] = d;
-  operands[1] = a;
-  operands[2] = b;
-  arm_output_asm_insn (emit, 0, operands, "%s\t%%0, %%1, %%2", mnemonic);
-}
-
-/* Emit a load store exclusive synchronization loop.
-
-   do
-     old_value = [mem]
-     if old_value != required_value
-       break;
-     t1 = sync_op (old_value, new_value)
-     [mem] = t1, t2 = [0|1]
-   while ! t2
-
-   Note:
-     t1 == t2 is not permitted
-     t1 == old_value is permitted
-
-   required_value:
-
-   RTX register or const_int representing the required old_value for
-   the modify to continue, if NULL no comparsion is performed.  */
-static void
-arm_output_sync_loop (emit_f emit,
-                     enum machine_mode mode,
-                     rtx old_value,
-                     rtx memory,
-                     rtx required_value,
-                     rtx new_value,
-                     rtx t1,
-                     rtx t2,
-                     enum attr_sync_op sync_op,
-                     int early_barrier_required)
-{
-  rtx operands[1];
-
-  gcc_assert (t1 != t2);
-
-  if (early_barrier_required)
-    arm_process_output_memory_barrier (emit, NULL);
-
-  arm_output_asm_insn (emit, 1, operands, "%sLSYT%%=:", LOCAL_LABEL_PREFIX);
-
-  arm_output_ldrex (emit, mode, old_value, memory);
-
-  if (required_value)
-    {
-      rtx operands[2];
-
-      operands[0] = old_value;
-      operands[1] = required_value;
-      arm_output_asm_insn (emit, 0, operands, "cmp\t%%0, %%1");
-      arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYB%%=", LOCAL_LABEL_PREFIX);
-    }
-
-  switch (sync_op)
-    {
-    case SYNC_OP_ADD:
-      arm_output_op3 (emit, "add", t1, old_value, new_value);
-      break;
-
-    case SYNC_OP_SUB:
-      arm_output_op3 (emit, "sub", t1, old_value, new_value);
-      break;
-
-    case SYNC_OP_IOR:
-      arm_output_op3 (emit, "orr", t1, old_value, new_value);
-      break;
-
-    case SYNC_OP_XOR:
-      arm_output_op3 (emit, "eor", t1, old_value, new_value);
-      break;
-
-    case SYNC_OP_AND:
-      arm_output_op3 (emit,"and", t1, old_value, new_value);
-      break;
-
-    case SYNC_OP_NAND:
-      arm_output_op3 (emit, "and", t1, old_value, new_value);
-      arm_output_op2 (emit, "mvn", t1, t1);
-      break;
-
-    case SYNC_OP_NONE:
-      t1 = new_value;
-      break;
-    }
-
-  if (t2)
-    {
-       arm_output_strex (emit, mode, "", t2, t1, memory);
-       operands[0] = t2;
-       arm_output_asm_insn (emit, 0, operands, "teq\t%%0, #0");
-       arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYT%%=",
-                           LOCAL_LABEL_PREFIX);
-    }
-  else
-    {
-      /* Use old_value for the return value because for some operations
-        the old_value can easily be restored.  This saves one register.  */
-      arm_output_strex (emit, mode, "", old_value, t1, memory);
-      operands[0] = old_value;
-      arm_output_asm_insn (emit, 0, operands, "teq\t%%0, #0");
-      arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYT%%=",
-                          LOCAL_LABEL_PREFIX);
-
-      switch (sync_op)
-       {
-       case SYNC_OP_ADD:
-         arm_output_op3 (emit, "sub", old_value, t1, new_value);
-         break;
-
-       case SYNC_OP_SUB:
-         arm_output_op3 (emit, "add", old_value, t1, new_value);
-         break;
-
-       case SYNC_OP_XOR:
-         arm_output_op3 (emit, "eor", old_value, t1, new_value);
-         break;
-
-       case SYNC_OP_NONE:
-         arm_output_op2 (emit, "mov", old_value, required_value);
-         break;
-
-       default:
-         gcc_unreachable ();
-       }
-    }
-
-  arm_process_output_memory_barrier (emit, NULL);
-  arm_output_asm_insn (emit, 1, operands, "%sLSYB%%=:", LOCAL_LABEL_PREFIX);
-}
-
-static rtx
-arm_get_sync_operand (rtx *operands, int index, rtx default_value)
-{
-  if (index > 0)
-    default_value = operands[index - 1];
-
-  return default_value;
-}
-
-#define FETCH_SYNC_OPERAND(NAME, DEFAULT) \
-  arm_get_sync_operand (operands, (int) get_attr_sync_##NAME (insn), DEFAULT);
-
-/* Extract the operands for a synchroniztion instruction from the
-   instructions attributes and emit the instruction.  */
-static void
-arm_process_output_sync_insn (emit_f emit, rtx insn, rtx *operands)
-{
-  rtx result, memory, required_value, new_value, t1, t2;
-  int early_barrier;
-  enum machine_mode mode;
-  enum attr_sync_op sync_op;
-
-  result = FETCH_SYNC_OPERAND(result, 0);
-  memory = FETCH_SYNC_OPERAND(memory, 0);
-  required_value = FETCH_SYNC_OPERAND(required_value, 0);
-  new_value = FETCH_SYNC_OPERAND(new_value, 0);
-  t1 = FETCH_SYNC_OPERAND(t1, 0);
-  t2 = FETCH_SYNC_OPERAND(t2, 0);
-  early_barrier =
-    get_attr_sync_release_barrier (insn) == SYNC_RELEASE_BARRIER_YES;
-  sync_op = get_attr_sync_op (insn);
-  mode = GET_MODE (memory);
-
-  arm_output_sync_loop (emit, mode, result, memory, required_value,
-                       new_value, t1, t2, sync_op, early_barrier);
-}
-
-/* Emit a synchronization instruction loop.  */
-const char *
-arm_output_sync_insn (rtx insn, rtx *operands)
-{
-  arm_process_output_sync_insn (arm_emit, insn, operands);
-  return "";
-}
-
-/* Count the number of machine instruction that will be emitted for a
-   synchronization instruction.  Note that the emitter used does not
-   emit instructions, it just counts instructions being carefull not
-   to count labels.  */
-unsigned int
-arm_sync_loop_insns (rtx insn, rtx *operands)
-{
-  arm_insn_count = 0;
-  arm_process_output_sync_insn (arm_count, insn, operands);
-  return arm_insn_count;
-}
-
-/* Helper to call a target sync instruction generator, dealing with
-   the variation in operands required by the different generators.  */
-static rtx
-arm_call_generator (struct arm_sync_generator *generator, rtx old_value,
-                   rtx memory, rtx required_value, rtx new_value)
-{
-  switch (generator->op)
-    {
-    case arm_sync_generator_omn:
-      gcc_assert (! required_value);
-      return generator->u.omn (old_value, memory, new_value);
-
-    case arm_sync_generator_omrn:
-      gcc_assert (required_value);
-      return generator->u.omrn (old_value, memory, required_value, new_value);
-    }
-
-  return NULL;
-}
-
-/* Expand a synchronization loop. The synchronization loop is expanded
-   as an opaque block of instructions in order to ensure that we do
-   not subsequently get extraneous memory accesses inserted within the
-   critical region. The exclusive access property of ldrex/strex is
-   only guaranteed in there are no intervening memory accesses. */
-void
-arm_expand_sync (enum machine_mode mode,
-                struct arm_sync_generator *generator,
-                rtx target, rtx memory, rtx required_value, rtx new_value)
-{
-  if (target == NULL)
-    target = gen_reg_rtx (mode);
-
-  memory = arm_legitimize_sync_memory (memory);
-  if (mode != SImode)
-    {
-      rtx load_temp = gen_reg_rtx (SImode);
-
-      if (required_value)
-       required_value = convert_modes (SImode, mode, required_value, true);
-
-      new_value = convert_modes (SImode, mode, new_value, true);
-      emit_insn (arm_call_generator (generator, load_temp, memory,
-                                    required_value, new_value));
-      emit_move_insn (target, gen_lowpart (mode, load_temp));
-    }
-  else
-    {
-      emit_insn (arm_call_generator (generator, target, memory, required_value,
-                                    new_value));
-    }
-}
-
-static unsigned int
-arm_autovectorize_vector_sizes (void)
-{
-  return TARGET_NEON_VECTORIZE_QUAD ? 16 | 8 : 0;
-}
-
-static bool
-arm_vector_alignment_reachable (const_tree type, bool is_packed)
-{
-  /* Vectors which aren't in packed structures will not be less aligned than
-     the natural alignment of their element type, so this is safe.  */
-  if (TARGET_NEON && !BYTES_BIG_ENDIAN)
-    return !is_packed;
-
-  return default_builtin_vector_alignment_reachable (type, is_packed);
-}
-
-static bool
-arm_builtin_support_vector_misalignment (enum machine_mode mode,
-                                        const_tree type, int misalignment,
-                                        bool is_packed)
-{
-  if (TARGET_NEON && !BYTES_BIG_ENDIAN)
-    {
-      HOST_WIDE_INT align = TYPE_ALIGN_UNIT (type);
-
-      if (is_packed)
-        return align == 1;
-
-      /* If the misalignment is unknown, we should be able to handle the access
-        so long as it is not to a member of a packed data structure.  */
-      if (misalignment == -1)
-        return true;
-
-      /* Return true if the misalignment is a multiple of the natural alignment
-         of the vector's element type.  This is probably always going to be
-        true in practice, since we've already established that this isn't a
-        packed access.  */
-      return ((misalignment % align) == 0);
-    }
-  
-  return default_builtin_support_vector_misalignment (mode, type, misalignment,
-                                                     is_packed);
-}
-
-static void
-arm_conditional_register_usage (void)
-{
-  int regno;
-
-  if (TARGET_SOFT_FLOAT || TARGET_THUMB1 || !TARGET_FPA)
+  int regno;
+
+  if (TARGET_SOFT_FLOAT || TARGET_THUMB1 || !TARGET_FPA)
      {
        for (regno = FIRST_FPA_REGNUM;
            regno <= LAST_FPA_REGNUM; ++regno)
@@ -23815,6 +24645,9 @@ arm_attr_length_push_multi(rtx parallel_op, rtx first_op)
    /* ARM mode.  */
    if (TARGET_ARM)
      return 4;
+  /* Thumb1 mode.  */
+  if (TARGET_THUMB1)
+    return 2;
  
    /* Thumb2 mode.  */
    regno = REGNO (first_op);
@@ -23830,4 +24663,853 @@ arm_attr_length_push_multi(rtx parallel_op, rtx first_op)
    return 4;
  }
  
+/* Return the instruction count that output_move_double reports for
+   OPERANDS[0] and OPERANDS[1].  */
+int
+arm_count_output_move_double_insns (rtx *operands)
+{
+  rtx scratch_ops[2];
+  int n_insns;
+
+  /* Hand output_move_double a private copy, since it is allowed to
+     rewrite the operand array it is given.  */
+  scratch_ops[0] = operands[0];
+  scratch_ops[1] = operands[1];
+  output_move_double (scratch_ops, false, &n_insns);
+
+  return n_insns;
+}
+
+/* If OPERAND is a CONST_DOUBLE for which both exact_real_inverse and
+   exact_real_truncate succeed in DFmode, and the low 32 bits of the
+   resulting integer form a non-zero power of two, return int_log2 of
+   that value.  Return 0 in every other case.  */
+int
+vfp3_const_double_for_fract_bits (rtx operand)
+{
+  REAL_VALUE_TYPE recip;
+  HOST_WIDE_INT bits;
+
+  if (GET_CODE (operand) != CONST_DOUBLE)
+    return 0;
+
+  REAL_VALUE_FROM_CONST_DOUBLE (recip, operand);
+  if (!exact_real_inverse (DFmode, &recip)
+      || !exact_real_truncate (DFmode, &recip))
+    return 0;
+
+  bits = real_to_integer (&recip);
+  bits = bits & 0xffffffff;
+
+  /* Reject zero and anything with more than one bit set.  */
+  if (bits == 0 || (bits & (bits - 1)) != 0)
+    return 0;
+
+  return int_log2 (bits);
+}
+\f
+/* Emit the memory barrier, if any, that must precede an atomic
+   sequence performed under MODEL.  A barrier is emitted for the
+   release, acquire-release and sequentially-consistent models; the
+   relaxed, consume and acquire models emit nothing.  */
+
+static void
+arm_pre_atomic_barrier (enum memmodel model)
+{
+  switch (model)
+    {
+    case MEMMODEL_RELEASE:
+    case MEMMODEL_ACQ_REL:
+    case MEMMODEL_SEQ_CST:
+      emit_insn (gen_memory_barrier ());
+      return;
+
+    case MEMMODEL_RELAXED:
+    case MEMMODEL_CONSUME:
+    case MEMMODEL_ACQUIRE:
+      return;
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* Emit the memory barrier, if any, that must follow an atomic sequence
+   performed under MODEL.  A barrier is emitted for the acquire,
+   acquire-release and sequentially-consistent models; the relaxed,
+   consume and release models emit nothing.  */
+
+static void
+arm_post_atomic_barrier (enum memmodel model)
+{
+  switch (model)
+    {
+    case MEMMODEL_ACQUIRE:
+    case MEMMODEL_ACQ_REL:
+    case MEMMODEL_SEQ_CST:
+      emit_insn (gen_memory_barrier ());
+      return;
+
+    case MEMMODEL_RELAXED:
+    case MEMMODEL_CONSUME:
+    case MEMMODEL_RELEASE:
+      return;
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* Emit the load-exclusive instruction for MODE (QImode, HImode, SImode
+   or DImode), passing RVAL and MEM to the matching
+   arm_load_exclusive generator.  */
+
+static void
+arm_emit_load_exclusive (enum machine_mode mode, rtx rval, rtx mem)
+{
+  rtx pattern;
+
+  switch (mode)
+    {
+    case QImode:
+      pattern = gen_arm_load_exclusiveqi (rval, mem);
+      break;
+    case HImode:
+      pattern = gen_arm_load_exclusivehi (rval, mem);
+      break;
+    case SImode:
+      pattern = gen_arm_load_exclusivesi (rval, mem);
+      break;
+    case DImode:
+      pattern = gen_arm_load_exclusivedi (rval, mem);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  emit_insn (pattern);
+}
+
+/* Emit the store-exclusive instruction for MODE (QImode, HImode, SImode
+   or DImode).  BVAL, RVAL and MEM are forwarded, in that order, to the
+   matching arm_store_exclusive generator.
+
+   NOTE(review): the callers in this file pass the memory reference as
+   RVAL and the value to be stored as MEM, so the parameter names look
+   misleading -- confirm against the arm_store_exclusive patterns.  */
+
+static void
+arm_emit_store_exclusive (enum machine_mode mode, rtx bval, rtx rval, rtx mem)
+{
+  rtx pattern;
+
+  switch (mode)
+    {
+    case QImode:
+      pattern = gen_arm_store_exclusiveqi (bval, rval, mem);
+      break;
+    case HImode:
+      pattern = gen_arm_store_exclusivehi (bval, rval, mem);
+      break;
+    case SImode:
+      pattern = gen_arm_store_exclusivesi (bval, rval, mem);
+      break;
+    case DImode:
+      pattern = gen_arm_store_exclusivedi (bval, rval, mem);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  emit_insn (pattern);
+}
+
+/* Emit INSN as a jump instruction and attach a REG_BR_PROB note to it
+   whose value is REG_BR_PROB_BASE / 100 - 1, marking the branch as
+   very unlikely to be taken.  */
+
+static void
+emit_unlikely_jump (rtx insn)
+{
+  rtx probability = GEN_INT (REG_BR_PROB_BASE / 100 - 1);
+  rtx jump = emit_jump_insn (insn);
+
+  add_reg_note (jump, REG_BR_PROB, probability);
+}
+
+/* Expand a compare and swap pattern.
+
+   OPERANDS is unpacked as follows (roles as given by the local names
+   they are assigned to):
+     [0] BVAL    - receives the boolean success result,
+     [1] RVAL    - receives the value read from memory,
+     [2] MEM     - the memory location; its mode selects the variant,
+     [3] OLDVAL  - the expected value,
+     [4] NEWVAL  - the replacement value,
+     [5] IS_WEAK - weak/strong flag,
+     [6] MOD_S   - memory model on success,
+     [7] MOD_F   - memory model on failure.
+   Only QImode, HImode, SImode and DImode are handled.  */
+
+void
+arm_expand_compare_and_swap (rtx operands[])
+{
+  rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
+  enum machine_mode mode;
+  rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
+
+  bval = operands[0];
+  rval = operands[1];
+  mem = operands[2];
+  oldval = operands[3];
+  newval = operands[4];
+  is_weak = operands[5];
+  mod_s = operands[6];
+  mod_f = operands[7];
+  mode = GET_MODE (mem);
+
+  /* Legitimize OLDVAL for the comparison that the insn will perform.  */
+  switch (mode)
+    {
+    case QImode:
+    case HImode:
+      /* For narrow modes, we're going to perform the comparison in SImode,
+        so do the zero-extension now.  */
+      rval = gen_reg_rtx (SImode);
+      oldval = convert_modes (SImode, mode, oldval, true);
+      /* FALLTHRU */
+
+    case SImode:
+      /* Force the value into a register if needed.  We waited until after
+        the zero-extension above to do this properly.  */
+      if (!arm_add_operand (oldval, mode))
+       oldval = force_reg (mode, oldval);
+      break;
+
+    case DImode:
+      if (!cmpdi_operand (oldval, mode))
+       oldval = force_reg (mode, oldval);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Pick the insn generator that matches the memory operand's mode.  */
+  switch (mode)
+    {
+    case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
+    case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
+    case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
+    case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
+    default:
+      gcc_unreachable ();
+    }
+
+  emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
+
+  /* For narrow modes RVAL is the SImode temporary created above, so copy
+     its low part back into the caller's result operand.  */
+  if (mode == QImode || mode == HImode)
+    emit_move_insn (operands[1], gen_lowpart (mode, rval));
+
+  /* In all cases, we arrange for success to be signaled by Z set.
+     This arrangement allows for the boolean result to be used directly
+     in a subsequent branch, post optimization.  */
+  x = gen_rtx_REG (CCmode, CC_REGNUM);
+  x = gen_rtx_EQ (SImode, x, const0_rtx);
+  emit_insn (gen_rtx_SET (VOIDmode, bval, x));
+}
+
+/* Split a compare and swap pattern.  It is IMPLEMENTATION DEFINED whether
+   another memory store between the load-exclusive and store-exclusive can
+   reset the monitor from Exclusive to Open state.  This means we must wait
+   until after reload to split the pattern, lest we get a register spill in
+   the middle of the atomic sequence.
+
+   OPERANDS is unpacked as: [0] RVAL (receives the loaded value), [1] MEM,
+   [2] OLDVAL (expected value), [3] NEWVAL, [4] weak flag (const0_rtx
+   means strong), [5] success memory model, [6] failure memory model,
+   [7] SCRATCH (receives the store-exclusive status).
+
+   The emitted sequence is: pre-barrier; LABEL1 (strong only);
+   load-exclusive; compare with OLDVAL and branch to LABEL2 on mismatch;
+   store-exclusive; compare SCRATCH with zero; for the strong form branch
+   back to LABEL1 if the store failed; post-barrier and LABEL2.  */
+
+void
+arm_split_compare_and_swap (rtx operands[])
+{
+  rtx rval, mem, oldval, newval, scratch;
+  enum machine_mode mode;
+  enum memmodel mod_s, mod_f;
+  bool is_weak;
+  rtx label1, label2, x, cond;
+
+  rval = operands[0];
+  mem = operands[1];
+  oldval = operands[2];
+  newval = operands[3];
+  is_weak = (operands[4] != const0_rtx);
+  mod_s = (enum memmodel) INTVAL (operands[5]);
+  mod_f = (enum memmodel) INTVAL (operands[6]);
+  scratch = operands[7];
+  mode = GET_MODE (mem);
+
+  arm_pre_atomic_barrier (mod_s);
+
+  /* LABEL1 is the retry target; it exists only for the strong form.
+     LABEL2 is the exit taken when the loaded value does not match.  */
+  label1 = NULL_RTX;
+  if (!is_weak)
+    {
+      label1 = gen_label_rtx ();
+      emit_label (label1);
+    }
+  label2 = gen_label_rtx ();
+
+  arm_emit_load_exclusive (mode, rval, mem);
+
+  cond = arm_gen_compare_reg (NE, rval, oldval, scratch);
+  x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+  x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+                           gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
+  emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
+
+  arm_emit_store_exclusive (mode, scratch, mem, newval);
+
+  /* Weak or strong, we want EQ to be true for success, so that we
+     match the flags that we got from the compare above.  */
+  cond = gen_rtx_REG (CCmode, CC_REGNUM);
+  x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
+  emit_insn (gen_rtx_SET (VOIDmode, cond, x));
+
+  if (!is_weak)
+    {
+      x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+      x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+                               gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
+      emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
+    }
+
+  /* The post-barrier itself is selected by MOD_S.  MOD_F only decides
+     where the mismatch exit lands: before the barrier when the failure
+     model is not relaxed, after it (skipping the barrier) when it is.  */
+  if (mod_f != MEMMODEL_RELAXED)
+    emit_label (label2);
+
+  arm_post_atomic_barrier (mod_s);
+
+  if (mod_f == MEMMODEL_RELAXED)
+    emit_label (label2);
+}
+
+/* Split an atomic read-modify-write operation on MEM into a
+   load-exclusive / operate / store-exclusive loop that retries until
+   the store-exclusive status in COND is zero.
+
+   CODE selects the operation: SET stores VALUE unchanged, NOT computes
+   ~(old & VALUE), PLUS and MINUS add or subtract VALUE, and any other
+   code is applied directly as a binary rtx on the old value and VALUE.
+   OLD_OUT, when non-null, receives the value loaded from MEM; when null
+   the load goes into NEW_OUT instead.  NEW_OUT, when non-null, receives
+   the computed value.  MODEL_RTX is a CONST_INT holding the memory
+   model, which controls the barriers emitted before and after the
+   loop.  Arithmetic is done in DImode for a DImode MEM and in SImode
+   otherwise.  */
+
+void
+arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
+                    rtx value, rtx model_rtx, rtx cond)
+{
+  enum memmodel model = (enum memmodel) INTVAL (model_rtx);
+  enum machine_mode mode = GET_MODE (mem);
+  enum machine_mode wmode = (mode == DImode ? DImode : SImode);
+  rtx label, x;
+
+  arm_pre_atomic_barrier (model);
+
+  /* Top of the retry loop.  */
+  label = gen_label_rtx ();
+  emit_label (label);
+
+  /* View every operand in the working mode WMODE.  */
+  if (new_out)
+    new_out = gen_lowpart (wmode, new_out);
+  if (old_out)
+    old_out = gen_lowpart (wmode, old_out);
+  else
+    old_out = new_out;
+  value = simplify_gen_subreg (wmode, value, mode, 0);
+
+  arm_emit_load_exclusive (mode, old_out, mem);
+
+  switch (code)
+    {
+    case SET:
+      /* Plain exchange: the value to store is VALUE itself.  */
+      new_out = value;
+      break;
+
+    case NOT:
+      /* Two steps: AND into NEW_OUT, then complement it in place.  */
+      x = gen_rtx_AND (wmode, old_out, value);
+      emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
+      x = gen_rtx_NOT (wmode, new_out);
+      emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
+      break;
+
+    case MINUS:
+      /* Subtracting a constant is rewritten as adding its negation.  */
+      if (CONST_INT_P (value))
+       {
+         value = GEN_INT (-INTVAL (value));
+         code = PLUS;
+       }
+      /* FALLTHRU */
+
+    case PLUS:
+      if (mode == DImode)
+       {
+         /* DImode plus/minus need to clobber flags.  */
+         /* The adddi3 and subdi3 patterns are incorrectly written so that
+            they require matching operands, even when we could easily support
+            three operands.  Thankfully, this can be fixed up post-splitting,
+            as the individual add+adc patterns do accept three operands and
+            post-reload cprop can make these moves go away.  */
+         emit_move_insn (new_out, old_out);
+         if (code == PLUS)
+           x = gen_adddi3 (new_out, new_out, value);
+         else
+           x = gen_subdi3 (new_out, new_out, value);
+         emit_insn (x);
+         break;
+       }
+      /* FALLTHRU */
+
+    default:
+      x = gen_rtx_fmt_ee (code, wmode, old_out, value);
+      emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
+      break;
+    }
+
+  arm_emit_store_exclusive (mode, cond, mem, gen_lowpart (mode, new_out));
+
+  /* Loop back while the store-exclusive status in COND is non-zero.  */
+  x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+  emit_unlikely_jump (gen_cbranchsi4 (x, cond, const0_rtx, label));
+
+  arm_post_atomic_barrier (model);
+}
+\f
+/* Capacity of the selector arrays used by the permutation expanders
+   below.  */
+#define MAX_VECT_LEN 16
+
+/* Description of one constant vector permutation, shared by the
+   arm_evpc_* recognizers.  */
+struct expand_vec_perm_d
+{
+  /* Destination and the two input vectors.  */
+  rtx target, op0, op1;
+  /* Selector: perm[i] is the source element index for result element i.  */
+  unsigned char perm[MAX_VECT_LEN];
+  /* Mode of the vectors being permuted.  */
+  enum machine_mode vmode;
+  /* Number of elements in VMODE.  */
+  unsigned char nelt;
+  /* True when the permutation reads from a single input vector.  */
+  bool one_vector_p;
+  /* True when only asking whether the permutation is supported; the
+     recognizers then return before emitting any insn.  */
+  bool testing_p;
+};
+
+/* Generate a variable permutation.
+
+   Emit a NEON table lookup that fills TARGET from the byte indices in
+   SEL.  When OP0 and OP1 are the same rtx a single-table VTBL1 is
+   used; otherwise the two inputs are combined into one wider register
+   and a two-table VTBL2 is used.  All four operands are expected to
+   share one mode, either V8QImode or V16QImode.  */
+
+static void
+arm_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
+{
+  enum machine_mode vmode = GET_MODE (target);
+  bool same_input_p = rtx_equal_p (op0, op1);
+  rtx table;
+
+  gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
+  gcc_checking_assert (GET_MODE (op0) == vmode);
+  gcc_checking_assert (GET_MODE (op1) == vmode);
+  gcc_checking_assert (GET_MODE (sel) == vmode);
+  gcc_checking_assert (TARGET_NEON);
+
+  /* Single input: one table is enough.  */
+  if (same_input_p)
+    {
+      if (vmode == V8QImode)
+        emit_insn (gen_neon_vtbl1v8qi (target, op0, sel));
+      else
+        emit_insn (gen_neon_vtbl1v16qi (target, op0, sel));
+      return;
+    }
+
+  /* Two distinct inputs, 64-bit vectors: combine into a V16QI register
+     and view it as TImode for the lookup.  */
+  if (vmode == V8QImode)
+    {
+      table = gen_reg_rtx (V16QImode);
+      emit_insn (gen_neon_vcombinev8qi (table, op0, op1));
+      table = gen_lowpart (TImode, table);
+      emit_insn (gen_neon_vtbl2v8qi (target, table, sel));
+      return;
+    }
+
+  /* Two distinct inputs, 128-bit vectors: combine into an OImode
+     register.  */
+  table = gen_reg_rtx (OImode);
+  emit_insn (gen_neon_vcombinev16qi (table, op0, op1));
+  emit_insn (gen_neon_vtbl2v16qi (target, table, sel));
+}
+
+/* Expand a variable permutation of OP0 and OP1 into TARGET, with the
+   element indices supplied at run time in SEL.  SEL is first ANDed
+   with a constant vector so that every index wraps into the valid
+   range, then the table lookup is emitted.  */
+
+void
+arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
+{
+  enum machine_mode vmode = GET_MODE (target);
+  unsigned int nelt = GET_MODE_NUNITS (vmode);
+  unsigned int lane;
+  rtx lane_mask[MAX_VECT_LEN];
+  rtx index_limit, mask_vec;
+
+  /* TODO: ARM's VTBL indexing is little-endian.  In order to handle GCC's
+     numbering of elements for big-endian, we must reverse the order.  */
+  gcc_checking_assert (!BYTES_BIG_ENDIAN);
+
+  /* VTBL does not reduce its indices modulo the table size, so do that
+     here: NELT entries for a single input, 2 * NELT for two.  */
+  if (rtx_equal_p (op0, op1))
+    index_limit = GEN_INT (nelt - 1);
+  else
+    index_limit = GEN_INT (2 * nelt - 1);
+
+  for (lane = 0; lane < nelt; lane++)
+    lane_mask[lane] = index_limit;
+
+  mask_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, lane_mask));
+  sel = expand_simple_binop (vmode, AND, sel, mask_vec, NULL, 0,
+                             OPTAB_LIB_WIDEN);
+
+  arm_expand_vec_perm_1 (target, op0, op1, sel);
+}
+
+/* Generate or test for an insn that supports a constant permutation.  */
+
+/* Recognize patterns for the VUZP insns.
+
+   The permutation in D matches when perm[i] == (2 * i + ODD) & MASK for
+   every i, with ODD being 0 or 1 as given by perm[0].  Modes whose
+   elements are 8 bytes or wider are rejected.  Returns true on a match
+   (after emitting the insn unless D->testing_p is set), false
+   otherwise.  */
+
+static bool
+arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
+{
+  unsigned int i, odd, mask, nelt = d->nelt;
+  rtx out0, out1, in0, in1, x;
+  rtx (*gen)(rtx, rtx, rtx, rtx);
+
+  if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
+    return false;
+
+  /* Note that these are little-endian tests.  Adjust for big-endian later.  */
+  if (d->perm[0] == 0)
+    odd = 0;
+  else if (d->perm[0] == 1)
+    odd = 1;
+  else
+    return false;
+  /* Indices wrap at NELT for one input and at 2 * NELT for two.  */
+  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
+
+  for (i = 0; i < nelt; i++)
+    {
+      unsigned elt = (i * 2 + odd) & mask;
+      if (d->perm[i] != elt)
+       return false;
+    }
+
+  /* Success!  */
+  if (d->testing_p)
+    return true;
+
+  switch (d->vmode)
+    {
+    case V16QImode: gen = gen_neon_vuzpv16qi_internal; break;
+    case V8QImode:  gen = gen_neon_vuzpv8qi_internal;  break;
+    case V8HImode:  gen = gen_neon_vuzpv8hi_internal;  break;
+    case V4HImode:  gen = gen_neon_vuzpv4hi_internal;  break;
+    case V4SImode:  gen = gen_neon_vuzpv4si_internal;  break;
+    case V2SImode:  gen = gen_neon_vuzpv2si_internal;  break;
+    case V2SFmode:  gen = gen_neon_vuzpv2sf_internal;  break;
+    case V4SFmode:  gen = gen_neon_vuzpv4sf_internal;  break;
+    default:
+      gcc_unreachable ();
+    }
+
+  in0 = d->op0;
+  in1 = d->op1;
+  if (BYTES_BIG_ENDIAN)
+    {
+      x = in0, in0 = in1, in1 = x;
+      odd = !odd;
+    }
+
+  /* The insn has two outputs; the one that is not wanted goes to a
+     fresh register.  ODD selects which output slot gets D->target.  */
+  out0 = d->target;
+  out1 = gen_reg_rtx (d->vmode);
+  if (odd)
+    x = out0, out0 = out1, out1 = x;
+
+  emit_insn (gen (out0, in0, in1, out1));
+  return true;
+}
+
+/* Recognize patterns for the VZIP insns.
+
+   The permutation in D matches when, for every i below NELT / 2,
+   perm[2 * i] == (i + HIGH) & MASK and
+   perm[2 * i + 1] == (perm[2 * i] + NELT) & MASK, with HIGH being 0 or
+   NELT / 2 as given by perm[0].  Modes whose elements are 8 bytes or
+   wider are rejected.  Returns true on a match (after emitting the insn
+   unless D->testing_p is set), false otherwise.  */
+
+static bool
+arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
+{
+  unsigned int i, high, mask, nelt = d->nelt;
+  rtx out0, out1, in0, in1, x;
+  rtx (*gen)(rtx, rtx, rtx, rtx);
+
+  if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
+    return false;
+
+  /* Note that these are little-endian tests.  Adjust for big-endian later.  */
+  high = nelt / 2;
+  if (d->perm[0] == high)
+    ;
+  else if (d->perm[0] == 0)
+    high = 0;
+  else
+    return false;
+  /* Indices wrap at NELT for one input and at 2 * NELT for two.  */
+  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
+
+  for (i = 0; i < nelt / 2; i++)
+    {
+      unsigned elt = (i + high) & mask;
+      if (d->perm[i * 2] != elt)
+       return false;
+      elt = (elt + nelt) & mask;
+      if (d->perm[i * 2 + 1] != elt)
+       return false;
+    }
+
+  /* Success!  */
+  if (d->testing_p)
+    return true;
+
+  switch (d->vmode)
+    {
+    case V16QImode: gen = gen_neon_vzipv16qi_internal; break;
+    case V8QImode:  gen = gen_neon_vzipv8qi_internal;  break;
+    case V8HImode:  gen = gen_neon_vzipv8hi_internal;  break;
+    case V4HImode:  gen = gen_neon_vzipv4hi_internal;  break;
+    case V4SImode:  gen = gen_neon_vzipv4si_internal;  break;
+    case V2SImode:  gen = gen_neon_vzipv2si_internal;  break;
+    case V2SFmode:  gen = gen_neon_vzipv2sf_internal;  break;
+    case V4SFmode:  gen = gen_neon_vzipv4sf_internal;  break;
+    default:
+      gcc_unreachable ();
+    }
+
+  in0 = d->op0;
+  in1 = d->op1;
+  if (BYTES_BIG_ENDIAN)
+    {
+      x = in0, in0 = in1, in1 = x;
+      high = !high;
+    }
+
+  /* The insn has two outputs; the one that is not wanted goes to a
+     fresh register.  A non-zero HIGH puts D->target in the second
+     output slot.  */
+  out0 = d->target;
+  out1 = gen_reg_rtx (d->vmode);
+  if (high)
+    x = out0, out0 = out1, out1 = x;
+
+  emit_insn (gen (out0, in0, in1, out1));
+  return true;
+}
+
+/* Recognize patterns for the VREV insns.
+
+   A VREV permutation reads a single input vector and reverses the
+   elements inside each consecutive group of DIFF + 1 elements, where
+   DIFF is the first selector index and must be 7, 3 or 1.  DIFF and the
+   vector mode together choose the insn.  Returns true on a match (after
+   emitting the insn unless D->testing_p is set), false otherwise.  */
+
+static bool
+arm_evpc_neon_vrev (struct expand_vec_perm_d *d)
+{
+  unsigned int i, j, diff, nelt = d->nelt;
+  rtx (*gen)(rtx, rtx, rtx);
+
+  if (!d->one_vector_p)
+    return false;
+
+  diff = d->perm[0];
+  switch (diff)
+    {
+    case 7:
+      switch (d->vmode)
+        {
+        case V16QImode: gen = gen_neon_vrev64v16qi; break;
+        case V8QImode:  gen = gen_neon_vrev64v8qi;  break;
+        default:
+          return false;
+        }
+      break;
+    case 3:
+      switch (d->vmode)
+        {
+        case V16QImode: gen = gen_neon_vrev32v16qi; break;
+        case V8QImode:  gen = gen_neon_vrev32v8qi;  break;
+        case V8HImode:  gen = gen_neon_vrev64v8hi;  break;
+        case V4HImode:  gen = gen_neon_vrev64v4hi;  break;
+        default:
+          return false;
+        }
+      break;
+    case 1:
+      switch (d->vmode)
+        {
+        case V16QImode: gen = gen_neon_vrev16v16qi; break;
+        case V8QImode:  gen = gen_neon_vrev16v8qi;  break;
+        case V8HImode:  gen = gen_neon_vrev32v8hi;  break;
+        case V4HImode:  gen = gen_neon_vrev32v4hi;  break;
+        case V4SImode:  gen = gen_neon_vrev64v4si;  break;
+        case V2SImode:  gen = gen_neon_vrev64v2si;  break;
+        case V4SFmode:  gen = gen_neon_vrev64v4sf;  break;
+        case V2SFmode:  gen = gen_neon_vrev64v2sf;  break;
+        default:
+          return false;
+        }
+      break;
+    default:
+      return false;
+    }
+
+  /* Each reversed group holds DIFF + 1 elements, so advance by the
+     group size.  Advancing by DIFF makes consecutive groups overlap:
+     the second group would then require perm[DIFF] == 2 * DIFF right
+     after the first required perm[DIFF] == 0, so nothing could match.
+     The element count of every mode accepted above is a multiple of
+     DIFF + 1, so I + J stays below NELT.  */
+  for (i = 0; i < nelt; i += diff + 1)
+    for (j = 0; j <= diff; j += 1)
+      if (d->perm[i + j] != i + diff - j)
+        return false;
+
+  /* Success! */
+  if (d->testing_p)
+    return true;
+
+  /* ??? The third operand is an artifact of the builtin infrastructure
+     and is ignored by the actual instruction.  */
+  emit_insn (gen (d->target, d->op0, const0_rtx));
+  return true;
+}
+
+/* Recognize patterns for the VTRN insns.
+
+   The permutation in D matches when, for every even i,
+   perm[i] == i + ODD and perm[i + 1] == (i + NELT + ODD) & MASK, with
+   ODD being 0 or 1 as given by perm[0].  Modes whose elements are
+   8 bytes or wider are rejected.  Returns true on a match (after
+   emitting the insn unless D->testing_p is set), false otherwise.  */
+
+static bool
+arm_evpc_neon_vtrn (struct expand_vec_perm_d *d)
+{
+  unsigned int i, odd, mask, nelt = d->nelt;
+  rtx out0, out1, in0, in1, x;
+  rtx (*gen)(rtx, rtx, rtx, rtx);
+
+  if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
+    return false;
+
+  /* Note that these are little-endian tests.  Adjust for big-endian later.  */
+  if (d->perm[0] == 0)
+    odd = 0;
+  else if (d->perm[0] == 1)
+    odd = 1;
+  else
+    return false;
+  /* Indices wrap at NELT for one input and at 2 * NELT for two.  */
+  mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
+
+  for (i = 0; i < nelt; i += 2)
+    {
+      if (d->perm[i] != i + odd)
+       return false;
+      if (d->perm[i + 1] != ((i + nelt + odd) & mask))
+       return false;
+    }
+
+  /* Success!  */
+  if (d->testing_p)
+    return true;
+
+  switch (d->vmode)
+    {
+    case V16QImode: gen = gen_neon_vtrnv16qi_internal; break;
+    case V8QImode:  gen = gen_neon_vtrnv8qi_internal;  break;
+    case V8HImode:  gen = gen_neon_vtrnv8hi_internal;  break;
+    case V4HImode:  gen = gen_neon_vtrnv4hi_internal;  break;
+    case V4SImode:  gen = gen_neon_vtrnv4si_internal;  break;
+    case V2SImode:  gen = gen_neon_vtrnv2si_internal;  break;
+    case V2SFmode:  gen = gen_neon_vtrnv2sf_internal;  break;
+    case V4SFmode:  gen = gen_neon_vtrnv4sf_internal;  break;
+    default:
+      gcc_unreachable ();
+    }
+
+  in0 = d->op0;
+  in1 = d->op1;
+  if (BYTES_BIG_ENDIAN)
+    {
+      x = in0, in0 = in1, in1 = x;
+      odd = !odd;
+    }
+
+  /* The insn has two outputs; the one that is not wanted goes to a
+     fresh register.  ODD selects which output slot gets D->target.  */
+  out0 = d->target;
+  out1 = gen_reg_rtx (d->vmode);
+  if (odd)
+    x = out0, out0 = out1, out1 = x;
+
+  emit_insn (gen (out0, in0, in1, out1));
+  return true;
+}
+
+/* The NEON VTBL instruction is a fully variable permutation that's even
+   stronger than what we expose via VEC_PERM_EXPR.  What it doesn't do
+   is mask the index operand as VEC_PERM_EXPR requires.  Therefore we
+   can do slightly better by expanding this as a constant where we don't
+   have to apply a mask.
+
+   Returns false on big-endian targets.  Otherwise returns true when
+   only testing, and when expanding emits the table lookup for
+   V8QImode and V16QImode and returns false for any other mode.  */
+
+static bool
+arm_evpc_neon_vtbl (struct expand_vec_perm_d *d)
+{
+  rtx rperm[MAX_VECT_LEN], sel;
+  enum machine_mode vmode = d->vmode;
+  unsigned int i, nelt = d->nelt;
+
+  /* TODO: ARM's VTBL indexing is little-endian.  In order to handle GCC's
+     numbering of elements for big-endian, we must reverse the order.  */
+  if (BYTES_BIG_ENDIAN)
+    return false;
+
+  /* NOTE(review): this answers "supported" for every mode when testing,
+     although the expansion below only handles the byte-vector modes;
+     presumably this relies on the QImode retry described below --
+     confirm.  */
+  if (d->testing_p)
+    return true;
+
+  /* Generic code will try constant permutation twice.  Once with the
+     original mode and again with the elements lowered to QImode.
+     So wait and don't do the selector expansion ourselves.  */
+  if (vmode != V8QImode && vmode != V16QImode)
+    return false;
+
+  /* Materialize the constant selector in a register and reuse the
+     variable-permutation expander.  */
+  for (i = 0; i < nelt; ++i)
+    rperm[i] = GEN_INT (d->perm[i]);
+  sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
+  sel = force_reg (vmode, sel);
+
+  arm_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
+  return true;
+}
+
+/* Try each NEON constant-permutation recognizer on D in turn (VUZP,
+   VZIP, VREV, VTRN, then the VTBL fallback) and return true as soon as
+   one accepts it.  Returns false when none does or when NEON is not
+   available.  D may be modified: see below.  */
+
+static bool
+arm_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
+{
+  /* The recognizers expect the selector to start with a small index
+     (0, 1 or N/2).  When it starts with an index into the second
+     operand, exchange the operands and renumber the selector to
+     match.  */
+  if (d->perm[0] >= d->nelt)
+    {
+      unsigned int n = d->nelt;
+      unsigned int k;
+      rtx first = d->op0;
+
+      d->op0 = d->op1;
+      d->op1 = first;
+
+      for (k = 0; k < n; k++)
+        d->perm[k] = (d->perm[k] + n) & (2 * n - 1);
+    }
+
+  if (!TARGET_NEON)
+    return false;
+
+  return (arm_evpc_neon_vuzp (d)
+          || arm_evpc_neon_vzip (d)
+          || arm_evpc_neon_vrev (d)
+          || arm_evpc_neon_vtrn (d)
+          || arm_evpc_neon_vtbl (d));
+}
+
+/* Expand a vec_perm_const pattern.
+
+   TARGET receives the permutation of OP0 and OP1 described by the
+   constant vector SEL.  Each selector element is reduced modulo
+   2 * NELT before use.  Returns true if an insn sequence was emitted,
+   false if no recognizer accepted the permutation.  */
+
+bool
+arm_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
+{
+  struct expand_vec_perm_d d;
+  int i, nelt, which;
+
+  d.target = target;
+  d.op0 = op0;
+  d.op1 = op1;
+
+  d.vmode = GET_MODE (target);
+  gcc_assert (VECTOR_MODE_P (d.vmode));
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = false;
+
+  /* Copy the selector into D and record in WHICH which inputs it
+     references: bit 0 for the first vector, bit 1 for the second.  */
+  for (i = which = 0; i < nelt; ++i)
+    {
+      rtx e = XVECEXP (sel, 0, i);
+      int ei = INTVAL (e) & (2 * nelt - 1);
+      which |= (ei < nelt ? 1 : 2);
+      d.perm[i] = ei;
+    }
+
+  switch (which)
+    {
+    default:
+      gcc_unreachable();
+
+    case 3:
+      /* Both inputs are referenced.  */
+      d.one_vector_p = false;
+      if (!rtx_equal_p (op0, op1))
+       break;
+
+      /* The elements of PERM do not suggest that only the first operand
+        is used, but both operands are identical.  Allow easier matching
+        of the permutation by folding the permutation into the single
+        input vector.  */
+      /* FALLTHRU */
+    case 2:
+      /* Only the second input is referenced: renumber the selector into
+         the first-vector range and use OP1 as the single input.  */
+      for (i = 0; i < nelt; ++i)
+        d.perm[i] &= nelt - 1;
+      d.op0 = op1;
+      d.one_vector_p = true;
+      break;
+
+    case 1:
+      /* Only the first input is referenced.  */
+      d.op1 = op0;
+      d.one_vector_p = true;
+      break;
+    }
+
+  return arm_expand_vec_perm_const_1 (&d);
+}
+
+/* Implement TARGET_VECTORIZE_VEC_PERM_CONST_OK.
+
+   Return true if the constant permutation SEL, made of
+   GET_MODE_NUNITS (VMODE) selector bytes, can be expanded for vectors
+   of mode VMODE.  The recognizers are run in testing mode on
+   placeholder registers.  */
+
+static bool
+arm_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+                                 const unsigned char *sel)
+{
+  struct expand_vec_perm_d d;
+  unsigned int n, k;
+  unsigned int sources = 0;
+  bool ok;
+
+  d.vmode = vmode;
+  d.nelt = n = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = true;
+  memcpy (d.perm, sel, n);
+
+  /* Record which inputs the selector references: bit 0 of SOURCES for
+     the first vector, bit 1 for the second.  */
+  for (k = 0; k < n; k++)
+    {
+      unsigned char idx = d.perm[k];
+
+      gcc_assert (idx < 2 * n);
+      if (idx < n)
+        sources |= 1;
+      else
+        sources |= 2;
+    }
+
+  /* A selector that only references the second vector is renumbered so
+     that it references the first.  */
+  if (sources == 2)
+    for (k = 0; k < n; k++)
+      d.perm[k] -= n;
+
+  /* Anything but "both inputs referenced" is a single-vector shuffle.  */
+  d.one_vector_p = (sources != 3);
+
+  /* Placeholder operands: raw registers numbered just past the virtual
+     registers.  The second input is distinct only when needed.  */
+  d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
+  d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
+  if (d.one_vector_p)
+    d.op1 = d.op0;
+  else
+    d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
+
+  /* Run the trial inside its own sequence, whose contents are never
+     retrieved.  */
+  start_sequence ();
+  ok = arm_expand_vec_perm_const_1 (&d);
+  end_sequence ();
+
+  return ok;
+}
+
+\f
  #include "gt-arm.h"