#include "function.h"
#include "expr.h"
#include "optabs.h"
-#include "toplev.h"
+#include "diagnostic-core.h"
#include "recog.h"
#include "cgraph.h"
#include "ggc.h"
#include "df.h"
#include "intl.h"
#include "libfuncs.h"
+#include "params.h"
/* Forward definitions of types. */
typedef struct minipool_node Mnode;
void (*arm_lang_output_object_attributes_hook)(void);
/* Forward function declarations. */
+static bool arm_needs_doubleword_align (enum machine_mode, const_tree);
static int arm_compute_static_chain_stack_bytes (void);
static arm_stack_offsets *arm_get_frame_offsets (void);
static void arm_add_gc_roots (void);
static rtx emit_set_insn (rtx, rtx);
static int arm_arg_partial_bytes (CUMULATIVE_ARGS *, enum machine_mode,
tree, bool);
+static rtx arm_function_arg (CUMULATIVE_ARGS *, enum machine_mode,
+ const_tree, bool);
+static void arm_function_arg_advance (CUMULATIVE_ARGS *, enum machine_mode,
+ const_tree, bool);
+static unsigned int arm_function_arg_boundary (enum machine_mode, const_tree);
static rtx aapcs_allocate_return_reg (enum machine_mode, const_tree,
const_tree);
static int aapcs_select_return_coproc (const_tree, const_tree);
static bool arm_return_in_msb (const_tree);
static bool arm_must_pass_in_stack (enum machine_mode, const_tree);
static bool arm_return_in_memory (const_tree, const_tree);
-#ifdef TARGET_UNWIND_INFO
+#if ARM_UNWIND_INFO
static void arm_unwind_emit (FILE *, rtx);
static bool arm_output_ttype (rtx);
+static void arm_asm_emit_except_personality (rtx);
+static void arm_asm_init_sections (void);
#endif
+static enum unwind_info_type arm_except_unwind_info (struct gcc_options *);
static void arm_dwarf_handle_frame_unspec (const char *, rtx, int);
static rtx arm_dwarf_register_span (rtx);
static tree arm_build_builtin_va_list (void);
static void arm_expand_builtin_va_start (tree, rtx);
static tree arm_gimplify_va_arg_expr (tree, tree, gimple_seq *, gimple_seq *);
+static void arm_option_override (void);
static bool arm_handle_option (size_t, const char *, int);
static void arm_target_help (void);
static unsigned HOST_WIDE_INT arm_shift_truncation_mask (enum machine_mode);
static bool arm_tls_symbol_p (rtx x);
static int arm_issue_rate (void);
static void arm_output_dwarf_dtprel (FILE *, int, rtx) ATTRIBUTE_UNUSED;
+static bool arm_output_addr_const_extra (FILE *, rtx);
static bool arm_allocate_stack_slots_for_args (void);
static const char *arm_invalid_parameter_type (const_tree t);
static const char *arm_invalid_return_type (const_tree t);
static void arm_trampoline_init (rtx, tree, rtx);
static rtx arm_trampoline_adjust_address (rtx);
static rtx arm_pic_static_addr (rtx orig, rtx reg);
+static bool cortex_a9_sched_adjust_cost (rtx, rtx, rtx, int *);
+static bool xscale_sched_adjust_cost (rtx, rtx, rtx, int *);
+static enum machine_mode arm_preferred_simd_mode (enum machine_mode);
+static bool arm_class_likely_spilled_p (reg_class_t);
+static bool arm_vector_alignment_reachable (const_tree type, bool is_packed);
+static bool arm_builtin_support_vector_misalignment (enum machine_mode mode,
+ const_tree type,
+ int misalignment,
+ bool is_packed);
+static void arm_conditional_register_usage (void);
+static reg_class_t arm_preferred_rename_class (reg_class_t rclass);
\f
/* Table of machine attributes. */
#endif
{ NULL, 0, 0, false, false, false, NULL }
};
+
+/* Set default optimization options. */
+static const struct default_options arm_option_optimization_table[] =
+ {
+ /* Enable section anchors by default at -O1 or higher. */
+ { OPT_LEVELS_1_PLUS, OPT_fsection_anchors, NULL, 1 },
+ { OPT_LEVELS_1_PLUS, OPT_fomit_frame_pointer, NULL, 1 },
+ { OPT_LEVELS_NONE, 0, NULL, 0 }
+ };
\f
/* Initialize the GCC target structure. */
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P arm_print_operand_punct_valid_p
+#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
+#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA arm_output_addr_const_extra
+
#undef TARGET_ASM_FUNCTION_PROLOGUE
#define TARGET_ASM_FUNCTION_PROLOGUE arm_output_function_prologue
#define TARGET_HANDLE_OPTION arm_handle_option
#undef TARGET_HELP
#define TARGET_HELP arm_target_help
+#undef TARGET_OPTION_OVERRIDE
+#define TARGET_OPTION_OVERRIDE arm_option_override
+#undef TARGET_OPTION_OPTIMIZATION_TABLE
+#define TARGET_OPTION_OPTIMIZATION_TABLE arm_option_optimization_table
#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES arm_comp_type_attributes
#define TARGET_SHIFT_TRUNCATION_MASK arm_shift_truncation_mask
#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P arm_vector_mode_supported_p
+#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
+#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE arm_preferred_simd_mode
#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG arm_reorg
#define TARGET_PASS_BY_REFERENCE arm_pass_by_reference
#undef TARGET_ARG_PARTIAL_BYTES
#define TARGET_ARG_PARTIAL_BYTES arm_arg_partial_bytes
+#undef TARGET_FUNCTION_ARG
+#define TARGET_FUNCTION_ARG arm_function_arg
+#undef TARGET_FUNCTION_ARG_ADVANCE
+#define TARGET_FUNCTION_ARG_ADVANCE arm_function_arg_advance
+#undef TARGET_FUNCTION_ARG_BOUNDARY
+#define TARGET_FUNCTION_ARG_BOUNDARY arm_function_arg_boundary
#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS arm_setup_incoming_varargs
#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK arm_must_pass_in_stack
-#ifdef TARGET_UNWIND_INFO
+#if ARM_UNWIND_INFO
#undef TARGET_ASM_UNWIND_EMIT
#define TARGET_ASM_UNWIND_EMIT arm_unwind_emit
#undef TARGET_ARM_EABI_UNWINDER
#define TARGET_ARM_EABI_UNWINDER true
-#endif /* TARGET_UNWIND_INFO */
+
+#undef TARGET_ASM_EMIT_EXCEPT_PERSONALITY
+#define TARGET_ASM_EMIT_EXCEPT_PERSONALITY arm_asm_emit_except_personality
+
+#undef TARGET_ASM_INIT_SECTIONS
+#define TARGET_ASM_INIT_SECTIONS arm_asm_init_sections
+#endif /* ARM_UNWIND_INFO */
+
+#undef TARGET_EXCEPT_UNWIND_INFO
+#define TARGET_EXCEPT_UNWIND_INFO arm_except_unwind_info
#undef TARGET_DWARF_HANDLE_FRAME_UNSPEC
#define TARGET_DWARF_HANDLE_FRAME_UNSPEC arm_dwarf_handle_frame_unspec
#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE arm_can_eliminate
+#undef TARGET_CONDITIONAL_REGISTER_USAGE
+#define TARGET_CONDITIONAL_REGISTER_USAGE arm_conditional_register_usage
+
+#undef TARGET_CLASS_LIKELY_SPILLED_P
+#define TARGET_CLASS_LIKELY_SPILLED_P arm_class_likely_spilled_p
+
+#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
+#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
+ arm_vector_alignment_reachable
+
+#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
+#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
+ arm_builtin_support_vector_misalignment
+
+#undef TARGET_PREFERRED_RENAME_CLASS
+#define TARGET_PREFERRED_RENAME_CLASS \
+ arm_preferred_rename_class
+
struct gcc_target targetm = TARGET_INITIALIZER;
\f
/* Obstack for minipool constant handling. */
#define FL_NEON (1 << 20) /* Neon instructions. */
#define FL_ARCH7EM (1 << 21) /* Instructions present in the ARMv7E-M
architecture. */
+#define FL_ARCH7 (1 << 22) /* Architecture 7. */
#define FL_IWMMXT (1 << 29) /* XScale v2 or "Intel Wireless MMX technology". */
#define FL_FOR_ARCH6ZK FL_FOR_ARCH6K
#define FL_FOR_ARCH6T2 (FL_FOR_ARCH6 | FL_THUMB2)
#define FL_FOR_ARCH6M (FL_FOR_ARCH6 & ~FL_NOTM)
-#define FL_FOR_ARCH7 (FL_FOR_ARCH6T2 &~ FL_NOTM)
+#define FL_FOR_ARCH7 ((FL_FOR_ARCH6T2 & ~FL_NOTM) | FL_ARCH7)
#define FL_FOR_ARCH7A (FL_FOR_ARCH7 | FL_NOTM | FL_ARCH6K)
#define FL_FOR_ARCH7R (FL_FOR_ARCH7A | FL_DIV)
#define FL_FOR_ARCH7M (FL_FOR_ARCH7 | FL_DIV)
/* Nonzero if this chip supports the ARM 6K extensions. */
int arm_arch6k = 0;
+/* Nonzero if this chip supports the ARM 7 extensions. */
+int arm_arch7 = 0;
+
/* Nonzero if instructions not present in the 'M' profile can be used. */
int arm_arch_notm = 0;
/* Nonzero if generating Thumb instructions. */
int thumb_code = 0;
+/* Nonzero if generating Thumb-1 instructions. */
+int thumb1_code = 0;
+
/* Nonzero if we should define __THUMB_INTERWORK__ in the
preprocessor.
XXX This is a bit of a hack, it's intended to help work around
the next function. */
static int after_arm_reorg = 0;
-static enum arm_pcs arm_pcs_default;
+enum arm_pcs arm_pcs_default;
/* For an explanation of these variables, see final_prescan_insn below. */
int arm_ccfsm_state;
/* arm_current_cc is also used for Thumb-2 cond_exec blocks. */
enum arm_cond_code arm_current_cc;
+
rtx arm_target_insn;
int arm_target_label;
/* The number of conditionally executed insns, including the current insn. */
"hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};
+/* The register numbers in sequence, for passing to arm_gen_load_multiple. */
+int arm_regs_in_sequence[] =
+{
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
#define ARM_LSL_NAME (TARGET_UNIFIED_ASM ? "lsl" : "asl")
#define streq(string1, string2) (strcmp (string1, string2) == 0)
const struct tune_params *const tune;
};
+
+#define ARM_PREFETCH_NOT_BENEFICIAL 0, -1, -1
+#define ARM_PREFETCH_BENEFICIAL(prefetch_slots,l1_size,l1_line_size) \
+ prefetch_slots, \
+ l1_size, \
+ l1_line_size
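+/* For example, ARM_PREFETCH_BENEFICIAL (4, 32, 32) supplies the prefetch
+   slot count and the L1 cache size and line size that arm_option_override
+   below feeds into the corresponding --param settings (unless the user has
+   overridden them); an illustrative note, not a new tuning entry.  */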
+
const struct tune_params arm_slowmul_tune =
{
arm_slowmul_rtx_costs,
- 3
+ NULL,
+ 3,
+ ARM_PREFETCH_NOT_BENEFICIAL
};
const struct tune_params arm_fastmul_tune =
{
arm_fastmul_rtx_costs,
- 1
+ NULL,
+ 1,
+ ARM_PREFETCH_NOT_BENEFICIAL
};
const struct tune_params arm_xscale_tune =
{
arm_xscale_rtx_costs,
- 2
+ xscale_sched_adjust_cost,
+ 2,
+ ARM_PREFETCH_NOT_BENEFICIAL
};
const struct tune_params arm_9e_tune =
{
arm_9e_rtx_costs,
- 1
+ NULL,
+ 1,
+ ARM_PREFETCH_NOT_BENEFICIAL
+};
+
+const struct tune_params arm_cortex_a9_tune =
+{
+ arm_9e_rtx_costs,
+ cortex_a9_sched_adjust_cost,
+ 1,
+ ARM_PREFETCH_BENEFICIAL(4,32,32)
};
+
/* Not all of these give usefully different compilation alternatives,
but there is no simple way of generalizing them. */
static const struct processors all_cores[] =
va_list_type);
DECL_ARTIFICIAL (va_list_name) = 1;
TYPE_NAME (va_list_type) = va_list_name;
+ TYPE_STUB_DECL (va_list_type) = va_list_name;
/* Create the __ap field. */
ap_field = build_decl (BUILTINS_LOCATION,
FIELD_DECL,
{
const char *p;
- GET_ENVIRONMENT (p, "COLUMNS");
+ p = getenv ("COLUMNS");
if (p != NULL)
{
int value = atoi (p);
}
-/* Fix up any incompatible options that the user has specified.
- This has now turned into a maze. */
-void
-arm_override_options (void)
+/* Fix up any incompatible options that the user has specified. */
+static void
+arm_option_override (void)
{
unsigned i;
+#ifdef SUBTARGET_OVERRIDE_OPTIONS
+ SUBTARGET_OVERRIDE_OPTIONS;
+#endif
+
if (arm_selected_arch)
{
if (arm_selected_cpu)
arm_selected_cpu = &all_cores[SUBTARGET_CPU_DEFAULT];
#endif
/* Default to ARM6. */
- if (arm_selected_cpu->name)
+ if (!arm_selected_cpu->name)
arm_selected_cpu = &all_cores[arm6];
}
/* Callee super interworking implies thumb interworking. Adding
this to the flags here simplifies the logic elsewhere. */
if (TARGET_THUMB && TARGET_CALLEE_INTERWORKING)
- target_flags |= MASK_INTERWORK;
+ target_flags |= MASK_INTERWORK;
/* TARGET_BACKTRACE calls leaf_function_p, which causes a crash if done
from here where no function is being compiled currently. */
if (TARGET_ARM && TARGET_CALLEE_INTERWORKING)
warning (0, "enabling callee interworking support is only meaningful when compiling for the Thumb");
- if (TARGET_ARM && TARGET_CALLER_INTERWORKING)
- warning (0, "enabling caller interworking support is only meaningful when compiling for the Thumb");
-
if (TARGET_APCS_STACK && !TARGET_APCS_FRAME)
{
warning (0, "-mapcs-stack-check incompatible with -mno-apcs-frame");
arm_arch6 = (insn_flags & FL_ARCH6) != 0;
arm_arch6k = (insn_flags & FL_ARCH6K) != 0;
arm_arch_notm = (insn_flags & FL_NOTM) != 0;
+ arm_arch7 = (insn_flags & FL_ARCH7) != 0;
arm_arch7em = (insn_flags & FL_ARCH7EM) != 0;
arm_arch_thumb2 = (insn_flags & FL_THUMB2) != 0;
arm_arch_xscale = (insn_flags & FL_XSCALE) != 0;
arm_ld_sched = (tune_flags & FL_LDSCHED) != 0;
arm_tune_strongarm = (tune_flags & FL_STRONG) != 0;
- thumb_code = (TARGET_ARM == 0);
+ thumb_code = TARGET_ARM == 0;
+ thumb1_code = TARGET_THUMB1 != 0;
arm_tune_wbuf = (tune_flags & FL_WBUF) != 0;
arm_tune_xscale = (tune_flags & FL_XSCALE) != 0;
arm_arch_iwmmxt = (insn_flags & FL_IWMMXT) != 0;
flag_reorder_blocks = 1;
}
+ if (flag_pic)
+ /* Hoisting PIC address calculations more aggressively provides a small,
+ but measurable, size reduction for PIC code. Therefore, we decrease
+ the bar for unrestricted expression hoisting to the cost of PIC address
+ calculation, which is 2 instructions. */
+ maybe_set_param_value (PARAM_GCSE_UNRESTRICTED_COST, 2,
+ global_options.x_param_values,
+ global_options_set.x_param_values);
+
+ /* ARM EABI defaults to strict volatile bitfields. */
+ if (TARGET_AAPCS_BASED && flag_strict_volatile_bitfields < 0)
+ flag_strict_volatile_bitfields = 1;
+
+  /* Enable software prefetching at -O3 for CPUs that have prefetch, and we
+     have deemed it beneficial (signified by setting num_prefetch_slots to 1
+     or more).  */
+ if (flag_prefetch_loop_arrays < 0
+ && HAVE_prefetch
+ && optimize >= 3
+ && current_tune->num_prefetch_slots > 0)
+ flag_prefetch_loop_arrays = 1;
+
+  /* Set up the parameters to be used in the prefetching algorithm.  Do not
+     override the defaults unless we are tuning for a core whose values we
+     have researched.  */
+ if (current_tune->num_prefetch_slots > 0)
+ maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
+ current_tune->num_prefetch_slots,
+ global_options.x_param_values,
+ global_options_set.x_param_values);
+ if (current_tune->l1_cache_line_size >= 0)
+ maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
+ current_tune->l1_cache_line_size,
+ global_options.x_param_values,
+ global_options_set.x_param_values);
+ if (current_tune->l1_cache_size >= 0)
+ maybe_set_param_value (PARAM_L1_CACHE_SIZE,
+ current_tune->l1_cache_size,
+ global_options.x_param_values,
+ global_options_set.x_param_values);
+
/* Register global variables with the garbage collector. */
arm_add_gc_roots ();
}
if (optimize > 0
&& (TREE_NOTHROW (current_function_decl)
|| !(flag_unwind_tables
- || (flag_exceptions && !USING_SJLJ_EXCEPTIONS)))
+ || (flag_exceptions
+ && arm_except_unwind_info (&global_options) != UI_SJLJ)))
&& TREE_THIS_VOLATILE (current_function_decl))
type |= ARM_FT_VOLATILE;
{
HOST_WIDE_INT v;
- /* Allow repeated pattern. */
+ /* Allow repeated patterns 0x00XY00XY or 0xXYXYXYXY. */
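+      /* For example, 0x00120012 and 0x12121212 satisfy the test below,
+         and 0x12001200 satisfies the one that follows it (illustrative
+         values).  */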
v = i & 0xff;
v |= v << 16;
if (i == v || i == (v | (v << 8)))
return TRUE;
+
+ /* Allow repeated pattern 0xXY00XY00. */
+ v = i & 0xff00;
+ v |= v << 16;
+ if (i == v)
+ return TRUE;
}
return FALSE;
immediate value easier to load. */
enum rtx_code
-arm_canonicalize_comparison (enum rtx_code code, enum machine_mode mode,
- rtx * op1)
+arm_canonicalize_comparison (enum rtx_code code, rtx *op0, rtx *op1)
{
- unsigned HOST_WIDE_INT i = INTVAL (*op1);
- unsigned HOST_WIDE_INT maxval;
+ enum machine_mode mode;
+ unsigned HOST_WIDE_INT i, maxval;
+
+ mode = GET_MODE (*op0);
+ if (mode == VOIDmode)
+ mode = GET_MODE (*op1);
+
maxval = (((unsigned HOST_WIDE_INT) 1) << (GET_MODE_BITSIZE(mode) - 1)) - 1;
+ /* For DImode, we have GE/LT/GEU/LTU comparisons. In ARM mode
+ we can also use cmp/cmpeq for GTU/LEU. GT/LE must be either
+ reversed or (for constant OP1) adjusted to GE/LT. Similarly
+ for GTU/LEU in Thumb mode. */
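+  /* As an illustrative sketch: (GT x 5) can become (GE x 6) when 6 can be
+     loaded as a pair of valid immediates, while a register-register
+     (GT x y) is handled by swapping the operands and returning LT.  */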
+ if (mode == DImode)
+ {
+ rtx tem;
+
+ /* To keep things simple, always use the Cirrus cfcmp64 if it is
+ available. */
+ if (TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK)
+ return code;
+
+ if (code == GT || code == LE
+ || (!TARGET_ARM && (code == GTU || code == LEU)))
+ {
+ /* Missing comparison. First try to use an available
+ comparison. */
+ if (GET_CODE (*op1) == CONST_INT)
+ {
+ i = INTVAL (*op1);
+ switch (code)
+ {
+ case GT:
+ case LE:
+ if (i != maxval
+ && arm_const_double_by_immediates (GEN_INT (i + 1)))
+ {
+ *op1 = GEN_INT (i + 1);
+ return code == GT ? GE : LT;
+ }
+ break;
+ case GTU:
+ case LEU:
+ if (i != ~((unsigned HOST_WIDE_INT) 0)
+ && arm_const_double_by_immediates (GEN_INT (i + 1)))
+ {
+ *op1 = GEN_INT (i + 1);
+ return code == GTU ? GEU : LTU;
+ }
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ }
+
+ /* If that did not work, reverse the condition. */
+ tem = *op0;
+ *op0 = *op1;
+ *op1 = tem;
+ return swap_condition (code);
+ }
+
+ return code;
+ }
+
+ /* Comparisons smaller than DImode. Only adjust comparisons against
+ an out-of-range constant. */
+ if (GET_CODE (*op1) != CONST_INT
+ || const_ok_for_arm (INTVAL (*op1))
+ || const_ok_for_arm (- INTVAL (*op1)))
+ return code;
+
+ i = INTVAL (*op1);
+
switch (code)
{
case EQ:
have been created by C++. */
for (field = TYPE_FIELDS (type);
field && TREE_CODE (field) != FIELD_DECL;
- field = TREE_CHAIN (field))
+ field = DECL_CHAIN (field))
continue;
if (field == NULL)
/* Now check the remaining fields, if any. Only bitfields are allowed,
since they are not addressable. */
- for (field = TREE_CHAIN (field);
+ for (field = DECL_CHAIN (field);
field;
- field = TREE_CHAIN (field))
+ field = DECL_CHAIN (field))
{
if (TREE_CODE (field) != FIELD_DECL)
continue;
integral, or can be returned in an integer register. */
for (field = TYPE_FIELDS (type);
field;
- field = TREE_CHAIN (field))
+ field = DECL_CHAIN (field))
{
if (TREE_CODE (field) != FIELD_DECL)
continue;
/* Detect varargs functions. These always use the base rules
(no argument is ever a candidate for a co-processor
register). */
- bool base_rules = (TYPE_ARG_TYPES (type) != 0
- && (TREE_VALUE (tree_last (TYPE_ARG_TYPES (type)))
- != void_type_node));
+ bool base_rules = stdarg_p (type);
if (user_convention)
{
if (user_pcs > ARM_PCS_AAPCS_LOCAL)
- sorry ("Non-AAPCS derived PCS variant");
+ sorry ("non-AAPCS derived PCS variant");
else if (base_rules && user_pcs != ARM_PCS_AAPCS)
- error ("Variadic functions must use the base AAPCS variant");
+ error ("variadic functions must use the base AAPCS variant");
}
if (base_rules)
if (!COMPLETE_TYPE_P(type))
return -1;
- for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
+ for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
{
if (TREE_CODE (field) != FIELD_DECL)
continue;
if (!COMPLETE_TYPE_P(type))
return -1;
- for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
+ for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
{
if (TREE_CODE (field) != FIELD_DECL)
continue;
static int
aapcs_select_call_coproc (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
- tree type)
+ const_tree type)
{
int i;
numbers referred to here are those in the AAPCS. */
static void
aapcs_layout_arg (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
- tree type, int named)
+ const_tree type, bool named)
{
int nregs, nregs2;
int ncrn;
/* Return true if mode/type need doubleword alignment. */
-bool
-arm_needs_doubleword_align (enum machine_mode mode, tree type)
+static bool
+arm_needs_doubleword_align (enum machine_mode mode, const_tree type)
{
return (GET_MODE_ALIGNMENT (mode) > PARM_BOUNDARY
|| (type && TYPE_ALIGN (type) > PARM_BOUNDARY));
CUM is a variable of type CUMULATIVE_ARGS which gives info about
the preceding args and about the function being called.
NAMED is nonzero if this argument is a named parameter
- (otherwise it is an extra parameter matching an ellipsis). */
+ (otherwise it is an extra parameter matching an ellipsis).
-rtx
+ On the ARM, normally the first 16 bytes are passed in registers r0-r3; all
+ other arguments are passed on the stack. If (NAMED == 0) (which happens
+ only in assign_parms, since TARGET_SETUP_INCOMING_VARARGS is
+   defined), say it is passed on the stack (function_prologue will
+   indeed make it pass on the stack if necessary).  */
+
+static rtx
arm_function_arg (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
- tree type, int named)
+ const_tree type, bool named)
{
int nregs;
&& arm_needs_doubleword_align (mode, type))
pcum->nregs++;
- if (mode == VOIDmode)
- /* Pick an arbitrary value for operand 2 of the call insn. */
- return const0_rtx;
-
/* Only allow splitting an arg between regs and memory if all preceding
args were allocated to regs. For args passed by reference we only count
the reference pointer. */
return gen_rtx_REG (mode, pcum->nregs);
}
+static unsigned int
+arm_function_arg_boundary (enum machine_mode mode, const_tree type)
+{
+ return (ARM_DOUBLEWORD_ALIGN && arm_needs_doubleword_align (mode, type)
+ ? DOUBLEWORD_ALIGNMENT
+ : PARM_BOUNDARY);
+}
+
static int
arm_arg_partial_bytes (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
tree type, bool named)
return 0;
}
-void
+/* Update the data in PCUM to advance over an argument
+ of mode MODE and data type TYPE.
+ (TYPE is null for libcalls where that information may not be available.) */
+
+static void
arm_function_arg_advance (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
- tree type, bool named)
+ const_tree type, bool named)
{
if (pcum->pcs_variant <= ARM_PCS_AAPCS_LOCAL)
{
if (GET_CODE (orig) == SYMBOL_REF
|| GET_CODE (orig) == LABEL_REF)
{
- rtx pic_ref, address;
rtx insn;
if (reg == 0)
{
gcc_assert (can_create_pseudo_p ());
reg = gen_reg_rtx (Pmode);
- address = gen_reg_rtx (Pmode);
}
- else
- address = reg;
/* VxWorks does not impose a fixed gap between segments; the run-time
gap can be different from the object-file gap. We therefore can't
insn = arm_pic_static_addr (orig, reg);
else
{
+ rtx pat;
+ rtx mem;
+
/* If this function doesn't have a pic register, create one now. */
require_pic_register ();
- if (TARGET_32BIT)
- emit_insn (gen_pic_load_addr_32bit (address, orig));
- else /* TARGET_THUMB1 */
- emit_insn (gen_pic_load_addr_thumb1 (address, orig));
+ pat = gen_calculate_pic_address (reg, cfun->machine->pic_reg, orig);
+
+ /* Make the MEM as close to a constant as possible. */
+ mem = SET_SRC (pat);
+ gcc_assert (MEM_P (mem) && !MEM_VOLATILE_P (mem));
+ MEM_READONLY_P (mem) = 1;
+ MEM_NOTRAP_P (mem) = 1;
- pic_ref = gen_const_mem (Pmode,
- gen_rtx_PLUS (Pmode, cfun->machine->pic_reg,
- address));
- insn = emit_move_insn (reg, pic_ref);
+ insn = emit_insn (pat);
}
/* Put a REG_EQUAL note on this insn, so that it can be optimized
return FALSE;
}
+/* Return true if X will surely end up in an index register after next
+ splitting pass. */
+static bool
+will_be_in_index_register (const_rtx x)
+{
+ /* arm.md: calculate_pic_address will split this into a register. */
+ return GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_PIC_SYM;
+}
+
/* Return nonzero if X is a valid ARM state address operand. */
int
arm_legitimate_address_outer_p (enum machine_mode mode, rtx x, RTX_CODE outer,
rtx xop1 = XEXP (x, 1);
return ((arm_address_register_rtx_p (xop0, strict_p)
- && GET_CODE(xop1) == CONST_INT
- && arm_legitimate_index_p (mode, xop1, outer, strict_p))
+ && ((GET_CODE(xop1) == CONST_INT
+ && arm_legitimate_index_p (mode, xop1, outer, strict_p))
+ || (!strict_p && will_be_in_index_register (xop1))))
|| (arm_address_register_rtx_p (xop1, strict_p)
&& arm_legitimate_index_p (mode, xop0, outer, strict_p)));
}
rtx xop1 = XEXP (x, 1);
return ((arm_address_register_rtx_p (xop0, strict_p)
- && thumb2_legitimate_index_p (mode, xop1, strict_p))
+ && (thumb2_legitimate_index_p (mode, xop1, strict_p)
+ || (!strict_p && will_be_in_index_register (xop1))))
|| (arm_address_register_rtx_p (xop1, strict_p)
&& thumb2_legitimate_index_p (mode, xop0, strict_p)));
}
&& INTVAL (index) > -1024
&& (INTVAL (index) & 3) == 0);
- if (TARGET_NEON
- && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode)))
+ /* For quad modes, we restrict the constant offset to be slightly less
+ than what the instruction format permits. We do this because for
+ quad mode moves, we will actually decompose them into two separate
+ double-mode reads or writes. INDEX must therefore be a valid
+ (double-mode) offset and so should INDEX+8. */
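+  /* (E.g. 1012 is the largest offset accepted here: it is a multiple of 4
+     below 1016, and 1012 + 8 = 1020 still fits the double-mode range.)  */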
+ if (TARGET_NEON && VALID_NEON_QREG_MODE (mode))
return (code == CONST_INT
&& INTVAL (index) < 1016
&& INTVAL (index) > -1024
&& (INTVAL (index) & 3) == 0);
+ /* We have no such constraint on double mode offsets, so we permit the
+ full range of the instruction format. */
+ if (TARGET_NEON && VALID_NEON_DREG_MODE (mode))
+ return (code == CONST_INT
+ && INTVAL (index) < 1024
+ && INTVAL (index) > -1024
+ && (INTVAL (index) & 3) == 0);
+
if (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (mode))
return (code == CONST_INT
&& INTVAL (index) < 1024
&& (INTVAL (index) & 3) == 0);
}
- if (TARGET_NEON
- && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode)))
+ /* For quad modes, we restrict the constant offset to be slightly less
+ than what the instruction format permits. We do this because for
+ quad mode moves, we will actually decompose them into two separate
+ double-mode reads or writes. INDEX must therefore be a valid
+ (double-mode) offset and so should INDEX+8. */
+ if (TARGET_NEON && VALID_NEON_QREG_MODE (mode))
return (code == CONST_INT
&& INTVAL (index) < 1016
&& INTVAL (index) > -1024
&& (INTVAL (index) & 3) == 0);
+ /* We have no such constraint on double mode offsets, so we permit the
+ full range of the instruction format. */
+ if (TARGET_NEON && VALID_NEON_DREG_MODE (mode))
+ return (code == CONST_INT
+ && INTVAL (index) < 1024
+ && INTVAL (index) > -1024
+ && (INTVAL (index) & 3) == 0);
+
if (arm_address_register_rtx_p (index, strict_p)
&& (GET_MODE_SIZE (mode) <= 4))
return 1;
&& XEXP (x, 0) != frame_pointer_rtx
&& XEXP (x, 1) != frame_pointer_rtx
&& thumb1_index_register_rtx_p (XEXP (x, 0), strict_p)
- && thumb1_index_register_rtx_p (XEXP (x, 1), strict_p))
+ && (thumb1_index_register_rtx_p (XEXP (x, 1), strict_p)
+ || (!strict_p && will_be_in_index_register (XEXP (x, 1)))))
return 1;
/* REG+const has 5-7 bit offset for non-SP registers. */
&& (REGNO (XEXP (x, 0)) == FRAME_POINTER_REGNUM
|| REGNO (XEXP (x, 0)) == ARG_POINTER_REGNUM
|| (REGNO (XEXP (x, 0)) >= FIRST_VIRTUAL_REGISTER
- && REGNO (XEXP (x, 0)) <= LAST_VIRTUAL_REGISTER))
+ && REGNO (XEXP (x, 0))
+ <= LAST_VIRTUAL_POINTER_REGISTER))
&& GET_MODE_SIZE (mode) >= 4
&& GET_CODE (XEXP (x, 1)) == CONST_INT
&& (INTVAL (XEXP (x, 1)) & 3) == 0)
#define REG_OR_SUBREG_RTX(X) \
(GET_CODE (X) == REG ? (X) : SUBREG_REG (X))
-#ifndef COSTS_N_INSNS
-#define COSTS_N_INSNS(N) ((N) * 4 - 2)
-#endif
static inline int
thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
{
enum machine_mode mode = GET_MODE (x);
+ int total;
switch (code)
{
return 14;
return 2;
+ case SIGN_EXTEND:
case ZERO_EXTEND:
- /* XXX still guessing. */
- switch (GET_MODE (XEXP (x, 0)))
- {
- case QImode:
- return (1 + (mode == DImode ? 4 : 0)
- + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0));
+ total = mode == DImode ? COSTS_N_INSNS (1) : 0;
+ total += thumb1_rtx_costs (XEXP (x, 0), GET_CODE (XEXP (x, 0)), code);
- case HImode:
- return (4 + (mode == DImode ? 4 : 0)
- + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0));
+ if (mode == SImode)
+ return total;
- case SImode:
- return (1 + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0));
+ if (arm_arch6)
+ return total + COSTS_N_INSNS (1);
- default:
- return 99;
- }
+ /* Assume a two-shift sequence. Increase the cost slightly so
+ we prefer actual shifts over an extend operation. */
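+      /* (Illustrative: a Thumb-1 QImode sign extension without the v6
+         extend instructions becomes a left shift by 24 followed by an
+         arithmetic right shift by 24.)  */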
+ return total + 1 + COSTS_N_INSNS (2);
default:
return 99;
since then they might not be moved outside of loops. As a compromise
we allow integration with ops that have a constant as their second
operand. */
- if ((REG_OR_SUBREG_REG (XEXP (x, 0))
- && ARM_FRAME_RTX (REG_OR_SUBREG_RTX (XEXP (x, 0)))
- && GET_CODE (XEXP (x, 1)) != CONST_INT)
- || (REG_OR_SUBREG_REG (XEXP (x, 0))
- && ARM_FRAME_RTX (REG_OR_SUBREG_RTX (XEXP (x, 0)))))
- *total = 4;
+ if (REG_OR_SUBREG_REG (XEXP (x, 0))
+ && ARM_FRAME_RTX (REG_OR_SUBREG_RTX (XEXP (x, 0)))
+ && GET_CODE (XEXP (x, 1)) != CONST_INT)
+ *total = COSTS_N_INSNS (1);
if (mode == DImode)
{
return false;
case SIGN_EXTEND:
- if (GET_MODE_CLASS (mode) == MODE_INT)
- {
- *total = 0;
- if (mode == DImode)
- *total += COSTS_N_INSNS (1);
-
- if (GET_MODE (XEXP (x, 0)) != SImode)
- {
- if (arm_arch6)
- {
- if (GET_CODE (XEXP (x, 0)) != MEM)
- *total += COSTS_N_INSNS (1);
- }
- else if (!arm_arch4 || GET_CODE (XEXP (x, 0)) != MEM)
- *total += COSTS_N_INSNS (2);
- }
-
- return false;
- }
-
- /* Fall through */
case ZERO_EXTEND:
*total = 0;
if (GET_MODE_CLASS (mode) == MODE_INT)
{
+ rtx op = XEXP (x, 0);
+ enum machine_mode opmode = GET_MODE (op);
+
if (mode == DImode)
*total += COSTS_N_INSNS (1);
- if (GET_MODE (XEXP (x, 0)) != SImode)
+ if (opmode != SImode)
{
- if (arm_arch6)
+ if (MEM_P (op))
{
- if (GET_CODE (XEXP (x, 0)) != MEM)
- *total += COSTS_N_INSNS (1);
+ /* If !arm_arch4, we use one of the extendhisi2_mem
+ or movhi_bytes patterns for HImode. For a QImode
+ sign extension, we first zero-extend from memory
+ and then perform a shift sequence. */
+ if (!arm_arch4 && (opmode != QImode || code == SIGN_EXTEND))
+ *total += COSTS_N_INSNS (2);
}
- else if (!arm_arch4 || GET_CODE (XEXP (x, 0)) != MEM)
- *total += COSTS_N_INSNS (GET_MODE (XEXP (x, 0)) == QImode ?
- 1 : 2);
+ else if (arm_arch6)
+ *total += COSTS_N_INSNS (1);
+
+ /* We don't have the necessary insn, so we need to perform some
+ other operation. */
+ else if (TARGET_ARM && code == ZERO_EXTEND && mode == QImode)
+ /* An and with constant 255. */
+ *total += COSTS_N_INSNS (1);
+ else
+ /* A shift sequence. Increase costs slightly to avoid
+ combining two shifts into an extend operation. */
+ *total += COSTS_N_INSNS (2) + 1;
}
return false;
if (outer == SET)
{
if ((unsigned HOST_WIDE_INT) INTVAL (x) < 256)
- return 0;
+ return COSTS_N_INSNS (1);
+ /* See split "TARGET_THUMB1 && satisfies_constraint_J". */
+ if (INTVAL (x) >= -255 && INTVAL (x) <= -1)
+ return COSTS_N_INSNS (2);
+ /* See split "TARGET_THUMB1 && satisfies_constraint_K". */
if (thumb_shiftable_const (INTVAL (x)))
return COSTS_N_INSNS (2);
return COSTS_N_INSNS (3);
a single register, otherwise it costs one insn per word. */
if (REG_P (XEXP (x, 0)))
*total = COSTS_N_INSNS (1);
+ else if (flag_pic
+ && GET_CODE (XEXP (x, 0)) == PLUS
+ && will_be_in_index_register (XEXP (XEXP (x, 0), 1)))
+ /* This will be split into two instructions.
+ See arm.md:calculate_pic_address. */
+ *total = COSTS_N_INSNS (2);
else
*total = COSTS_N_INSNS (ARM_NUM_REGS (mode));
return true;
return false;
case SIGN_EXTEND:
- *total = 0;
- if (GET_MODE_SIZE (GET_MODE (XEXP (x, 0))) < 4)
- {
- if (!(arm_arch4 && MEM_P (XEXP (x, 0))))
- *total += COSTS_N_INSNS (arm_arch6 ? 1 : 2);
- }
- if (mode == DImode)
- *total += COSTS_N_INSNS (1);
- return false;
-
case ZERO_EXTEND:
- *total = 0;
- if (!(arm_arch4 && MEM_P (XEXP (x, 0))))
- {
- switch (GET_MODE (XEXP (x, 0)))
- {
- case QImode:
- *total += COSTS_N_INSNS (1);
- break;
-
- case HImode:
- *total += COSTS_N_INSNS (arm_arch6 ? 1 : 2);
-
- case SImode:
- break;
-
- default:
- *total += COSTS_N_INSNS (2);
- }
- }
-
- if (mode == DImode)
- *total += COSTS_N_INSNS (1);
-
- return false;
+ return arm_rtx_costs_1 (x, outer_code, total, 0);
case CONST_INT:
if (const_ok_for_arm (INTVAL (x)))
return TARGET_32BIT ? arm_arm_address_cost (x) : arm_thumb_address_cost (x);
}
-static int
-arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost)
+/* Adjust cost hook for XScale. */
+static bool
+xscale_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost)
{
- rtx i_pat, d_pat;
-
/* Some true dependencies can have a higher cost depending
on precisely how certain input operands are used. */
- if (arm_tune_xscale
- && REG_NOTE_KIND (link) == 0
+ if (REG_NOTE_KIND(link) == 0
&& recog_memoized (insn) >= 0
&& recog_memoized (dep) >= 0)
{
if (reg_overlap_mentioned_p (recog_data.operand[opno],
shifted_operand))
- return 2;
+ {
+ *cost = 2;
+ return false;
+ }
}
}
}
+ return true;
+}
+
+/* Adjust cost hook for Cortex A9. */
+static bool
+cortex_a9_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost)
+{
+ switch (REG_NOTE_KIND (link))
+ {
+ case REG_DEP_ANTI:
+ *cost = 0;
+ return false;
+
+ case REG_DEP_TRUE:
+ case REG_DEP_OUTPUT:
+ if (recog_memoized (insn) >= 0
+ && recog_memoized (dep) >= 0)
+ {
+ if (GET_CODE (PATTERN (insn)) == SET)
+ {
+ if (GET_MODE_CLASS
+ (GET_MODE (SET_DEST (PATTERN (insn)))) == MODE_FLOAT
+ || GET_MODE_CLASS
+ (GET_MODE (SET_SRC (PATTERN (insn)))) == MODE_FLOAT)
+ {
+ enum attr_type attr_type_insn = get_attr_type (insn);
+ enum attr_type attr_type_dep = get_attr_type (dep);
+
+ /* By default all dependencies of the form
+ s0 = s0 <op> s1
+ s0 = s0 <op> s2
+ have an extra latency of 1 cycle because
+ of the input and output dependency in this
+                         case.  However, this gets modeled as a true
+                         dependency, hence all these checks.  */
+ if (REG_P (SET_DEST (PATTERN (insn)))
+ && REG_P (SET_DEST (PATTERN (dep)))
+ && reg_overlap_mentioned_p (SET_DEST (PATTERN (insn)),
+ SET_DEST (PATTERN (dep))))
+ {
+			      /* FMACS is a special case where the dependent
+ instruction can be issued 3 cycles before
+ the normal latency in case of an output
+ dependency. */
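+			      /* (E.g. if the default latency of the FMAC is
+				 8, an output dependency here yields a cost
+				 of 8 - 3 = 5; illustrative numbers.)  */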
+ if ((attr_type_insn == TYPE_FMACS
+ || attr_type_insn == TYPE_FMACD)
+ && (attr_type_dep == TYPE_FMACS
+ || attr_type_dep == TYPE_FMACD))
+ {
+ if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT)
+ *cost = insn_default_latency (dep) - 3;
+ else
+ *cost = insn_default_latency (dep);
+ return false;
+ }
+ else
+ {
+ if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT)
+ *cost = insn_default_latency (dep) + 1;
+ else
+ *cost = insn_default_latency (dep);
+ }
+ return false;
+ }
+ }
+ }
+ }
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ return true;
+}
+
+/* This function implements the target macro TARGET_SCHED_ADJUST_COST.
+ It corrects the value of COST based on the relationship between
+ INSN and DEP through the dependence LINK. It returns the new
+   value.  There is a per-core adjust_cost hook to adjust scheduler costs,
+   and the per-core hook can choose to completely override the generic
+ adjust_cost function. Only put bits of code into arm_adjust_cost that
+ are common across all cores. */
+static int
+arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost)
+{
+ rtx i_pat, d_pat;
+
+ /* When generating Thumb-1 code, we want to place flag-setting operations
+ close to a conditional branch which depends on them, so that we can
+ omit the comparison. */
+ if (TARGET_THUMB1
+ && REG_NOTE_KIND (link) == 0
+ && recog_memoized (insn) == CODE_FOR_cbranchsi4_insn
+ && recog_memoized (dep) >= 0
+ && get_attr_conds (dep) == CONDS_SET)
+ return 0;
+
+ if (current_tune->sched_adjust_cost != NULL)
+ {
+ if (!current_tune->sched_adjust_cost (insn, link, dep, &cost))
+ return cost;
+ }
/* XXX This is not strictly true for the FPA. */
if (REG_NOTE_KIND (link) == REG_DEP_ANTI
constant pool are cached, and that others will miss. This is a
hack. */
- if ((GET_CODE (src_mem) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (src_mem))
+ if ((GET_CODE (src_mem) == SYMBOL_REF
+ && CONSTANT_POOL_ADDRESS_P (src_mem))
|| reg_mentioned_p (stack_pointer_rtx, src_mem)
|| reg_mentioned_p (frame_pointer_rtx, src_mem)
|| reg_mentioned_p (hard_frame_pointer_rtx, src_mem))
return arm_address_register_rtx_p (ind, 0);
/* Allow post-increment with Neon registers. */
- if (type != 1 && (GET_CODE (ind) == POST_INC || GET_CODE (ind) == PRE_DEC))
+ if ((type != 1 && GET_CODE (ind) == POST_INC)
+ || (type == 0 && GET_CODE (ind) == PRE_DEC))
return arm_address_register_rtx_p (XEXP (ind, 0), 0);
/* FIXME: vld1 allows register post-modify. */
if (nops == 2 && arm_ld_sched && add_offset != 0)
return false;
+ /* XScale has load-store double instructions, but they have stricter
+ alignment requirements than load-store multiple, so we cannot
+ use them.
+
+ For XScale ldm requires 2 + NREGS cycles to complete and blocks
+ the pipeline until completion.
+
+ NREGS CYCLES
+ 1 3
+ 2 4
+ 3 5
+ 4 6
+
+ An ldr instruction takes 1-3 cycles, but does not block the
+ pipeline.
+
+ NREGS CYCLES
+ 1 1-3
+ 2 2-6
+ 3 3-9
+ 4 4-12
+
+ Best case ldr will always win. However, the more ldr instructions
+ we issue, the less likely we are to be able to schedule them well.
+ Using ldr instructions also increases code size.
+
+ As a compromise, we use ldr for counts of 1 or 2 regs, and ldm
+ for counts of 3 or 4 regs. */
+ if (nops <= 2 && arm_tune_xscale && !optimize_size)
+ return false;
return true;
}
return true;
}
-int
-load_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
- HOST_WIDE_INT *load_offset)
+/* Used to determine in a peephole whether a sequence of load
+ instructions can be changed into a load-multiple instruction.
+ NOPS is the number of separate load instructions we are examining. The
+ first NOPS entries in OPERANDS are the destination registers, the
+ next NOPS entries are memory operands. If this function is
+ successful, *BASE is set to the common base register of the memory
+ accesses; *LOAD_OFFSET is set to the first memory location's offset
+ from that base register.
+ REGS is an array filled in with the destination register numbers.
+   SAVED_ORDER (if nonnull) is an array filled in with an order that maps
+   insn numbers to an ascending order of loads.  If CHECK_REGS is true,
+ the sequence of registers in REGS matches the loads from ascending memory
+ locations, and the function verifies that the register numbers are
+ themselves ascending. If CHECK_REGS is false, the register numbers
+ are stored in the order they are found in the operands. */
+static int
+load_multiple_sequence (rtx *operands, int nops, int *regs, int *saved_order,
+ int *base, HOST_WIDE_INT *load_offset, bool check_regs)
{
int unsorted_regs[MAX_LDM_STM_OPS];
HOST_WIDE_INT unsorted_offsets[MAX_LDM_STM_OPS];
int order[MAX_LDM_STM_OPS];
+ rtx base_reg_rtx = NULL;
int base_reg = -1;
int i, ldm_case;
== CONST_INT)))
{
if (i == 0)
- base_reg = REGNO (reg);
- else
{
- if (base_reg != (int) REGNO (reg))
- /* Not addressed from the same base register. */
+ base_reg = REGNO (reg);
+ base_reg_rtx = reg;
+ if (TARGET_THUMB1 && base_reg > LAST_LO_REGNUM)
return 0;
}
+ else if (base_reg != (int) REGNO (reg))
+ /* Not addressed from the same base register. */
+ return 0;
+
unsorted_regs[i] = (GET_CODE (operands[i]) == REG
? REGNO (operands[i])
: REGNO (SUBREG_REG (operands[i])));
/* If it isn't an integer register, or if it overwrites the
base register but isn't the last insn in the list, then
we can't do this. */
- if (unsorted_regs[i] < 0 || unsorted_regs[i] > 14
+ if (unsorted_regs[i] < 0
+ || (TARGET_THUMB1 && unsorted_regs[i] > LAST_LO_REGNUM)
+ || unsorted_regs[i] > 14
|| (i != nops - 1 && unsorted_regs[i] == base_reg))
return 0;
order[0] has been set to the lowest offset in the list. Sort
the offsets into order, verifying that they are adjacent, and
check that the register numbers are ascending. */
- if (!compute_offset_order (nops, unsorted_offsets, order, unsorted_regs))
+ if (!compute_offset_order (nops, unsorted_offsets, order,
+ check_regs ? unsorted_regs : NULL))
return 0;
+ if (saved_order)
+ memcpy (saved_order, order, sizeof order);
+
if (base)
{
*base = base_reg;
for (i = 0; i < nops; i++)
- regs[i] = unsorted_regs[order[i]];
+ regs[i] = unsorted_regs[check_regs ? order[i] : i];
*load_offset = unsorted_offsets[order[0]];
}
+ if (TARGET_THUMB1
+ && !peep2_reg_dead_p (nops, base_reg_rtx))
+ return 0;
+
if (unsorted_offsets[order[0]] == 0)
ldm_case = 1; /* ldmia */
else if (TARGET_ARM && unsorted_offsets[order[0]] == 4)
ldm_case = 2; /* ldmib */
else if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0)
ldm_case = 3; /* ldmda */
- else if (unsorted_offsets[order[nops - 1]] == -4)
+ else if (TARGET_32BIT && unsorted_offsets[order[nops - 1]] == -4)
ldm_case = 4; /* ldmdb */
else if (const_ok_for_arm (unsorted_offsets[order[0]])
|| const_ok_for_arm (-unsorted_offsets[order[0]]))
return ldm_case;
}
-const char *
-emit_ldm_seq (rtx *operands, int nops)
+/* Used to determine in a peephole whether a sequence of store instructions can
+ be changed into a store-multiple instruction.
+ NOPS is the number of separate store instructions we are examining.
+ NOPS_TOTAL is the total number of instructions recognized by the peephole
+ pattern.
+ The first NOPS entries in OPERANDS are the source registers, the next
+ NOPS entries are memory operands. If this function is successful, *BASE is
+ set to the common base register of the memory accesses; *LOAD_OFFSET is set
+ to the first memory location's offset from that base register. REGS is an
+ array filled in with the source register numbers, REG_RTXS (if nonnull) is
+ likewise filled with the corresponding rtx's.
+   SAVED_ORDER (if nonnull) is an array filled in with an order that maps insn
+   numbers to an ascending order of stores.
+ If CHECK_REGS is true, the sequence of registers in *REGS matches the stores
+ from ascending memory locations, and the function verifies that the register
+ numbers are themselves ascending. If CHECK_REGS is false, the register
+ numbers are stored in the order they are found in the operands. */
+static int
+store_multiple_sequence (rtx *operands, int nops, int nops_total,
+ int *regs, rtx *reg_rtxs, int *saved_order, int *base,
+ HOST_WIDE_INT *load_offset, bool check_regs)
{
- int regs[MAX_LDM_STM_OPS];
- int base_reg;
- HOST_WIDE_INT offset;
- char buf[100];
- int i;
+ int unsorted_regs[MAX_LDM_STM_OPS];
+ rtx unsorted_reg_rtxs[MAX_LDM_STM_OPS];
+ HOST_WIDE_INT unsorted_offsets[MAX_LDM_STM_OPS];
+ int order[MAX_LDM_STM_OPS];
+ int base_reg = -1;
+ rtx base_reg_rtx = NULL;
+ int i, stm_case;
- switch (load_multiple_sequence (operands, nops, regs, &base_reg, &offset))
- {
- case 1:
- strcpy (buf, "ldm%(ia%)\t");
- break;
+ /* Can only handle up to MAX_LDM_STM_OPS insns at present, though could be
+ easily extended if required. */
+ gcc_assert (nops >= 2 && nops <= MAX_LDM_STM_OPS);
- case 2:
- strcpy (buf, "ldm%(ib%)\t");
- break;
-
- case 3:
- strcpy (buf, "ldm%(da%)\t");
- break;
-
- case 4:
- strcpy (buf, "ldm%(db%)\t");
- break;
-
- case 5:
- if (offset >= 0)
- sprintf (buf, "add%%?\t%s%s, %s%s, #%ld", REGISTER_PREFIX,
- reg_names[regs[0]], REGISTER_PREFIX, reg_names[base_reg],
- (long) offset);
- else
- sprintf (buf, "sub%%?\t%s%s, %s%s, #%ld", REGISTER_PREFIX,
- reg_names[regs[0]], REGISTER_PREFIX, reg_names[base_reg],
- (long) -offset);
- output_asm_insn (buf, operands);
- base_reg = regs[0];
- strcpy (buf, "ldm%(ia%)\t");
- break;
-
- default:
- gcc_unreachable ();
- }
-
- sprintf (buf + strlen (buf), "%s%s, {%s%s", REGISTER_PREFIX,
- reg_names[base_reg], REGISTER_PREFIX, reg_names[regs[0]]);
-
- for (i = 1; i < nops; i++)
- sprintf (buf + strlen (buf), ", %s%s", REGISTER_PREFIX,
- reg_names[regs[i]]);
-
- strcat (buf, "}\t%@ phole ldm");
-
- output_asm_insn (buf, operands);
- return "";
-}
-
-int
-store_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
- HOST_WIDE_INT * load_offset)
-{
- int unsorted_regs[MAX_LDM_STM_OPS];
- HOST_WIDE_INT unsorted_offsets[MAX_LDM_STM_OPS];
- int order[MAX_LDM_STM_OPS];
- int base_reg = -1;
- int i, stm_case;
-
- /* Can only handle up to MAX_LDM_STM_OPS insns at present, though could be
- easily extended if required. */
- gcc_assert (nops >= 2 && nops <= MAX_LDM_STM_OPS);
-
- memset (order, 0, MAX_LDM_STM_OPS * sizeof (int));
+ memset (order, 0, MAX_LDM_STM_OPS * sizeof (int));
/* Loop over the operands and check that the memory references are
suitable (i.e. immediate offsets from the same base register). At
&& (GET_CODE (offset = XEXP (XEXP (operands[nops + i], 0), 1))
== CONST_INT)))
{
- unsorted_regs[i] = (GET_CODE (operands[i]) == REG
- ? REGNO (operands[i])
- : REGNO (SUBREG_REG (operands[i])));
+ unsorted_reg_rtxs[i] = (GET_CODE (operands[i]) == REG
+ ? operands[i] : SUBREG_REG (operands[i]));
+ unsorted_regs[i] = REGNO (unsorted_reg_rtxs[i]);
+
if (i == 0)
- base_reg = REGNO (reg);
+ {
+ base_reg = REGNO (reg);
+ base_reg_rtx = reg;
+ if (TARGET_THUMB1 && base_reg > LAST_LO_REGNUM)
+ return 0;
+ }
else if (base_reg != (int) REGNO (reg))
/* Not addressed from the same base register. */
return 0;
/* If it isn't an integer register, then we can't do this. */
- if (unsorted_regs[i] < 0 || unsorted_regs[i] > 14)
+ if (unsorted_regs[i] < 0
+ || (TARGET_THUMB1 && unsorted_regs[i] > LAST_LO_REGNUM)
+ || (TARGET_THUMB2 && unsorted_regs[i] == base_reg)
+ || (TARGET_THUMB2 && unsorted_regs[i] == SP_REGNUM)
+ || unsorted_regs[i] > 14)
return 0;
unsorted_offsets[i] = INTVAL (offset);
order[0] has been set to the lowest offset in the list. Sort
the offsets into order, verifying that they are adjacent, and
check that the register numbers are ascending. */
- if (!compute_offset_order (nops, unsorted_offsets, order, unsorted_regs))
+ if (!compute_offset_order (nops, unsorted_offsets, order,
+ check_regs ? unsorted_regs : NULL))
return 0;
+ if (saved_order)
+ memcpy (saved_order, order, sizeof order);
+
if (base)
{
*base = base_reg;
for (i = 0; i < nops; i++)
- regs[i] = unsorted_regs[order[i]];
+ {
+ regs[i] = unsorted_regs[check_regs ? order[i] : i];
+ if (reg_rtxs)
+ reg_rtxs[i] = unsorted_reg_rtxs[check_regs ? order[i] : i];
+ }
*load_offset = unsorted_offsets[order[0]];
}
+ if (TARGET_THUMB1
+ && !peep2_reg_dead_p (nops_total, base_reg_rtx))
+ return 0;
+
if (unsorted_offsets[order[0]] == 0)
stm_case = 1; /* stmia */
else if (TARGET_ARM && unsorted_offsets[order[0]] == 4)
stm_case = 2; /* stmib */
else if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0)
stm_case = 3; /* stmda */
- else if (unsorted_offsets[order[nops - 1]] == -4)
+ else if (TARGET_32BIT && unsorted_offsets[order[nops - 1]] == -4)
stm_case = 4; /* stmdb */
else
return 0;
return stm_case;
}
+\f
+/* Routines for use in generating RTL. */
-const char *
-emit_stm_seq (rtx *operands, int nops)
+/* Generate a load-multiple instruction. COUNT is the number of loads in
+ the instruction; REGS and MEMS are arrays containing the operands.
+ BASEREG is the base register to be used in addressing the memory operands.
+ WBACK_OFFSET is nonzero if the instruction should update the base
+ register. */
+
+static rtx
+arm_gen_load_multiple_1 (int count, int *regs, rtx *mems, rtx basereg,
+ HOST_WIDE_INT wback_offset)
{
- int regs[MAX_LDM_STM_OPS];
- int base_reg;
- HOST_WIDE_INT offset;
- char buf[100];
- int i;
+ int i = 0, j;
+ rtx result;
- switch (store_multiple_sequence (operands, nops, regs, &base_reg, &offset))
+ if (!multiple_operation_profitable_p (false, count, 0))
{
- case 1:
- strcpy (buf, "stm%(ia%)\t");
- break;
+ rtx seq;
- case 2:
- strcpy (buf, "stm%(ib%)\t");
- break;
+ start_sequence ();
- case 3:
- strcpy (buf, "stm%(da%)\t");
- break;
+ for (i = 0; i < count; i++)
+ emit_move_insn (gen_rtx_REG (SImode, regs[i]), mems[i]);
- case 4:
- strcpy (buf, "stm%(db%)\t");
- break;
+ if (wback_offset != 0)
+ emit_move_insn (basereg, plus_constant (basereg, wback_offset));
- default:
- gcc_unreachable ();
- }
+ seq = get_insns ();
+ end_sequence ();
- sprintf (buf + strlen (buf), "%s%s, {%s%s", REGISTER_PREFIX,
- reg_names[base_reg], REGISTER_PREFIX, reg_names[regs[0]]);
+ return seq;
+ }
- for (i = 1; i < nops; i++)
- sprintf (buf + strlen (buf), ", %s%s", REGISTER_PREFIX,
- reg_names[regs[i]]);
+ result = gen_rtx_PARALLEL (VOIDmode,
+ rtvec_alloc (count + (wback_offset != 0 ? 1 : 0)));
+ if (wback_offset != 0)
+ {
+ XVECEXP (result, 0, 0)
+ = gen_rtx_SET (VOIDmode, basereg,
+ plus_constant (basereg, wback_offset));
+ i = 1;
+ count++;
+ }
- strcat (buf, "}\t%@ phole stm");
+ for (j = 0; i < count; i++, j++)
+ XVECEXP (result, 0, i)
+ = gen_rtx_SET (VOIDmode, gen_rtx_REG (SImode, regs[j]), mems[j]);
- output_asm_insn (buf, operands);
- return "";
+ return result;
}
-\f
-/* Routines for use in generating RTL. */
-rtx
-arm_gen_load_multiple (int base_regno, int count, rtx from, int up,
- int write_back, rtx basemem, HOST_WIDE_INT *offsetp)
+/* Generate a store-multiple instruction. COUNT is the number of stores in
+ the instruction; REGS and MEMS are arrays containing the operands.
+ BASEREG is the base register to be used in addressing the memory operands.
+ WBACK_OFFSET is nonzero if the instruction should update the base
+ register. */
+
+static rtx
+arm_gen_store_multiple_1 (int count, int *regs, rtx *mems, rtx basereg,
+ HOST_WIDE_INT wback_offset)
{
- HOST_WIDE_INT offset = *offsetp;
int i = 0, j;
rtx result;
- int sign = up ? 1 : -1;
- rtx mem, addr;
-
- /* XScale has load-store double instructions, but they have stricter
- alignment requirements than load-store multiple, so we cannot
- use them.
- For XScale ldm requires 2 + NREGS cycles to complete and blocks
- the pipeline until completion.
+ if (GET_CODE (basereg) == PLUS)
+ basereg = XEXP (basereg, 0);
- NREGS CYCLES
- 1 3
- 2 4
- 3 5
- 4 6
-
- An ldr instruction takes 1-3 cycles, but does not block the
- pipeline.
-
- NREGS CYCLES
- 1 1-3
- 2 2-6
- 3 3-9
- 4 4-12
-
- Best case ldr will always win. However, the more ldr instructions
- we issue, the less likely we are to be able to schedule them well.
- Using ldr instructions also increases code size.
-
- As a compromise, we use ldr for counts of 1 or 2 regs, and ldm
- for counts of 3 or 4 regs. */
- if (arm_tune_xscale && count <= 2 && ! optimize_size)
+ if (!multiple_operation_profitable_p (false, count, 0))
{
rtx seq;
start_sequence ();
for (i = 0; i < count; i++)
- {
- addr = plus_constant (from, i * 4 * sign);
- mem = adjust_automodify_address (basemem, SImode, addr, offset);
- emit_move_insn (gen_rtx_REG (SImode, base_regno + i), mem);
- offset += 4 * sign;
- }
+ emit_move_insn (mems[i], gen_rtx_REG (SImode, regs[i]));
- if (write_back)
- {
- emit_move_insn (from, plus_constant (from, count * 4 * sign));
- *offsetp = offset;
- }
+ if (wback_offset != 0)
+ emit_move_insn (basereg, plus_constant (basereg, wback_offset));
seq = get_insns ();
end_sequence ();
}
result = gen_rtx_PARALLEL (VOIDmode,
- rtvec_alloc (count + (write_back ? 1 : 0)));
- if (write_back)
+ rtvec_alloc (count + (wback_offset != 0 ? 1 : 0)));
+ if (wback_offset != 0)
{
XVECEXP (result, 0, 0)
- = gen_rtx_SET (VOIDmode, from, plus_constant (from, count * 4 * sign));
+ = gen_rtx_SET (VOIDmode, basereg,
+ plus_constant (basereg, wback_offset));
i = 1;
count++;
}
for (j = 0; i < count; i++, j++)
+ XVECEXP (result, 0, i)
+ = gen_rtx_SET (VOIDmode, mems[j], gen_rtx_REG (SImode, regs[j]));
+
+ return result;
+}
+
+/* Generate either a load-multiple or a store-multiple instruction. This
+ function can be used in situations where we can start with a single MEM
+ rtx and adjust its address upwards.
+ COUNT is the number of operations in the instruction, not counting a
+ possible update of the base register. REGS is an array containing the
+ register operands.
+ BASEREG is the base register to be used in addressing the memory operands,
+ which are constructed from BASEMEM.
+ WRITE_BACK specifies whether the generated instruction should include an
+ update of the base register.
+ OFFSETP is used to pass an offset to and from this function; this offset
+ is not used when constructing the address (instead BASEMEM should have an
+   appropriate offset in its address); it is used only for setting
+   MEM_OFFSET.  It is updated only if WRITE_BACK is true.  */
+
+static rtx
+arm_gen_multiple_op (bool is_load, int *regs, int count, rtx basereg,
+ bool write_back, rtx basemem, HOST_WIDE_INT *offsetp)
+{
+ rtx mems[MAX_LDM_STM_OPS];
+ HOST_WIDE_INT offset = *offsetp;
+ int i;
+
+ gcc_assert (count <= MAX_LDM_STM_OPS);
+
+ if (GET_CODE (basereg) == PLUS)
+ basereg = XEXP (basereg, 0);
+
+ for (i = 0; i < count; i++)
{
- addr = plus_constant (from, j * 4 * sign);
- mem = adjust_automodify_address_nv (basemem, SImode, addr, offset);
- XVECEXP (result, 0, i)
- = gen_rtx_SET (VOIDmode, gen_rtx_REG (SImode, base_regno + j), mem);
- offset += 4 * sign;
+ rtx addr = plus_constant (basereg, i * 4);
+ mems[i] = adjust_automodify_address_nv (basemem, SImode, addr, offset);
+ offset += 4;
}
if (write_back)
*offsetp = offset;
- return result;
+ if (is_load)
+ return arm_gen_load_multiple_1 (count, regs, mems, basereg,
+ write_back ? 4 * count : 0);
+ else
+ return arm_gen_store_multiple_1 (count, regs, mems, basereg,
+ write_back ? 4 * count : 0);
}
rtx
-arm_gen_store_multiple (int base_regno, int count, rtx to, int up,
- int write_back, rtx basemem, HOST_WIDE_INT *offsetp)
+arm_gen_load_multiple (int *regs, int count, rtx basereg, int write_back,
+ rtx basemem, HOST_WIDE_INT *offsetp)
{
- HOST_WIDE_INT offset = *offsetp;
- int i = 0, j;
- rtx result;
- int sign = up ? 1 : -1;
- rtx mem, addr;
+ return arm_gen_multiple_op (TRUE, regs, count, basereg, write_back, basemem,
+ offsetp);
+}
- /* See arm_gen_load_multiple for discussion of
- the pros/cons of ldm/stm usage for XScale. */
- if (arm_tune_xscale && count <= 2 && ! optimize_size)
- {
- rtx seq;
+rtx
+arm_gen_store_multiple (int *regs, int count, rtx basereg, int write_back,
+ rtx basemem, HOST_WIDE_INT *offsetp)
+{
+ return arm_gen_multiple_op (FALSE, regs, count, basereg, write_back, basemem,
+ offsetp);
+}
- start_sequence ();
+/* Called from a peephole2 expander to turn a sequence of loads into an
+ LDM instruction. OPERANDS are the operands found by the peephole matcher;
+ NOPS indicates how many separate loads we are trying to combine. SORT_REGS
+ is true if we can reorder the registers because they are used commutatively
+ subsequently.
+ Returns true iff we could generate a new instruction. */
- for (i = 0; i < count; i++)
- {
- addr = plus_constant (to, i * 4 * sign);
- mem = adjust_automodify_address (basemem, SImode, addr, offset);
- emit_move_insn (mem, gen_rtx_REG (SImode, base_regno + i));
- offset += 4 * sign;
- }
+bool
+gen_ldm_seq (rtx *operands, int nops, bool sort_regs)
+{
+ int regs[MAX_LDM_STM_OPS], mem_order[MAX_LDM_STM_OPS];
+ rtx mems[MAX_LDM_STM_OPS];
+ int i, j, base_reg;
+ rtx base_reg_rtx;
+ HOST_WIDE_INT offset;
+ int write_back = FALSE;
+ int ldm_case;
+ rtx addr;
+
+ ldm_case = load_multiple_sequence (operands, nops, regs, mem_order,
+ &base_reg, &offset, !sort_regs);
+
+ if (ldm_case == 0)
+ return false;
+
+ if (sort_regs)
+ for (i = 0; i < nops - 1; i++)
+ for (j = i + 1; j < nops; j++)
+ if (regs[i] > regs[j])
+ {
+ int t = regs[i];
+ regs[i] = regs[j];
+ regs[j] = t;
+ }
+ base_reg_rtx = gen_rtx_REG (Pmode, base_reg);
+
+ if (TARGET_THUMB1)
+ {
+ gcc_assert (peep2_reg_dead_p (nops, base_reg_rtx));
+ gcc_assert (ldm_case == 1 || ldm_case == 5);
+ write_back = TRUE;
+ }
- if (write_back)
+ if (ldm_case == 5)
+ {
+ rtx newbase = TARGET_THUMB1 ? base_reg_rtx : gen_rtx_REG (SImode, regs[0]);
+ emit_insn (gen_addsi3 (newbase, base_reg_rtx, GEN_INT (offset)));
+ offset = 0;
+ if (!TARGET_THUMB1)
{
- emit_move_insn (to, plus_constant (to, count * 4 * sign));
- *offsetp = offset;
+ base_reg = regs[0];
+ base_reg_rtx = newbase;
}
+ }
- seq = get_insns ();
- end_sequence ();
+ for (i = 0; i < nops; i++)
+ {
+ addr = plus_constant (base_reg_rtx, offset + i * 4);
+ mems[i] = adjust_automodify_address_nv (operands[nops + mem_order[i]],
+ SImode, addr, 0);
+ }
+ emit_insn (arm_gen_load_multiple_1 (nops, regs, mems, base_reg_rtx,
+ write_back ? offset + i * 4 : 0));
+ return true;
+}
- return seq;
+/* Called from a peephole2 expander to turn a sequence of stores into an
+ STM instruction. OPERANDS are the operands found by the peephole matcher;
+ NOPS indicates how many separate stores we are trying to combine.
+ Returns true iff we could generate a new instruction. */
+
+bool
+gen_stm_seq (rtx *operands, int nops)
+{
+ int i;
+ int regs[MAX_LDM_STM_OPS], mem_order[MAX_LDM_STM_OPS];
+ rtx mems[MAX_LDM_STM_OPS];
+ int base_reg;
+ rtx base_reg_rtx;
+ HOST_WIDE_INT offset;
+ int write_back = FALSE;
+ int stm_case;
+ rtx addr;
+ bool base_reg_dies;
+
+ stm_case = store_multiple_sequence (operands, nops, nops, regs, NULL,
+ mem_order, &base_reg, &offset, true);
+
+ if (stm_case == 0)
+ return false;
+
+ base_reg_rtx = gen_rtx_REG (Pmode, base_reg);
+
+ base_reg_dies = peep2_reg_dead_p (nops, base_reg_rtx);
+ if (TARGET_THUMB1)
+ {
+ gcc_assert (base_reg_dies);
+ write_back = TRUE;
}
- result = gen_rtx_PARALLEL (VOIDmode,
- rtvec_alloc (count + (write_back ? 1 : 0)));
- if (write_back)
+ if (stm_case == 5)
{
- XVECEXP (result, 0, 0)
- = gen_rtx_SET (VOIDmode, to,
- plus_constant (to, count * 4 * sign));
- i = 1;
- count++;
+ gcc_assert (base_reg_dies);
+ emit_insn (gen_addsi3 (base_reg_rtx, base_reg_rtx, GEN_INT (offset)));
+ offset = 0;
}
- for (j = 0; i < count; i++, j++)
+ addr = plus_constant (base_reg_rtx, offset);
+
+ for (i = 0; i < nops; i++)
{
- addr = plus_constant (to, j * 4 * sign);
- mem = adjust_automodify_address_nv (basemem, SImode, addr, offset);
- XVECEXP (result, 0, i)
- = gen_rtx_SET (VOIDmode, mem, gen_rtx_REG (SImode, base_regno + j));
- offset += 4 * sign;
+ addr = plus_constant (base_reg_rtx, offset + i * 4);
+ mems[i] = adjust_automodify_address_nv (operands[nops + mem_order[i]],
+ SImode, addr, 0);
}
+ emit_insn (arm_gen_store_multiple_1 (nops, regs, mems, base_reg_rtx,
+ write_back ? offset + i * 4 : 0));
+ return true;
+}
- if (write_back)
- *offsetp = offset;
+/* Called from a peephole2 expander to turn a sequence of stores that are
+ preceded by constant loads into an STM instruction. OPERANDS are the
+ operands found by the peephole matcher; NOPS indicates how many
+ separate stores we are trying to combine; there are 2 * NOPS
+ instructions in the peephole.
+ Returns true iff we could generate a new instruction. */
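+/* For example (illustrative only; values and registers are arbitrary):
+   a matched sequence such as
+     mov  r1, #10
+     mov  r2, #20
+     str  r1, [r0]
+     str  r2, [r0, #4]
+   can be rewritten as
+     mov  r1, #10
+     mov  r2, #20
+     stm  r0, {r1, r2}  */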
- return result;
+bool
+gen_const_stm_seq (rtx *operands, int nops)
+{
+ int regs[MAX_LDM_STM_OPS], sorted_regs[MAX_LDM_STM_OPS];
+ int reg_order[MAX_LDM_STM_OPS], mem_order[MAX_LDM_STM_OPS];
+ rtx reg_rtxs[MAX_LDM_STM_OPS], orig_reg_rtxs[MAX_LDM_STM_OPS];
+ rtx mems[MAX_LDM_STM_OPS];
+ int base_reg;
+ rtx base_reg_rtx;
+ HOST_WIDE_INT offset;
+ int write_back = FALSE;
+ int stm_case;
+ rtx addr;
+ bool base_reg_dies;
+ int i, j;
+ HARD_REG_SET allocated;
+
+ stm_case = store_multiple_sequence (operands, nops, 2 * nops, regs, reg_rtxs,
+ mem_order, &base_reg, &offset, false);
+
+ if (stm_case == 0)
+ return false;
+
+ memcpy (orig_reg_rtxs, reg_rtxs, sizeof orig_reg_rtxs);
+
+ /* If the same register is used more than once, try to find a free
+ register. */
+ CLEAR_HARD_REG_SET (allocated);
+ for (i = 0; i < nops; i++)
+ {
+ for (j = i + 1; j < nops; j++)
+ if (regs[i] == regs[j])
+ {
+ rtx t = peep2_find_free_register (0, nops * 2,
+ TARGET_THUMB1 ? "l" : "r",
+ SImode, &allocated);
+ if (t == NULL_RTX)
+ return false;
+ reg_rtxs[i] = t;
+ regs[i] = REGNO (t);
+ }
+ }
+
+ /* Compute an ordering that maps the register numbers to an ascending
+ sequence. */
+ reg_order[0] = 0;
+ for (i = 0; i < nops; i++)
+ if (regs[i] < regs[reg_order[0]])
+ reg_order[0] = i;
+
+ for (i = 1; i < nops; i++)
+ {
+ int this_order = reg_order[i - 1];
+ for (j = 0; j < nops; j++)
+ if (regs[j] > regs[reg_order[i - 1]]
+ && (this_order == reg_order[i - 1]
+ || regs[j] < regs[this_order]))
+ this_order = j;
+ reg_order[i] = this_order;
+ }
+
+ /* Ensure that registers that must be live after the instruction end
+ up with the correct value. */
+ for (i = 0; i < nops; i++)
+ {
+ int this_order = reg_order[i];
+ if ((this_order != mem_order[i]
+ || orig_reg_rtxs[this_order] != reg_rtxs[this_order])
+ && !peep2_reg_dead_p (nops * 2, orig_reg_rtxs[this_order]))
+ return false;
+ }
+
+ /* Load the constants. */
+ for (i = 0; i < nops; i++)
+ {
+ rtx op = operands[2 * nops + mem_order[i]];
+ sorted_regs[i] = regs[reg_order[i]];
+ emit_move_insn (reg_rtxs[reg_order[i]], op);
+ }
+
+ base_reg_rtx = gen_rtx_REG (Pmode, base_reg);
+
+ base_reg_dies = peep2_reg_dead_p (nops * 2, base_reg_rtx);
+ if (TARGET_THUMB1)
+ {
+ gcc_assert (base_reg_dies);
+ write_back = TRUE;
+ }
+
+ if (stm_case == 5)
+ {
+ gcc_assert (base_reg_dies);
+ emit_insn (gen_addsi3 (base_reg_rtx, base_reg_rtx, GEN_INT (offset)));
+ offset = 0;
+ }
+
+ addr = plus_constant (base_reg_rtx, offset);
+
+ for (i = 0; i < nops; i++)
+ {
+ addr = plus_constant (base_reg_rtx, offset + i * 4);
+ mems[i] = adjust_automodify_address_nv (operands[nops + mem_order[i]],
+ SImode, addr, 0);
+ }
+ emit_insn (arm_gen_store_multiple_1 (nops, sorted_regs, mems, base_reg_rtx,
+ write_back ? offset + i * 4 : 0));
+ return true;
}
int
for (i = 0; in_words_to_go >= 2; i+=4)
{
if (in_words_to_go > 4)
- emit_insn (arm_gen_load_multiple (0, 4, src, TRUE, TRUE,
- srcbase, &srcoffset));
+ emit_insn (arm_gen_load_multiple (arm_regs_in_sequence, 4, src,
+ TRUE, srcbase, &srcoffset));
else
- emit_insn (arm_gen_load_multiple (0, in_words_to_go, src, TRUE,
- FALSE, srcbase, &srcoffset));
+ emit_insn (arm_gen_load_multiple (arm_regs_in_sequence, in_words_to_go,
+ src, FALSE, srcbase,
+ &srcoffset));
if (out_words_to_go)
{
if (out_words_to_go > 4)
- emit_insn (arm_gen_store_multiple (0, 4, dst, TRUE, TRUE,
- dstbase, &dstoffset));
+ emit_insn (arm_gen_store_multiple (arm_regs_in_sequence, 4, dst,
+ TRUE, dstbase, &dstoffset));
else if (out_words_to_go != 1)
- emit_insn (arm_gen_store_multiple (0, out_words_to_go,
- dst, TRUE,
+ emit_insn (arm_gen_store_multiple (arm_regs_in_sequence,
+ out_words_to_go, dst,
(last_bytes == 0
? FALSE : TRUE),
dstbase, &dstoffset));
&& (rtx_equal_p (XEXP (x, 0), y) || rtx_equal_p (XEXP (x, 1), y)))
return CC_Cmode;
+ if (GET_MODE (x) == DImode || GET_MODE (y) == DImode)
+ {
+ /* To keep things simple, always use the Cirrus cfcmp64 if it is
+ available. */
+ if (TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK)
+ return CCmode;
+
+ switch (op)
+ {
+ case EQ:
+ case NE:
+ /* A DImode comparison against zero can be implemented by
+ or'ing the two halves together. */
+ if (y == const0_rtx)
+ return CC_Zmode;
+
+ /* We can do an equality test in three Thumb instructions. */
+ if (!TARGET_ARM)
+ return CC_Zmode;
+
+ /* FALLTHROUGH */
+
+ case LTU:
+ case LEU:
+ case GTU:
+ case GEU:
+ /* DImode unsigned comparisons can be implemented by cmp +
+ cmpeq without a scratch register. Not worth doing in
+ Thumb-2. */
+ if (TARGET_ARM)
+ return CC_CZmode;
+
+ /* FALLTHROUGH */
+
+ case LT:
+ case LE:
+ case GT:
+ case GE:
+ /* DImode signed and unsigned comparisons can be implemented
+ by cmp + sbcs with a scratch register, but that does not
+ set the Z flag - we must reverse GT/LE/GTU/LEU. */
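+	   /* For example (illustrative only), a signed DImode "x < y"
+	      can be compiled as  cmp xlo, ylo; sbcs ip, xhi, yhi
+	      followed by a branch on LT.  */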
+ gcc_assert (op != EQ && op != NE);
+ return CC_NCVmode;
+
+ default:
+ gcc_unreachable ();
+ }
+ }
+
return CCmode;
}
rtx
arm_gen_compare_reg (enum rtx_code code, rtx x, rtx y)
{
- enum machine_mode mode = SELECT_CC_MODE (code, x, y);
- rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
+ enum machine_mode mode;
+ rtx cc_reg;
+ int dimode_comparison = GET_MODE (x) == DImode || GET_MODE (y) == DImode;
- emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
+ /* We might have X as a constant, Y as a register because of the predicates
+ used for cmpdi. If so, force X to a register here. */
+ if (dimode_comparison && !REG_P (x))
+ x = force_reg (DImode, x);
+
+ mode = SELECT_CC_MODE (code, x, y);
+ cc_reg = gen_rtx_REG (mode, CC_REGNUM);
+
+ if (dimode_comparison
+ && !(TARGET_HARD_FLOAT && TARGET_MAVERICK)
+ && mode != CC_CZmode)
+ {
+ rtx clobber, set;
+
+ /* To compare two non-zero values for equality, XOR them and
+ then compare against zero. Not used for ARM mode; there
+ CC_CZmode is cheaper. */
+ if (mode == CC_Zmode && y != const0_rtx)
+ {
+ x = expand_binop (DImode, xor_optab, x, y, NULL_RTX, 0, OPTAB_WIDEN);
+ y = const0_rtx;
+ }
+ /* A scratch register is required. */
+ clobber = gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (SImode));
+ set = gen_rtx_SET (VOIDmode, cc_reg, gen_rtx_COMPARE (mode, x, y));
+ emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clobber)));
+ }
+ else
+ emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
return cc_reg;
}
return false;
}
+/* Return true if it is possible to inline both the high and low parts
+ of a 64-bit constant into 32-bit data processing instructions. */
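+/* For instance (illustrative values only): 0x0000FF000000002A qualifies,
+   since both 0x0000FF00 and 0x0000002A are valid ARM immediates, whereas
+   0x0000000012345678 does not, since 0x12345678 cannot be encoded as an
+   8-bit value rotated by an even amount.  */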
+bool
+arm_const_double_by_immediates (rtx val)
+{
+ enum machine_mode mode = GET_MODE (val);
+ rtx part;
+
+ if (mode == VOIDmode)
+ mode = DImode;
+
+ part = gen_highpart_mode (SImode, mode, val);
+
+ gcc_assert (GET_CODE (part) == CONST_INT);
+
+ if (!const_ok_for_arm (INTVAL (part)))
+ return false;
+
+ part = gen_lowpart (SImode, val);
+
+ gcc_assert (GET_CODE (part) == CONST_INT);
+
+ if (!const_ok_for_arm (INTVAL (part)))
+ return false;
+
+ return true;
+}
+
/* Scan INSN and note any of its operands that need fixing.
If DO_PUSHES is false we do not actually push any of the fixups
needed. The function returns TRUE if any fixups were needed/pushed.
FOR_EACH_BB (bb)
{
rtx insn;
+
COPY_REG_SET (&live, DF_LR_OUT (bb));
df_simulate_initialize_backwards (bb, &live);
FOR_BB_INSNS_REVERSE (bb, insn)
rtx dst = XEXP (pat, 0);
rtx src = XEXP (pat, 1);
rtx op0 = XEXP (src, 0);
+ rtx op1 = (GET_RTX_CLASS (GET_CODE (src)) == RTX_COMM_ARITH
+ ? XEXP (src, 1) : NULL);
+
if (rtx_equal_p (dst, op0)
|| GET_CODE (src) == PLUS || GET_CODE (src) == MINUS)
{
rtx ccreg = gen_rtx_REG (CCmode, CC_REGNUM);
rtx clobber = gen_rtx_CLOBBER (VOIDmode, ccreg);
rtvec vec = gen_rtvec (2, pat, clobber);
+
+ PATTERN (insn) = gen_rtx_PARALLEL (VOIDmode, vec);
+ INSN_CODE (insn) = -1;
+ }
+ /* We can also handle a commutative operation where the
+ second operand matches the destination. */
+ else if (op1 && rtx_equal_p (dst, op1))
+ {
+ rtx ccreg = gen_rtx_REG (CCmode, CC_REGNUM);
+ rtx clobber = gen_rtx_CLOBBER (VOIDmode, ccreg);
+ rtvec vec;
+
+ src = copy_rtx (src);
+ XEXP (src, 0) = op1;
+ XEXP (src, 1) = op0;
+ pat = gen_rtx_SET (VOIDmode, dst, src);
+ vec = gen_rtvec (2, pat, clobber);
PATTERN (insn) = gen_rtx_PARALLEL (VOIDmode, vec);
INSN_CODE (insn) = -1;
}
}
}
+
if (NONDEBUG_INSN_P (insn))
df_simulate_one_insn_backwards (bb, insn, &live);
}
}
+
CLEAR_REG_SET (&live);
}
{
if (GET_CODE (XEXP (operands[0], 0)) == PRE_MODIFY)
{
- output_asm_insn ("ldr%?\t%0, [%1, %2]!", otherops);
- output_asm_insn ("ldr%?\t%H0, [%1, #4]", otherops);
+ output_asm_insn ("str%?\t%0, [%1, %2]!", otherops);
+ output_asm_insn ("str%?\t%H0, [%1, #4]", otherops);
}
else
{
- output_asm_insn ("ldr%?\t%H0, [%1, #4]", otherops);
- output_asm_insn ("ldr%?\t%0, [%1], %2", otherops);
+ output_asm_insn ("str%?\t%H0, [%1, #4]", otherops);
+ output_asm_insn ("str%?\t%0, [%1], %2", otherops);
}
}
else if (GET_CODE (XEXP (operands[0], 0)) == PRE_MODIFY)
return "";
}
+/* Compute and return the length of neon_mov<mode>, where <mode> is
+ one of VSTRUCT modes: EI, OI, CI or XI. */
+int
+arm_attr_length_move_neon (rtx insn)
+{
+ rtx reg, mem, addr;
+ int load;
+ enum machine_mode mode;
+
+ extract_insn_cached (insn);
+
+ if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
+ {
+ mode = GET_MODE (recog_data.operand[0]);
+ switch (mode)
+ {
+ case EImode:
+ case OImode:
+ return 8;
+ case CImode:
+ return 12;
+ case XImode:
+ return 16;
+ default:
+ gcc_unreachable ();
+ }
+ }
+
+ load = REG_P (recog_data.operand[0]);
+ reg = recog_data.operand[!load];
+ mem = recog_data.operand[load];
+
+ gcc_assert (MEM_P (mem));
+
+ mode = GET_MODE (reg);
+ addr = XEXP (mem, 0);
+
+ /* Strip off const from addresses like (const (plus (...))). */
+ if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS)
+ addr = XEXP (addr, 0);
+
+ if (GET_CODE (addr) == LABEL_REF || GET_CODE (addr) == PLUS)
+ {
+ int insns = HARD_REGNO_NREGS (REGNO (reg), mode) / 2;
+ return insns * 4;
+ }
+ else
+ return 4;
+}
+
+/* Return nonzero if the offset in the address is an immediate. Otherwise,
+ return zero. */
+
+int
+arm_address_offset_is_imm (rtx insn)
+{
+ rtx mem, addr;
+
+ extract_insn_cached (insn);
+
+ if (REG_P (recog_data.operand[0]))
+ return 0;
+
+ mem = recog_data.operand[0];
+
+ gcc_assert (MEM_P (mem));
+
+ addr = XEXP (mem, 0);
+
+ if (GET_CODE (addr) == REG
+ || (GET_CODE (addr) == PLUS
+ && GET_CODE (XEXP (addr, 0)) == REG
+ && GET_CODE (XEXP (addr, 1)) == CONST_INT))
+ return 1;
+ else
+ return 0;
+}
+
/* Output an ADD r, s, #n where n may be too big for one instruction.
If adding zero to one register, output nothing. */
const char *
&& !crtl->tail_call_emit)
{
unsigned long mask;
- mask = (1 << (arm_size_return_regs() / 4)) - 1;
+ /* Preserve return values, of any size. */
+ mask = (1 << ((arm_size_return_regs() + 3) / 4)) - 1;
mask ^= 0xf;
mask &= ~saved_regs_mask;
reg = 0;
generates better code on Thumb-2 by avoiding the need to
use 32-bit push/pop instructions. */
if (!crtl->tail_call_emit
- && arm_size_return_regs () <= 12)
+ && arm_size_return_regs () <= 12
+ && (offsets->saved_regs_mask & (1 << 3)) == 0)
{
reg = 3;
}
using the EABI unwinder, to prevent faulting instructions from being
swapped with a stack adjustment. */
if (crtl->profile || !TARGET_SCHED_PROLOG
- || (ARM_EABI_UNWIND_TABLES && cfun->can_throw_non_call_exceptions))
+ || (arm_except_unwind_info (&global_options) == UI_TARGET
+ && cfun->can_throw_non_call_exceptions))
emit_insn (gen_blockage ());
/* If the link register is being kept alive, with the return address in it,
the value being loaded is big-wordian or little-wordian. The
order of the two register loads can matter however, if the address
of the memory location is actually held in one of the registers
- being overwritten by the load. */
+ being overwritten by the load.
+
+ The 'Q' and 'R' constraints are also available for 64-bit
+ constants. */
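+    /* Illustrative example (values chosen arbitrarily): for the 64-bit
+       constant 0x0000000100000002, '%Q' prints "#2" (the low word) and
+       '%R' prints "#1" (the high word).  */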
case 'Q':
+ if (GET_CODE (x) == CONST_INT || GET_CODE (x) == CONST_DOUBLE)
+ {
+ rtx part = gen_lowpart (SImode, x);
+ fprintf (stream, "#" HOST_WIDE_INT_PRINT_DEC, INTVAL (part));
+ return;
+ }
+
if (GET_CODE (x) != REG || REGNO (x) > LAST_ARM_REGNUM)
{
output_operand_lossage ("invalid operand for code '%c'", code);
return;
case 'R':
+ if (GET_CODE (x) == CONST_INT || GET_CODE (x) == CONST_DOUBLE)
+ {
+ enum machine_mode mode = GET_MODE (x);
+ rtx part;
+
+ if (mode == VOIDmode)
+ mode = DImode;
+ part = gen_highpart_mode (SImode, mode, x);
+ fprintf (stream, "#" HOST_WIDE_INT_PRINT_DEC, INTVAL (part));
+ return;
+ }
+
if (GET_CODE (x) != REG || REGNO (x) > LAST_ARM_REGNUM)
{
output_operand_lossage ("invalid operand for code '%c'", code);
{
rtx addr;
bool postinc = FALSE;
+ unsigned align, modesize, align_bits;
+
gcc_assert (GET_CODE (x) == MEM);
addr = XEXP (x, 0);
if (GET_CODE (addr) == POST_INC)
postinc = 1;
addr = XEXP (addr, 0);
}
- asm_fprintf (stream, "[%r]", REGNO (addr));
+ asm_fprintf (stream, "[%r", REGNO (addr));
+
+ /* We know the alignment of this access, so we can emit a hint in the
+ instruction (for some alignments) as an aid to the memory subsystem
+ of the target. */
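+      /* As an illustration (the load mnemonic here is only an example,
+	 not something this code chooses): a 16-byte access known to be
+	 256-bit aligned has its address printed as "[r0:256]", giving
+	 e.g. "vld1.64 {d0, d1}, [r0:256]".  */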
+ align = MEM_ALIGN (x) >> 3;
+ modesize = GET_MODE_SIZE (GET_MODE (x));
+
+ /* Only certain alignment specifiers are supported by the hardware. */
+ if (modesize == 16 && (align % 32) == 0)
+ align_bits = 256;
+ else if ((modesize == 8 || modesize == 16) && (align % 16) == 0)
+ align_bits = 128;
+ else if ((align % 8) == 0)
+ align_bits = 64;
+ else
+ align_bits = 0;
+
+ if (align_bits != 0)
+ asm_fprintf (stream, ":%d", align_bits);
+
+ asm_fprintf (stream, "]");
+
if (postinc)
fputs("!", stream);
}
return;
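+    /* Print a memory reference whose address is a single register, as
+       "[rN]".  The ldrex/strex output helpers later in this file use
+       this via "%C".  */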
+ case 'C':
+ {
+ rtx addr;
+
+ gcc_assert (GET_CODE (x) == MEM);
+ addr = XEXP (x, 0);
+ gcc_assert (GET_CODE (addr) == REG);
+ asm_fprintf (stream, "[%r]", REGNO (addr));
+ }
+ return;
+
/* Translate an S register number into a D register number and element index. */
case 'y':
{
case CC_Cmode:
switch (comp_code)
- {
- case LTU: return ARM_CS;
- case GEU: return ARM_CC;
- default: gcc_unreachable ();
- }
+ {
+ case LTU: return ARM_CS;
+ case GEU: return ARM_CC;
+ default: gcc_unreachable ();
+ }
+
+ case CC_CZmode:
+ switch (comp_code)
+ {
+ case NE: return ARM_NE;
+ case EQ: return ARM_EQ;
+ case GEU: return ARM_CS;
+ case GTU: return ARM_HI;
+ case LEU: return ARM_LS;
+ case LTU: return ARM_CC;
+ default: gcc_unreachable ();
+ }
+
+ case CC_NCVmode:
+ switch (comp_code)
+ {
+ case GE: return ARM_GE;
+ case LT: return ARM_LT;
+ case GEU: return ARM_CS;
+ case LTU: return ARM_CC;
+ default: gcc_unreachable ();
+ }
case CCmode:
switch (comp_code)
static enum insn_code
locate_neon_builtin_icode (int fcode, neon_itype *itype)
{
- neon_builtin_datum key, *found;
+ neon_builtin_datum key
+ = { NULL, (neon_itype) 0, 0, { CODE_FOR_nothing }, 0, 0 };
+ neon_builtin_datum *found;
int idx;
key.base_fcode = fcode;
return;
}
- if (ARM_EABI_UNWIND_TABLES && push)
+ if (push && arm_except_unwind_info (&global_options) == UI_TARGET)
{
fprintf (f, "\t.save\t{");
for (regno = 0; regno < 15; regno++)
/* Return to caller. */
asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr);
}
-
\f
+/* Scan INSN just before assembler is output for it.
+ For Thumb-1, we track the status of the condition codes; this
+ information is used in the cbranchsi4_insn pattern. */
void
thumb1_final_prescan_insn (rtx insn)
{
if (flag_print_asm_name)
asm_fprintf (asm_out_file, "%@ 0x%04x\n",
INSN_ADDRESSES (INSN_UID (insn)));
+ /* Don't overwrite the previous setter when we get to a cbranch. */
+ if (INSN_CODE (insn) != CODE_FOR_cbranchsi4_insn)
+ {
+ enum attr_conds conds;
+
+ if (cfun->machine->thumb1_cc_insn)
+ {
+ if (modified_in_p (cfun->machine->thumb1_cc_op0, insn)
+ || modified_in_p (cfun->machine->thumb1_cc_op1, insn))
+ CC_STATUS_INIT;
+ }
+ conds = get_attr_conds (insn);
+ if (conds == CONDS_SET)
+ {
+ rtx set = single_set (insn);
+ cfun->machine->thumb1_cc_insn = insn;
+ cfun->machine->thumb1_cc_op0 = SET_DEST (set);
+ cfun->machine->thumb1_cc_op1 = const0_rtx;
+ cfun->machine->thumb1_cc_mode = CC_NOOVmode;
+ if (INSN_CODE (insn) == CODE_FOR_thumb1_subsi3_insn)
+ {
+ rtx src1 = XEXP (SET_SRC (set), 1);
+ if (src1 == const0_rtx)
+ cfun->machine->thumb1_cc_mode = CCmode;
+ }
+ }
+ else if (conds != CONDS_NOCOND)
+ cfun->machine->thumb1_cc_insn = NULL_RTX;
+ }
}
int
#endif
}
+/* Given the stack offsets and register mask in OFFSETS, decide how
+ many additional registers to push instead of subtracting a constant
+ from SP. For epilogues the principle is the same except we use pop.
+ FOR_PROLOGUE indicates which we're generating. */
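+/* For example (illustrative; registers and sizes are arbitrary): when
+   optimizing for size, instead of
+     push  {r4, lr}
+     sub   sp, sp, #8
+   the prologue can emit
+     push  {r0, r1, r4, lr}
+   where the two extra low registers are pushed only to move the stack
+   pointer, saving the separate adjustment instruction.  */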
+static int
+thumb1_extra_regs_pushed (arm_stack_offsets *offsets, bool for_prologue)
+{
+ HOST_WIDE_INT amount;
+ unsigned long live_regs_mask = offsets->saved_regs_mask;
+ /* Extract a mask of the ones we can give to the Thumb's push/pop
+ instruction. */
+ unsigned long l_mask = live_regs_mask & (for_prologue ? 0x40ff : 0xff);
+ /* Then count how many other high registers will need to be pushed. */
+ unsigned long high_regs_pushed = bit_count (live_regs_mask & 0x0f00);
+ int n_free, reg_base;
+
+ if (!for_prologue && frame_pointer_needed)
+ amount = offsets->locals_base - offsets->saved_regs;
+ else
+ amount = offsets->outgoing_args - offsets->saved_regs;
+
+ /* If the stack frame size is 512 exactly, we can save one load
+ instruction, which should make this a win even when optimizing
+ for speed. */
+ if (!optimize_size && amount != 512)
+ return 0;
+
+ /* Can't do this if there are high registers to push. */
+ if (high_regs_pushed != 0)
+ return 0;
+
+ /* Shouldn't do it in the prologue if no registers would normally
+ be pushed at all. In the epilogue, also allow it if we'll have
+ a pop insn for the PC. */
+ if (l_mask == 0
+ && (for_prologue
+ || TARGET_BACKTRACE
+ || (live_regs_mask & 1 << LR_REGNUM) == 0
+ || TARGET_INTERWORK
+ || crtl->args.pretend_args_size != 0))
+ return 0;
+
+  /* Don't do this if thumb1_expand_prologue wants to emit instructions
+ between the push and the stack frame allocation. */
+ if (for_prologue
+ && ((flag_pic && arm_pic_register != INVALID_REGNUM)
+ || (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0)))
+ return 0;
+
+ reg_base = 0;
+ n_free = 0;
+ if (!for_prologue)
+ {
+ reg_base = arm_size_return_regs () / UNITS_PER_WORD;
+ live_regs_mask >>= reg_base;
+ }
+
+ while (reg_base + n_free < 8 && !(live_regs_mask & 1)
+ && (for_prologue || call_used_regs[reg_base + n_free]))
+ {
+ live_regs_mask >>= 1;
+ n_free++;
+ }
+
+ if (n_free == 0)
+ return 0;
+ gcc_assert (amount / 4 * 4 == amount);
+
+ if (amount >= 512 && (amount - n_free * 4) < 512)
+ return (amount - 508) / 4;
+ if (amount <= n_free * 4)
+ return amount / 4;
+ return 0;
+}
+
/* The bits which aren't usefully expanded as rtl. */
const char *
thumb_unexpanded_epilogue (void)
int regno;
unsigned long live_regs_mask = 0;
int high_regs_pushed = 0;
+ int extra_pop;
int had_to_push_lr;
int size;
the register is used to hold a return value. */
size = arm_size_return_regs ();
+ extra_pop = thumb1_extra_regs_pushed (offsets, false);
+ if (extra_pop > 0)
+ {
+ unsigned long extra_mask = (1 << extra_pop) - 1;
+ live_regs_mask |= extra_mask << (size / UNITS_PER_WORD);
+ }
+
/* The prolog may have pushed some high registers to use as
work registers. e.g. the testsuite file:
gcc/testsuite/gcc/gcc.c-torture/execute/complex-2.c
live_regs_mask);
/* We have either just popped the return address into the
- PC or it is was kept in LR for the entire function. */
+     PC or it was kept in LR for the entire function.
+ Note that thumb_pushpop has already called thumb_exit if the
+ PC was in the list. */
if (!had_to_push_lr)
thumb_exit (asm_out_file, LR_REGNUM);
}
}
}
-/* Given the stack offsets and register mask in OFFSETS, decide
- how many additional registers to push instead of subtracting
- a constant from SP. */
-static int
-thumb1_extra_regs_pushed (arm_stack_offsets *offsets)
-{
- HOST_WIDE_INT amount = offsets->outgoing_args - offsets->saved_regs;
- unsigned long live_regs_mask = offsets->saved_regs_mask;
- /* Extract a mask of the ones we can give to the Thumb's push instruction. */
- unsigned long l_mask = live_regs_mask & 0x40ff;
- /* Then count how many other high registers will need to be pushed. */
- unsigned long high_regs_pushed = bit_count (live_regs_mask & 0x0f00);
- int n_free;
-
- /* If the stack frame size is 512 exactly, we can save one load
- instruction, which should make this a win even when optimizing
- for speed. */
- if (!optimize_size && amount != 512)
- return 0;
-
- /* Can't do this if there are high registers to push, or if we
- are not going to do a push at all. */
- if (high_regs_pushed != 0 || l_mask == 0)
- return 0;
-
- /* Don't do this if thumb1_expand_prologue wants to emit instructions
- between the push and the stack frame allocation. */
- if ((flag_pic && arm_pic_register != INVALID_REGNUM)
- || (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0))
- return 0;
-
- for (n_free = 0; n_free < 8 && !(live_regs_mask & 1); live_regs_mask >>= 1)
- n_free++;
-
- if (n_free == 0)
- return 0;
- gcc_assert (amount / 4 * 4 == amount);
-
- if (amount >= 512 && (amount - n_free * 4) < 512)
- return (amount - 508) / 4;
- if (amount <= n_free * 4)
- return amount / 4;
- return 0;
-}
-
/* Generate the rest of a function's prologue. */
void
thumb1_expand_prologue (void)
stack_pointer_rtx);
amount = offsets->outgoing_args - offsets->saved_regs;
- amount -= 4 * thumb1_extra_regs_pushed (offsets);
+ amount -= 4 * thumb1_extra_regs_pushed (offsets, true);
if (amount)
{
if (amount < 512)
using the EABI unwinder, to prevent faulting instructions from being
swapped with a stack adjustment. */
if (crtl->profile || !TARGET_SCHED_PROLOG
- || (ARM_EABI_UNWIND_TABLES && cfun->can_throw_non_call_exceptions))
+ || (arm_except_unwind_info (&global_options) == UI_TARGET
+ && cfun->can_throw_non_call_exceptions))
emit_insn (gen_blockage ());
cfun->machine->lr_save_eliminated = !thumb_force_lr_save ();
emit_insn (gen_movsi (stack_pointer_rtx, hard_frame_pointer_rtx));
amount = offsets->locals_base - offsets->saved_regs;
}
+ amount -= 4 * thumb1_extra_regs_pushed (offsets, false);
gcc_assert (amount >= 0);
if (amount)
if (crtl->args.pretend_args_size)
{
/* Output unwind directive for the stack adjustment. */
- if (ARM_EABI_UNWIND_TABLES)
+ if (arm_except_unwind_info (&global_options) == UI_TARGET)
fprintf (f, "\t.pad #%d\n",
crtl->args.pretend_args_size);
work_register = thumb_find_work_register (live_regs_mask);
- if (ARM_EABI_UNWIND_TABLES)
+ if (arm_except_unwind_info (&global_options) == UI_TARGET)
asm_fprintf (f, "\t.pad #16\n");
asm_fprintf
|| (high_regs_pushed == 0 && l_mask))
{
unsigned long mask = l_mask;
- mask |= (1 << thumb1_extra_regs_pushed (offsets)) - 1;
+ mask |= (1 << thumb1_extra_regs_pushed (offsets, true)) - 1;
thumb_pushpop (f, mask, 1, &cfa_offset, mask);
}
return !reg_overlap_mentioned_p (value, addr);
}
+/* Return nonzero if the CONSUMER instruction (a store) does need
+ PRODUCER's value to calculate the address. */
+
+int
+arm_early_store_addr_dep (rtx producer, rtx consumer)
+{
+ return !arm_no_early_store_addr_dep (producer, consumer);
+}
+
+/* Return nonzero if the CONSUMER instruction (a load) does need
+ PRODUCER's value to calculate the address. */
+
+int
+arm_early_load_addr_dep (rtx producer, rtx consumer)
+{
+ rtx value = PATTERN (producer);
+ rtx addr = PATTERN (consumer);
+
+ if (GET_CODE (value) == COND_EXEC)
+ value = COND_EXEC_CODE (value);
+ if (GET_CODE (value) == PARALLEL)
+ value = XVECEXP (value, 0, 0);
+ value = XEXP (value, 0);
+ if (GET_CODE (addr) == COND_EXEC)
+ addr = COND_EXEC_CODE (addr);
+ if (GET_CODE (addr) == PARALLEL)
+ addr = XVECEXP (addr, 0, 0);
+ addr = XEXP (addr, 1);
+
+ return reg_overlap_mentioned_p (value, addr);
+}
+
/* Return nonzero if the CONSUMER instruction (an ALU op) does not
have an early register shift value or amount dependency on the
result of PRODUCER. */
else
addr = plus_constant (addr, delta);
- emit_move_insn (gen_frame_mem (Pmode, addr), source);
- }
- else
- emit_move_insn (gen_rtx_REG (Pmode, LR_REGNUM), source);
+ emit_move_insn (gen_frame_mem (Pmode, addr), source);
+ }
+ else
+ emit_move_insn (gen_rtx_REG (Pmode, LR_REGNUM), source);
+}
+
+/* Implements target hook vector_mode_supported_p. */
+bool
+arm_vector_mode_supported_p (enum machine_mode mode)
+{
+ /* Neon also supports V2SImode, etc. listed in the clause below. */
+ if (TARGET_NEON && (mode == V2SFmode || mode == V4SImode || mode == V8HImode
+ || mode == V16QImode || mode == V4SFmode || mode == V2DImode))
+ return true;
+
+ if ((TARGET_NEON || TARGET_IWMMXT)
+ && ((mode == V2SImode)
+ || (mode == V4HImode)
+ || (mode == V8QImode)))
+ return true;
+
+ return false;
+}
+
+/* Use the option -mvectorize-with-neon-quad to override the use of doubleword
+ registers when autovectorizing for Neon, at least until multiple vector
+ widths are supported properly by the middle-end. */
+
+static enum machine_mode
+arm_preferred_simd_mode (enum machine_mode mode)
+{
+ if (TARGET_NEON)
+ switch (mode)
+ {
+ case SFmode:
+ return TARGET_NEON_VECTORIZE_QUAD ? V4SFmode : V2SFmode;
+ case SImode:
+ return TARGET_NEON_VECTORIZE_QUAD ? V4SImode : V2SImode;
+ case HImode:
+ return TARGET_NEON_VECTORIZE_QUAD ? V8HImode : V4HImode;
+ case QImode:
+ return TARGET_NEON_VECTORIZE_QUAD ? V16QImode : V8QImode;
+ case DImode:
+ if (TARGET_NEON_VECTORIZE_QUAD)
+ return V2DImode;
+ break;
+
+ default:;
+ }
+
+ if (TARGET_REALLY_IWMMXT)
+ switch (mode)
+ {
+ case SImode:
+ return V2SImode;
+ case HImode:
+ return V4HImode;
+ case QImode:
+ return V8QImode;
+
+ default:;
+ }
+
+ return word_mode;
}
-/* Implements target hook vector_mode_supported_p. */
-bool
-arm_vector_mode_supported_p (enum machine_mode mode)
-{
- /* Neon also supports V2SImode, etc. listed in the clause below. */
- if (TARGET_NEON && (mode == V2SFmode || mode == V4SImode || mode == V8HImode
- || mode == V16QImode || mode == V4SFmode || mode == V2DImode))
- return true;
+/* Implement TARGET_CLASS_LIKELY_SPILLED_P.
+
+   We need to define this for LO_REGS on Thumb.  Otherwise we can end up
+   using r0-r4 for function arguments, r7 for the stack frame and not
+   have enough left over to do doubleword arithmetic.  */
- if ((TARGET_NEON || TARGET_IWMMXT)
- && ((mode == V2SImode)
- || (mode == V4HImode)
- || (mode == V8QImode)))
+static bool
+arm_class_likely_spilled_p (reg_class_t rclass)
+{
+ if ((TARGET_THUMB && rclass == LO_REGS)
+ || rclass == CC_REG)
return true;
return false;
return p;
}
-#ifdef TARGET_UNWIND_INFO
+#if ARM_UNWIND_INFO
/* Emit unwind directives for a store-multiple instruction or stack pointer
push during alignment.
These should only ever be generated by the function prologue code, so
{
rtx pat;
- if (!ARM_EABI_UNWIND_TABLES)
+ if (arm_except_unwind_info (&global_options) != UI_TARGET)
return;
if (!(flag_unwind_tables || crtl->uses_eh_lsda)
return TRUE;
}
-#endif /* TARGET_UNWIND_INFO */
+
+/* Implement TARGET_ASM_EMIT_EXCEPT_PERSONALITY. */
+
+static void
+arm_asm_emit_except_personality (rtx personality)
+{
+ fputs ("\t.personality\t", asm_out_file);
+ output_addr_const (asm_out_file, personality);
+ fputc ('\n', asm_out_file);
+}
+
+/* Implement TARGET_ASM_INITIALIZE_SECTIONS. */
+
+static void
+arm_asm_init_sections (void)
+{
+ exception_section = get_unnamed_section (0, output_section_asm_op,
+ "\t.handlerdata");
+}
+#endif /* ARM_UNWIND_INFO */
+
+/* Implement TARGET_EXCEPT_UNWIND_INFO. */
+
+static enum unwind_info_type
+arm_except_unwind_info (struct gcc_options *opts)
+{
+ /* Honor the --enable-sjlj-exceptions configure switch. */
+#ifdef CONFIG_SJLJ_EXCEPTIONS
+ if (CONFIG_SJLJ_EXCEPTIONS)
+ return UI_SJLJ;
+#endif
+
+ /* If not using ARM EABI unwind tables... */
+ if (ARM_UNWIND_INFO)
+ {
+ /* For simplicity elsewhere in this file, indicate that all unwind
+ info is disabled if we're not emitting unwind tables. */
+ if (!opts->x_flag_exceptions && !opts->x_flag_unwind_tables)
+ return UI_NONE;
+ else
+ return UI_TARGET;
+ }
+
+ /* ... we use sjlj exceptions for backwards compatibility. */
+ return UI_SJLJ;
+}
/* Handle UNSPEC DWARF call frame instructions. These are needed for dynamic
void
arm_output_fn_unwind (FILE * f, bool prologue)
{
- if (!ARM_EABI_UNWIND_TABLES)
+ if (arm_except_unwind_info (&global_options) != UI_TARGET)
return;
if (prologue)
fputs ("(tlsldo)", file);
}
-bool
+/* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
+
+static bool
arm_output_addr_const_extra (FILE *fp, rtx x)
{
if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
{
case cortexr4:
case cortexr4f:
+ case cortexa5:
case cortexa8:
case cortexa9:
return 2;
sizeof (thumb_core_reg_alloc_order));
}
-/* Set default optimization options. */
-void
-arm_optimization_options (int level, int size ATTRIBUTE_UNUSED)
-{
- /* Enable section anchors by default at -O1 or higher.
- Use 2 to distinguish from an explicit -fsection-anchors
- given on the command line. */
- if (level > 0)
- flag_section_anchors = 2;
-}
-
/* Implement TARGET_FRAME_POINTER_REQUIRED. */
bool
return !TARGET_THUMB1;
}
+/* Legitimize a memory reference for a sync primitive implemented using
+ ldrex / strex. We currently force the form of the reference to be
+ indirect without offset. We do not yet support the indirect offset
+ addressing supported by some ARM targets for these
+ instructions. */
+static rtx
+arm_legitimize_sync_memory (rtx memory)
+{
+ rtx addr = force_reg (Pmode, XEXP (memory, 0));
+ rtx legitimate_memory = gen_rtx_MEM (GET_MODE (memory), addr);
+
+ set_mem_alias_set (legitimate_memory, ALIAS_SET_MEMORY_BARRIER);
+ MEM_VOLATILE_P (legitimate_memory) = MEM_VOLATILE_P (memory);
+ return legitimate_memory;
+}
+
+/* An instruction emitter. */
+typedef void (* emit_f) (int label, const char *, rtx *);
+
+/* An instruction emitter that emits via the conventional
+ output_asm_insn. */
+static void
+arm_emit (int label ATTRIBUTE_UNUSED, const char *pattern, rtx *operands)
+{
+ output_asm_insn (pattern, operands);
+}
+
+/* Count the number of emitted synchronization instructions. */
+static unsigned arm_insn_count;
+
+/* An emitter that counts emitted instructions but does not actually
+   emit instructions into the instruction stream.  */
+static void
+arm_count (int label,
+ const char *pattern ATTRIBUTE_UNUSED,
+ rtx *operands ATTRIBUTE_UNUSED)
+{
+ if (! label)
+ ++ arm_insn_count;
+}
+
+/* Construct a pattern using conventional output formatting and feed
+ it to output_asm_insn. Provides a mechanism to construct the
+ output pattern on the fly. Note the hard limit on the pattern
+ buffer size. */
+static void ATTRIBUTE_PRINTF_4
+arm_output_asm_insn (emit_f emit, int label, rtx *operands,
+ const char *pattern, ...)
+{
+ va_list ap;
+ char buffer[256];
+
+ va_start (ap, pattern);
+ vsprintf (buffer, pattern, ap);
+ va_end (ap);
+ emit (label, buffer, operands);
+}
+
+/* Emit the memory barrier instruction, if any, provided by this
+ target to a specified emitter. */
+static void
+arm_process_output_memory_barrier (emit_f emit, rtx *operands)
+{
+ if (TARGET_HAVE_DMB)
+ {
+ /* Note we issue a system level barrier. We should consider
+	 issuing an inner shareability zone barrier here instead, i.e.
+ "DMB ISH". */
+ emit (0, "dmb\tsy", operands);
+ return;
+ }
+
+ if (TARGET_HAVE_DMB_MCR)
+ {
+ emit (0, "mcr\tp15, 0, r0, c7, c10, 5", operands);
+ return;
+ }
+
+ gcc_unreachable ();
+}
+
+/* Emit the memory barrier instruction, if any, provided by this
+ target. */
+const char *
+arm_output_memory_barrier (rtx *operands)
+{
+ arm_process_output_memory_barrier (arm_emit, operands);
+ return "";
+}
+
+/* Helper to figure out the instruction suffix required on ldrex/strex
+ for operations on an object of the specified mode. */
+static const char *
+arm_ldrex_suffix (enum machine_mode mode)
+{
+ switch (mode)
+ {
+ case QImode: return "b";
+ case HImode: return "h";
+ case SImode: return "";
+ case DImode: return "d";
+ default:
+ gcc_unreachable ();
+ }
+ return "";
+}
+
+/* Emit an ldrex{b,h,d, } instruction appropriate for the specified
+ mode. */
+static void
+arm_output_ldrex (emit_f emit,
+ enum machine_mode mode,
+ rtx target,
+ rtx memory)
+{
+ const char *suffix = arm_ldrex_suffix (mode);
+ rtx operands[2];
+
+ operands[0] = target;
+ operands[1] = memory;
+ arm_output_asm_insn (emit, 0, operands, "ldrex%s\t%%0, %%C1", suffix);
+}
+
+/* Emit a strex{b,h,d, } instruction appropriate for the specified
+ mode. */
+static void
+arm_output_strex (emit_f emit,
+ enum machine_mode mode,
+ const char *cc,
+ rtx result,
+ rtx value,
+ rtx memory)
+{
+ const char *suffix = arm_ldrex_suffix (mode);
+ rtx operands[3];
+
+ operands[0] = result;
+ operands[1] = value;
+ operands[2] = memory;
+ arm_output_asm_insn (emit, 0, operands, "strex%s%s\t%%0, %%1, %%C2", suffix,
+ cc);
+}
+
+/* Helper to emit a two operand instruction. */
+static void
+arm_output_op2 (emit_f emit, const char *mnemonic, rtx d, rtx s)
+{
+ rtx operands[2];
+
+ operands[0] = d;
+ operands[1] = s;
+ arm_output_asm_insn (emit, 0, operands, "%s\t%%0, %%1", mnemonic);
+}
+
+/* Helper to emit a three operand instruction. */
+static void
+arm_output_op3 (emit_f emit, const char *mnemonic, rtx d, rtx a, rtx b)
+{
+ rtx operands[3];
+
+ operands[0] = d;
+ operands[1] = a;
+ operands[2] = b;
+ arm_output_asm_insn (emit, 0, operands, "%s\t%%0, %%1, %%2", mnemonic);
+}
+
+/* Emit a load store exclusive synchronization loop.
+
+ do
+ old_value = [mem]
+ if old_value != required_value
+ break;
+ t1 = sync_op (old_value, new_value)
+ [mem] = t1, t2 = [0|1]
+ while ! t2
+
+ Note:
+ t1 == t2 is not permitted
+ t1 == old_value is permitted
+
+ required_value:
+
+   RTX register or const_int representing the required old_value for
+   the modify to continue; if NULL, no comparison is performed.  */
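+/* As an illustration (register numbers arbitrary; the local labels are
+   simplified here), an SImode atomic add with no required_value comes
+   out roughly as:
+
+	dmb	sy
+     1:	ldrex	r0, [r1]
+	add	r2, r0, r3
+	strex	r4, r2, [r1]
+	teq	r4, #0
+	bne	1b
+	dmb	sy  */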
+static void
+arm_output_sync_loop (emit_f emit,
+ enum machine_mode mode,
+ rtx old_value,
+ rtx memory,
+ rtx required_value,
+ rtx new_value,
+ rtx t1,
+ rtx t2,
+ enum attr_sync_op sync_op,
+ int early_barrier_required)
+{
+ rtx operands[1];
+
+ gcc_assert (t1 != t2);
+
+ if (early_barrier_required)
+ arm_process_output_memory_barrier (emit, NULL);
+
+ arm_output_asm_insn (emit, 1, operands, "%sLSYT%%=:", LOCAL_LABEL_PREFIX);
+
+ arm_output_ldrex (emit, mode, old_value, memory);
+
+ if (required_value)
+ {
+ rtx operands[2];
+
+ operands[0] = old_value;
+ operands[1] = required_value;
+ arm_output_asm_insn (emit, 0, operands, "cmp\t%%0, %%1");
+ arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYB%%=", LOCAL_LABEL_PREFIX);
+ }
+
+ switch (sync_op)
+ {
+ case SYNC_OP_ADD:
+ arm_output_op3 (emit, "add", t1, old_value, new_value);
+ break;
+
+ case SYNC_OP_SUB:
+ arm_output_op3 (emit, "sub", t1, old_value, new_value);
+ break;
+
+ case SYNC_OP_IOR:
+ arm_output_op3 (emit, "orr", t1, old_value, new_value);
+ break;
+
+ case SYNC_OP_XOR:
+ arm_output_op3 (emit, "eor", t1, old_value, new_value);
+ break;
+
+ case SYNC_OP_AND:
+ arm_output_op3 (emit,"and", t1, old_value, new_value);
+ break;
+
+ case SYNC_OP_NAND:
+ arm_output_op3 (emit, "and", t1, old_value, new_value);
+ arm_output_op2 (emit, "mvn", t1, t1);
+ break;
+
+ case SYNC_OP_NONE:
+ t1 = new_value;
+ break;
+ }
+
+ arm_output_strex (emit, mode, "", t2, t1, memory);
+ operands[0] = t2;
+ arm_output_asm_insn (emit, 0, operands, "teq\t%%0, #0");
+ arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYT%%=", LOCAL_LABEL_PREFIX);
+
+ arm_process_output_memory_barrier (emit, NULL);
+ arm_output_asm_insn (emit, 1, operands, "%sLSYB%%=:", LOCAL_LABEL_PREFIX);
+}
+
+static rtx
+arm_get_sync_operand (rtx *operands, int index, rtx default_value)
+{
+ if (index > 0)
+ default_value = operands[index - 1];
+
+ return default_value;
+}
+
+#define FETCH_SYNC_OPERAND(NAME, DEFAULT) \
+ arm_get_sync_operand (operands, (int) get_attr_sync_##NAME (insn), DEFAULT);
+
+/* Extract the operands for a synchronization instruction from the
+   instruction's attributes and emit the instruction.  */
+static void
+arm_process_output_sync_insn (emit_f emit, rtx insn, rtx *operands)
+{
+ rtx result, memory, required_value, new_value, t1, t2;
+ int early_barrier;
+ enum machine_mode mode;
+ enum attr_sync_op sync_op;
+
+ result = FETCH_SYNC_OPERAND(result, 0);
+ memory = FETCH_SYNC_OPERAND(memory, 0);
+ required_value = FETCH_SYNC_OPERAND(required_value, 0);
+ new_value = FETCH_SYNC_OPERAND(new_value, 0);
+ t1 = FETCH_SYNC_OPERAND(t1, 0);
+ t2 = FETCH_SYNC_OPERAND(t2, 0);
+ early_barrier =
+ get_attr_sync_release_barrier (insn) == SYNC_RELEASE_BARRIER_YES;
+ sync_op = get_attr_sync_op (insn);
+ mode = GET_MODE (memory);
+
+ arm_output_sync_loop (emit, mode, result, memory, required_value,
+ new_value, t1, t2, sync_op, early_barrier);
+}
+
+/* Emit a synchronization instruction loop. */
+const char *
+arm_output_sync_insn (rtx insn, rtx *operands)
+{
+ arm_process_output_sync_insn (arm_emit, insn, operands);
+ return "";
+}
+
+/* Count the number of machine instructions that will be emitted for a
+   synchronization instruction.  Note that the emitter used does not
+   emit instructions; it just counts them, being careful not to count
+   labels.  */
+unsigned int
+arm_sync_loop_insns (rtx insn, rtx *operands)
+{
+ arm_insn_count = 0;
+ arm_process_output_sync_insn (arm_count, insn, operands);
+ return arm_insn_count;
+}
+
+/* Helper to call a target sync instruction generator, dealing with
+ the variation in operands required by the different generators. */
+static rtx
+arm_call_generator (struct arm_sync_generator *generator, rtx old_value,
+ rtx memory, rtx required_value, rtx new_value)
+{
+ switch (generator->op)
+ {
+ case arm_sync_generator_omn:
+ gcc_assert (! required_value);
+ return generator->u.omn (old_value, memory, new_value);
+
+ case arm_sync_generator_omrn:
+ gcc_assert (required_value);
+ return generator->u.omrn (old_value, memory, required_value, new_value);
+ }
+
+ return NULL;
+}
+
+/* Expand a synchronization loop. The synchronization loop is expanded
+ as an opaque block of instructions in order to ensure that we do
+ not subsequently get extraneous memory accesses inserted within the
+ critical region. The exclusive access property of ldrex/strex is
+   only guaranteed if there are no intervening memory accesses.  */
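+/* Sketch of a caller (illustrative; the gen_* pattern name below is
+   hypothetical and not part of this change):
+
+     struct arm_sync_generator generator;
+     generator.op = arm_sync_generator_omn;
+     generator.u.omn = gen_some_sync_new_op_pattern;
+     arm_expand_sync (SImode, &generator, operands[0], operands[1],
+		      NULL_RTX, operands[2]);  */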
+void
+arm_expand_sync (enum machine_mode mode,
+ struct arm_sync_generator *generator,
+ rtx target, rtx memory, rtx required_value, rtx new_value)
+{
+ if (target == NULL)
+ target = gen_reg_rtx (mode);
+
+ memory = arm_legitimize_sync_memory (memory);
+ if (mode != SImode)
+ {
+ rtx load_temp = gen_reg_rtx (SImode);
+
+ if (required_value)
+ required_value = convert_modes (SImode, mode, required_value, true);
+
+ new_value = convert_modes (SImode, mode, new_value, true);
+ emit_insn (arm_call_generator (generator, load_temp, memory,
+ required_value, new_value));
+ emit_move_insn (target, gen_lowpart (mode, load_temp));
+ }
+ else
+ {
+ emit_insn (arm_call_generator (generator, target, memory, required_value,
+ new_value));
+ }
+}
+
+static bool
+arm_vector_alignment_reachable (const_tree type, bool is_packed)
+{
+ /* Vectors which aren't in packed structures will not be less aligned than
+ the natural alignment of their element type, so this is safe. */
+ if (TARGET_NEON && !BYTES_BIG_ENDIAN)
+ return !is_packed;
+
+ return default_builtin_vector_alignment_reachable (type, is_packed);
+}
+
+static bool
+arm_builtin_support_vector_misalignment (enum machine_mode mode,
+ const_tree type, int misalignment,
+ bool is_packed)
+{
+ if (TARGET_NEON && !BYTES_BIG_ENDIAN)
+ {
+ HOST_WIDE_INT align = TYPE_ALIGN_UNIT (type);
+
+ if (is_packed)
+ return align == 1;
+
+ /* If the misalignment is unknown, we should be able to handle the access
+ so long as it is not to a member of a packed data structure. */
+ if (misalignment == -1)
+ return true;
+
+ /* Return true if the misalignment is a multiple of the natural alignment
+ of the vector's element type. This is probably always going to be
+ true in practice, since we've already established that this isn't a
+ packed access. */
+ return ((misalignment % align) == 0);
+ }
+
+ return default_builtin_support_vector_misalignment (mode, type, misalignment,
+ is_packed);
+}
+
+static void
+arm_conditional_register_usage (void)
+{
+ int regno;
+
+ if (TARGET_SOFT_FLOAT || TARGET_THUMB1 || !TARGET_FPA)
+ {
+ for (regno = FIRST_FPA_REGNUM;
+ regno <= LAST_FPA_REGNUM; ++regno)
+ fixed_regs[regno] = call_used_regs[regno] = 1;
+ }
+
+ if (TARGET_THUMB1 && optimize_size)
+ {
+ /* When optimizing for size on Thumb-1, it's better not
+ to use the HI regs, because of the overhead of
+ stacking them. */
+ for (regno = FIRST_HI_REGNUM;
+ regno <= LAST_HI_REGNUM; ++regno)
+ fixed_regs[regno] = call_used_regs[regno] = 1;
+ }
+
+ /* The link register can be clobbered by any branch insn,
+ but we have no way to track that at present, so mark
+ it as unavailable. */
+ if (TARGET_THUMB1)
+ fixed_regs[LR_REGNUM] = call_used_regs[LR_REGNUM] = 1;
+
+ if (TARGET_32BIT && TARGET_HARD_FLOAT)
+ {
+ if (TARGET_MAVERICK)
+ {
+ for (regno = FIRST_FPA_REGNUM;
+ regno <= LAST_FPA_REGNUM; ++ regno)
+ fixed_regs[regno] = call_used_regs[regno] = 1;
+ for (regno = FIRST_CIRRUS_FP_REGNUM;
+ regno <= LAST_CIRRUS_FP_REGNUM; ++ regno)
+ {
+ fixed_regs[regno] = 0;
+ call_used_regs[regno] = regno < FIRST_CIRRUS_FP_REGNUM + 4;
+ }
+ }
+ if (TARGET_VFP)
+ {
+ /* VFPv3 registers are disabled when earlier VFP
+ versions are selected due to the definition of
+ LAST_VFP_REGNUM. */
+ for (regno = FIRST_VFP_REGNUM;
+ regno <= LAST_VFP_REGNUM; ++ regno)
+ {
+ fixed_regs[regno] = 0;
+ call_used_regs[regno] = regno < FIRST_VFP_REGNUM + 16
+ || regno >= FIRST_VFP_REGNUM + 32;
+ }
+ }
+ }
+
+ if (TARGET_REALLY_IWMMXT)
+ {
+ regno = FIRST_IWMMXT_GR_REGNUM;
+ /* The 2002/10/09 revision of the XScale ABI has wCG0
+ and wCG1 as call-preserved registers. The 2002/11/21
+ revision changed this so that all wCG registers are
+ scratch registers. */
+ for (regno = FIRST_IWMMXT_GR_REGNUM;
+ regno <= LAST_IWMMXT_GR_REGNUM; ++ regno)
+ fixed_regs[regno] = 0;
+ /* The XScale ABI has wR0 - wR9 as scratch registers,
+ the rest as call-preserved registers. */
+ for (regno = FIRST_IWMMXT_REGNUM;
+ regno <= LAST_IWMMXT_REGNUM; ++ regno)
+ {
+ fixed_regs[regno] = 0;
+ call_used_regs[regno] = regno < FIRST_IWMMXT_REGNUM + 10;
+ }
+ }
+
+ if ((unsigned) PIC_OFFSET_TABLE_REGNUM != INVALID_REGNUM)
+ {
+ fixed_regs[PIC_OFFSET_TABLE_REGNUM] = 1;
+ call_used_regs[PIC_OFFSET_TABLE_REGNUM] = 1;
+ }
+ else if (TARGET_APCS_STACK)
+ {
+ fixed_regs[10] = 1;
+ call_used_regs[10] = 1;
+ }
+ /* -mcaller-super-interworking reserves r11 for calls to
+ _interwork_r11_call_via_rN(). Making the register global
+ is an easy way of ensuring that it remains valid for all
+ calls. */
+ if (TARGET_APCS_FRAME || TARGET_CALLER_INTERWORKING
+ || TARGET_TPCS_FRAME || TARGET_TPCS_LEAF_FRAME)
+ {
+ fixed_regs[ARM_HARD_FRAME_POINTER_REGNUM] = 1;
+ call_used_regs[ARM_HARD_FRAME_POINTER_REGNUM] = 1;
+ if (TARGET_CALLER_INTERWORKING)
+ global_regs[ARM_HARD_FRAME_POINTER_REGNUM] = 1;
+ }
+ SUBTARGET_CONDITIONAL_REGISTER_USAGE
+}
+
+static reg_class_t
+arm_preferred_rename_class (reg_class_t class)
+{
+ /* Thumb-2 instructions using LO_REGS may be smaller than instructions
+     using GENERAL_REGS.  During the register rename pass we prefer
+     LO_REGS, so code size can be reduced.  */
+ if (TARGET_THUMB2 && class == GENERAL_REGS)
+ return LO_REGS;
+ else
+ return NO_REGS;
+}
+
#include "gt-arm.h"