static bool arm_cirrus_insn_p (rtx);
static void cirrus_reorg (rtx);
static void arm_init_builtins (void);
-static rtx arm_expand_builtin (tree, rtx, rtx, enum machine_mode, int);
static void arm_init_iwmmxt_builtins (void);
static rtx safe_vector_operand (rtx, enum machine_mode);
static rtx arm_expand_binop_builtin (enum insn_code, tree, rtx);
/* The processor for which instructions should be scheduled. */
enum processor_type arm_tune = arm_none;
+/* The current tuning set. */
+const struct tune_params *current_tune;
+
/* The default processor used if not overridden by commandline. */
static enum processor_type arm_default_cpu = arm_none;
the next function. */
static int after_arm_reorg = 0;
-/* The maximum number of insns to be used when loading a constant. */
-static int arm_constant_limit = 3;
-
static enum arm_pcs arm_pcs_default;
/* For an explanation of these variables, see final_prescan_insn below. */
enum processor_type core;
const char *arch;
const unsigned long flags;
- bool (* rtx_costs) (rtx, enum rtx_code, enum rtx_code, int *, bool);
+ const struct tune_params *const tune;
+};
+
+const struct tune_params arm_slowmul_tune =
+{
+ arm_slowmul_rtx_costs,
+ 3   /* Constant limit. */
+};
+
+const struct tune_params arm_fastmul_tune =
+{
+ arm_fastmul_rtx_costs,
+ 1   /* Constant limit. */
+};
+
+const struct tune_params arm_xscale_tune =
+{
+ arm_xscale_rtx_costs,
+ 2   /* Constant limit. */
+};
+
+const struct tune_params arm_9e_tune =
+{
+ arm_9e_rtx_costs,
+ 1   /* Constant limit. */
};
/* Not all of these give usefully different compilation alternatives,
{
/* ARM Cores */
#define ARM_CORE(NAME, IDENT, ARCH, FLAGS, COSTS) \
- {NAME, arm_none, #ARCH, FLAGS | FL_FOR_ARCH##ARCH, arm_##COSTS##_rtx_costs},
+ {NAME, arm_none, #ARCH, FLAGS | FL_FOR_ARCH##ARCH, &arm_##COSTS##_tune},
#include "arm-cores.def"
#undef ARM_CORE
{NULL, arm_none, NULL, 0, NULL}
static const struct processors all_architectures[] =
{
/* ARM Architectures */
- /* We don't specify rtx_costs here as it will be figured out
+ /* We don't specify tuning costs here as they will be figured out
from the core. */
{"armv2", arm2, "2", FL_CO_PROC | FL_MODE26 | FL_FOR_ARCH2, NULL},
TLS_LE32
};
+/* Return the maximum number of insns to be used when loading a constant. */
+inline static int
+arm_constant_limit (bool size_p)
+{
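+ /* When optimizing for size, allow at most one insn before falling
+ back to loading the constant from the literal pool. */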
+ return size_p ? 1 : current_tune->constant_limit;
+}
+
/* Emit an insn that's a simple single-set. Both the operands must be known
to be valid. */
inline static rtx
gcc_assert (arm_tune != arm_none);
tune_flags = all_cores[(int)arm_tune].flags;
+ current_tune = all_cores[(int)arm_tune].tune;
if (target_fp16_format_name)
{
if (optimize_size)
{
- arm_constant_limit = 1;
-
/* If optimizing for size, bump the number of instructions that we
are prepared to conditionally execute (even on a StrongARM). */
max_insns_skipped = 6;
}
else
{
- /* For processors with load scheduling, it never costs more than
- 2 cycles to load a constant, and the load scheduler may well
- reduce that to 1. */
- if (arm_ld_sched)
- arm_constant_limit = 1;
-
- /* On XScale the longer latency of a load makes it more difficult
- to achieve a good schedule, so it's faster to synthesize
- constants that can be done in two insns. */
- if (arm_tune_xscale)
- arm_constant_limit = 2;
-
/* StrongARM has early execution of branches, so a sequence
that is worth skipping is shorter. */
if (arm_tune_strongarm)
&& !cond
&& (arm_gen_constant (code, mode, NULL_RTX, val, target, source,
1, 0)
- > arm_constant_limit + (code != SET)))
+ > (arm_constant_limit (optimize_function_for_size_p (cfun))
+ + (code != SET))))
{
if (code == SET)
{
return arm_size_rtx_costs (x, (enum rtx_code) code,
(enum rtx_code) outer_code, total);
else
- return all_cores[(int)arm_tune].rtx_costs (x, (enum rtx_code) code,
- (enum rtx_code) outer_code,
- total, speed);
+ return current_tune->rtx_costs (x, (enum rtx_code) code,
+ (enum rtx_code) outer_code,
+ total, speed);
}
/* RTX costs for cores with a slow MUL implementation. Thumb-2 is not
supported on any "slowmul" cores, so it can be ignored. */
static bool
-arm_xscale_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, int *total, bool speed)
+arm_xscale_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code,
+ int *total, bool speed)
{
enum machine_mode mode = GET_MODE (x);
}
}
+/* Given the stack offsets and register mask in OFFSETS, decide
+ how many additional registers to push instead of subtracting
+ a constant from SP. */
+static int
+thumb1_extra_regs_pushed (arm_stack_offsets *offsets)
+{
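+ /* The SP adjustment that will remain after the saved registers have
+ been pushed. */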
+ HOST_WIDE_INT amount = offsets->outgoing_args - offsets->saved_regs;
+ unsigned long live_regs_mask = offsets->saved_regs_mask;
+ /* Extract a mask of the ones we can give to the Thumb's push instruction
+ (the low registers r0-r7, plus LR as bit 14). */
+ unsigned long l_mask = live_regs_mask & 0x40ff;
+ /* Then count how many other high registers (r8-r11) will need to be pushed. */
+ unsigned long high_regs_pushed = bit_count (live_regs_mask & 0x0f00);
+ int n_free;
+
+ /* If the stack frame size is 512 exactly, we can save one load
+ instruction, which should make this a win even when optimizing
+ for speed. */
+ if (!optimize_size && amount != 512)
+ return 0;
+
+ /* Can't do this if there are high registers to push, or if we
+ are not going to do a push at all. */
+ if (high_regs_pushed != 0 || l_mask == 0)
+ return 0;
+
+ /* Don't do this if thumb1_expand_prologue wants to emit instructions
+ between the push and the stack frame allocation. */
+ if ((flag_pic && arm_pic_register != INVALID_REGNUM)
+ || (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0))
+ return 0;
+
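+ /* Count how many low registers, starting at r0, are free (not live). */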
+ for (n_free = 0; n_free < 8 && !(live_regs_mask & 1); live_regs_mask >>= 1)
+ n_free++;
+
+ if (n_free == 0)
+ return 0;
+ gcc_assert (amount / 4 * 4 == amount);
+
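+ /* A Thumb-1 "sub sp" immediate can encode at most 508, so if pushing
+ the free registers would bring the adjustment below 512, push just
+ enough of them to reduce it to exactly 508; if they can absorb the
+ whole adjustment, no subtraction is needed at all. */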
+ if (amount >= 512 && (amount - n_free * 4) < 512)
+ return (amount - 508) / 4;
+ if (amount <= n_free * 4)
+ return amount / 4;
+ return 0;
+}
+
/* Generate the rest of a function's prologue. */
void
thumb1_expand_prologue (void)
stack_pointer_rtx);
amount = offsets->outgoing_args - offsets->saved_regs;
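+ /* Registers pushed by thumb1_extra_regs_pushed already account for
+ part of the adjustment, so subtract them from the amount. */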
+ amount -= 4 * thumb1_extra_regs_pushed (offsets);
if (amount)
{
if (amount < 512)
register. */
else if ((l_mask & 0xff) != 0
|| (high_regs_pushed == 0 && l_mask))
- thumb_pushpop (f, l_mask, 1, &cfa_offset, l_mask);
+ {
+ unsigned long mask = l_mask;
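+ /* thumb1_extra_regs_pushed counts free registers upwards from r0,
+ so fold them in as a mask of the lowest bits. */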
+ mask |= (1 << thumb1_extra_regs_pushed (offsets)) - 1;
+ thumb_pushpop (f, mask, 1, &cfa_offset, mask);
+ }
if (high_regs_pushed)
{