#include "df.h"
#include "intl.h"
#include "libfuncs.h"
+#include "params.h"
/* Forward definitions of types. */
typedef struct minipool_node Mnode;
static rtx emit_set_insn (rtx, rtx);
static int arm_arg_partial_bytes (CUMULATIVE_ARGS *, enum machine_mode,
tree, bool);
+static rtx arm_function_arg (CUMULATIVE_ARGS *, enum machine_mode,
+ const_tree, bool);
+static void arm_function_arg_advance (CUMULATIVE_ARGS *, enum machine_mode,
+ const_tree, bool);
static rtx aapcs_allocate_return_reg (enum machine_mode, const_tree,
const_tree);
static int aapcs_select_return_coproc (const_tree, const_tree);
static void arm_trampoline_init (rtx, tree, rtx);
static rtx arm_trampoline_adjust_address (rtx);
static rtx arm_pic_static_addr (rtx orig, rtx reg);
+static bool cortex_a9_sched_adjust_cost (rtx, rtx, rtx, int *);
+static bool xscale_sched_adjust_cost (rtx, rtx, rtx, int *);
\f
/* Table of machine attributes. */
#define TARGET_PASS_BY_REFERENCE arm_pass_by_reference
#undef TARGET_ARG_PARTIAL_BYTES
#define TARGET_ARG_PARTIAL_BYTES arm_arg_partial_bytes
+#undef TARGET_FUNCTION_ARG
+#define TARGET_FUNCTION_ARG arm_function_arg
+#undef TARGET_FUNCTION_ARG_ADVANCE
+#define TARGET_FUNCTION_ARG_ADVANCE arm_function_arg_advance
#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS arm_setup_incoming_varargs
#define FL_NEON (1 << 20) /* Neon instructions. */
#define FL_ARCH7EM (1 << 21) /* Instructions present in the ARMv7E-M
architecture. */
+#define FL_ARCH7 (1 << 22) /* Architecture 7. */
#define FL_IWMMXT (1 << 29) /* XScale v2 or "Intel Wireless MMX technology". */
#define FL_FOR_ARCH6ZK FL_FOR_ARCH6K
#define FL_FOR_ARCH6T2 (FL_FOR_ARCH6 | FL_THUMB2)
#define FL_FOR_ARCH6M (FL_FOR_ARCH6 & ~FL_NOTM)
-#define FL_FOR_ARCH7 (FL_FOR_ARCH6T2 &~ FL_NOTM)
+#define FL_FOR_ARCH7 ((FL_FOR_ARCH6T2 & ~FL_NOTM) | FL_ARCH7)
#define FL_FOR_ARCH7A (FL_FOR_ARCH7 | FL_NOTM | FL_ARCH6K)
#define FL_FOR_ARCH7R (FL_FOR_ARCH7A | FL_DIV)
#define FL_FOR_ARCH7M (FL_FOR_ARCH7 | FL_DIV)
/* Nonzero if this chip supports the ARM 6K extensions. */
int arm_arch6k = 0;
+/* Nonzero if this chip supports the ARM 7 extensions. */
+int arm_arch7 = 0;
+
/* Nonzero if instructions not present in the 'M' profile can be used. */
int arm_arch_notm = 0;
/* Nonzero if generating Thumb instructions. */
int thumb_code = 0;
+/* Nonzero if generating Thumb-1 instructions. */
+int thumb1_code = 0;
+
/* Nonzero if we should define __THUMB_INTERWORK__ in the
preprocessor.
XXX This is a bit of a hack, it's intended to help work around
the next function. */
static int after_arm_reorg = 0;
-static enum arm_pcs arm_pcs_default;
+enum arm_pcs arm_pcs_default;
/* For an explanation of these variables, see final_prescan_insn below. */
int arm_ccfsm_state;
/* arm_current_cc is also used for Thumb-2 cond_exec blocks. */
enum arm_cond_code arm_current_cc;
+
rtx arm_target_insn;
int arm_target_label;
/* The number of conditionally executed insns, including the current insn. */
"hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
};
+/* The register numbers in sequence, for passing to arm_gen_load_multiple. */
+int arm_regs_in_sequence[] =
+{
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
#define ARM_LSL_NAME (TARGET_UNIFIED_ASM ? "lsl" : "asl")
#define streq(string1, string2) (strcmp (string1, string2) == 0)
const struct tune_params arm_slowmul_tune =
{
arm_slowmul_rtx_costs,
+ NULL,
3
};
const struct tune_params arm_fastmul_tune =
{
arm_fastmul_rtx_costs,
+ NULL,
1
};
const struct tune_params arm_xscale_tune =
{
arm_xscale_rtx_costs,
+ xscale_sched_adjust_cost,
2
};
const struct tune_params arm_9e_tune =
{
arm_9e_rtx_costs,
+ NULL,
1
};
+const struct tune_params arm_cortex_a9_tune =
+{
+ arm_9e_rtx_costs,
+ cortex_a9_sched_adjust_cost,
+ 1
+};
+
+
/* Not all of these give usefully different compilation alternatives,
but there is no simple way of generalizing them. */
static const struct processors all_cores[] =
arm_selected_cpu = &all_cores[SUBTARGET_CPU_DEFAULT];
#endif
/* Default to ARM6. */
- if (arm_selected_cpu->name)
+ if (!arm_selected_cpu->name)
arm_selected_cpu = &all_cores[arm6];
}
/* Callee super interworking implies thumb interworking. Adding
this to the flags here simplifies the logic elsewhere. */
if (TARGET_THUMB && TARGET_CALLEE_INTERWORKING)
- target_flags |= MASK_INTERWORK;
+ target_flags |= MASK_INTERWORK;
/* TARGET_BACKTRACE calls leaf_function_p, which causes a crash if done
from here where no function is being compiled currently. */
if (TARGET_ARM && TARGET_CALLEE_INTERWORKING)
warning (0, "enabling callee interworking support is only meaningful when compiling for the Thumb");
- if (TARGET_ARM && TARGET_CALLER_INTERWORKING)
- warning (0, "enabling caller interworking support is only meaningful when compiling for the Thumb");
-
if (TARGET_APCS_STACK && !TARGET_APCS_FRAME)
{
warning (0, "-mapcs-stack-check incompatible with -mno-apcs-frame");
arm_arch6 = (insn_flags & FL_ARCH6) != 0;
arm_arch6k = (insn_flags & FL_ARCH6K) != 0;
arm_arch_notm = (insn_flags & FL_NOTM) != 0;
+ arm_arch7 = (insn_flags & FL_ARCH7) != 0;
arm_arch7em = (insn_flags & FL_ARCH7EM) != 0;
arm_arch_thumb2 = (insn_flags & FL_THUMB2) != 0;
arm_arch_xscale = (insn_flags & FL_XSCALE) != 0;
arm_ld_sched = (tune_flags & FL_LDSCHED) != 0;
arm_tune_strongarm = (tune_flags & FL_STRONG) != 0;
- thumb_code = (TARGET_ARM == 0);
+ thumb_code = TARGET_ARM == 0;
+ thumb1_code = TARGET_THUMB1 != 0;
arm_tune_wbuf = (tune_flags & FL_WBUF) != 0;
arm_tune_xscale = (tune_flags & FL_XSCALE) != 0;
arm_arch_iwmmxt = (insn_flags & FL_IWMMXT) != 0;
flag_reorder_blocks = 1;
}
+ if (!PARAM_SET_P (PARAM_GCSE_UNRESTRICTED_COST)
+ && flag_pic)
+ /* Hoisting PIC address calculations more aggressively provides a small,
+ but measurable, size reduction for PIC code. Therefore, we decrease
+ the bar for unrestricted expression hoisting to the cost of PIC address
+ calculation, which is 2 instructions. */
+ set_param_value ("gcse-unrestricted-cost", 2);
+
/* Register global variables with the garbage collector. */
arm_add_gc_roots ();
}
/* Detect varargs functions. These always use the base rules
(no argument is ever a candidate for a co-processor
register). */
- bool base_rules = (TYPE_ARG_TYPES (type) != 0
- && (TREE_VALUE (tree_last (TYPE_ARG_TYPES (type)))
- != void_type_node));
+ bool base_rules = stdarg_p (type);
if (user_convention)
{
static int
aapcs_select_call_coproc (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
- tree type)
+ const_tree type)
{
int i;
numbers referred to here are those in the AAPCS. */
static void
aapcs_layout_arg (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
- tree type, int named)
+ const_tree type, bool named)
{
int nregs, nregs2;
int ncrn;
/* Return true if mode/type need doubleword alignment. */
bool
-arm_needs_doubleword_align (enum machine_mode mode, tree type)
+arm_needs_doubleword_align (enum machine_mode mode, const_tree type)
{
return (GET_MODE_ALIGNMENT (mode) > PARM_BOUNDARY
|| (type && TYPE_ALIGN (type) > PARM_BOUNDARY));
CUM is a variable of type CUMULATIVE_ARGS which gives info about
the preceding args and about the function being called.
NAMED is nonzero if this argument is a named parameter
- (otherwise it is an extra parameter matching an ellipsis). */
+ (otherwise it is an extra parameter matching an ellipsis).
-rtx
+ On the ARM, normally the first 16 bytes are passed in registers r0-r3; all
+ other arguments are passed on the stack. If (NAMED == 0) (which happens
+ only in assign_parms, since TARGET_SETUP_INCOMING_VARARGS is
+ defined), say it is passed in the stack (function_prologue will
+ indeed make it pass in the stack if necessary). */
+
+static rtx
arm_function_arg (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
- tree type, int named)
+ const_tree type, bool named)
{
int nregs;
&& arm_needs_doubleword_align (mode, type))
pcum->nregs++;
- if (mode == VOIDmode)
- /* Pick an arbitrary value for operand 2 of the call insn. */
- return const0_rtx;
-
/* Only allow splitting an arg between regs and memory if all preceding
args were allocated to regs. For args passed by reference we only count
the reference pointer. */
return 0;
}
-void
+/* Update the data in PCUM to advance over an argument
+ of mode MODE and data type TYPE.
+ (TYPE is null for libcalls where that information may not be available.) */
+
+static void
arm_function_arg_advance (CUMULATIVE_ARGS *pcum, enum machine_mode mode,
- tree type, bool named)
+ const_tree type, bool named)
{
if (pcum->pcs_variant <= ARM_PCS_AAPCS_LOCAL)
{
#define REG_OR_SUBREG_RTX(X) \
(GET_CODE (X) == REG ? (X) : SUBREG_REG (X))
-#ifndef COSTS_N_INSNS
-#define COSTS_N_INSNS(N) ((N) * 4 - 2)
-#endif
static inline int
thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer)
{
since then they might not be moved outside of loops. As a compromise
we allow integration with ops that have a constant as their second
operand. */
- if ((REG_OR_SUBREG_REG (XEXP (x, 0))
- && ARM_FRAME_RTX (REG_OR_SUBREG_RTX (XEXP (x, 0)))
- && GET_CODE (XEXP (x, 1)) != CONST_INT)
- || (REG_OR_SUBREG_REG (XEXP (x, 0))
- && ARM_FRAME_RTX (REG_OR_SUBREG_RTX (XEXP (x, 0)))))
- *total = 4;
+ if (REG_OR_SUBREG_REG (XEXP (x, 0))
+ && ARM_FRAME_RTX (REG_OR_SUBREG_RTX (XEXP (x, 0)))
+ && GET_CODE (XEXP (x, 1)) != CONST_INT)
+ *total = COSTS_N_INSNS (1);
if (mode == DImode)
{
return TARGET_32BIT ? arm_arm_address_cost (x) : arm_thumb_address_cost (x);
}
-static int
-arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost)
+/* Adjust cost hook for XScale. */
+static bool
+xscale_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost)
{
- rtx i_pat, d_pat;
-
/* Some true dependencies can have a higher cost depending
on precisely how certain input operands are used. */
- if (arm_tune_xscale
- && REG_NOTE_KIND (link) == 0
+ if (REG_NOTE_KIND (link) == 0
&& recog_memoized (insn) >= 0
&& recog_memoized (dep) >= 0)
{
if (reg_overlap_mentioned_p (recog_data.operand[opno],
shifted_operand))
- return 2;
+ {
+ *cost = 2;
+ return false;
+ }
}
}
}
+ return true;
+}
+
+/* Adjust cost hook for Cortex A9. */
+static bool
+cortex_a9_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost)
+{
+ switch (REG_NOTE_KIND (link))
+ {
+ case REG_DEP_ANTI:
+ *cost = 0;
+ return false;
+
+ case REG_DEP_TRUE:
+ case REG_DEP_OUTPUT:
+ if (recog_memoized (insn) >= 0
+ && recog_memoized (dep) >= 0)
+ {
+ if (GET_CODE (PATTERN (insn)) == SET)
+ {
+ if (GET_MODE_CLASS
+ (GET_MODE (SET_DEST (PATTERN (insn)))) == MODE_FLOAT
+ || GET_MODE_CLASS
+ (GET_MODE (SET_SRC (PATTERN (insn)))) == MODE_FLOAT)
+ {
+ enum attr_type attr_type_insn = get_attr_type (insn);
+ enum attr_type attr_type_dep = get_attr_type (dep);
+
+ /* By default all dependencies of the form
+ s0 = s0 <op> s1
+ s0 = s0 <op> s2
+ have an extra latency of 1 cycle because
+ of the input and output dependency in this
+ case. However, this gets modeled as a true
+ dependency and hence all these checks. */
+ if (REG_P (SET_DEST (PATTERN (insn)))
+ && REG_P (SET_DEST (PATTERN (dep)))
+ && reg_overlap_mentioned_p (SET_DEST (PATTERN (insn)),
+ SET_DEST (PATTERN (dep))))
+ {
+ /* FMACS is a special case where the dependent
+ instruction can be issued 3 cycles before
+ the normal latency in case of an output
+ dependency. */
+ if ((attr_type_insn == TYPE_FMACS
+ || attr_type_insn == TYPE_FMACD)
+ && (attr_type_dep == TYPE_FMACS
+ || attr_type_dep == TYPE_FMACD))
+ {
+ if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT)
+ *cost = insn_default_latency (dep) - 3;
+ else
+ *cost = insn_default_latency (dep);
+ return false;
+ }
+ else
+ {
+ if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT)
+ *cost = insn_default_latency (dep) + 1;
+ else
+ *cost = insn_default_latency (dep);
+ }
+ return false;
+ }
+ }
+ }
+ }
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ return true;
+}
+
+/* This function implements the target macro TARGET_SCHED_ADJUST_COST.
+ It corrects the value of COST based on the relationship between
+ INSN and DEP through the dependence LINK. It returns the new
+ value. There is a per-core adjust_cost hook to adjust scheduler costs
+ and the per-core hook can choose to completely override the generic
+ adjust_cost function. Only put bits of code into arm_adjust_cost that
+ are common across all cores. */
+static int
+arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost)
+{
+ rtx i_pat, d_pat;
+
+ /* When generating Thumb-1 code, we want to place flag-setting operations
+ close to a conditional branch which depends on them, so that we can
+ omit the comparison. */
+ if (TARGET_THUMB1
+ && REG_NOTE_KIND (link) == 0
+ && recog_memoized (insn) == CODE_FOR_cbranchsi4_insn
+ && recog_memoized (dep) >= 0
+ && get_attr_conds (dep) == CONDS_SET)
+ return 0;
+
+ if (current_tune->sched_adjust_cost != NULL)
+ {
+ if (!current_tune->sched_adjust_cost (insn, link, dep, &cost))
+ return cost;
+ }
/* XXX This is not strictly true for the FPA. */
if (REG_NOTE_KIND (link) == REG_DEP_ANTI
constant pool are cached, and that others will miss. This is a
hack. */
- if ((GET_CODE (src_mem) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (src_mem))
+ if ((GET_CODE (src_mem) == SYMBOL_REF
+ && CONSTANT_POOL_ADDRESS_P (src_mem))
|| reg_mentioned_p (stack_pointer_rtx, src_mem)
|| reg_mentioned_p (frame_pointer_rtx, src_mem)
|| reg_mentioned_p (hard_frame_pointer_rtx, src_mem))
if (nops == 2 && arm_ld_sched && add_offset != 0)
return false;
+ /* XScale has load-store double instructions, but they have stricter
+ alignment requirements than load-store multiple, so we cannot
+ use them.
+
+ For XScale ldm requires 2 + NREGS cycles to complete and blocks
+ the pipeline until completion.
+
+ NREGS CYCLES
+ 1 3
+ 2 4
+ 3 5
+ 4 6
+
+ An ldr instruction takes 1-3 cycles, but does not block the
+ pipeline.
+
+ NREGS CYCLES
+ 1 1-3
+ 2 2-6
+ 3 3-9
+ 4 4-12
+
+ Best case ldr will always win. However, the more ldr instructions
+ we issue, the less likely we are to be able to schedule them well.
+ Using ldr instructions also increases code size.
+
+ As a compromise, we use ldr for counts of 1 or 2 regs, and ldm
+ for counts of 3 or 4 regs. */
+ if (nops <= 2 && arm_tune_xscale && !optimize_size)
+ return false;
return true;
}
return true;
}
-int
-load_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
- HOST_WIDE_INT *load_offset)
+/* Used to determine in a peephole whether a sequence of load
+ instructions can be changed into a load-multiple instruction.
+ NOPS is the number of separate load instructions we are examining. The
+ first NOPS entries in OPERANDS are the destination registers, the
+ next NOPS entries are memory operands. If this function is
+ successful, *BASE is set to the common base register of the memory
+ accesses; *LOAD_OFFSET is set to the first memory location's offset
+ from that base register.
+ REGS is an array filled in with the destination register numbers.
+ SAVED_ORDER (if nonnull) is an array filled in with an order that maps
+ insn numbers to an ascending order of loads. If CHECK_REGS is true,
+ the sequence of registers in REGS matches the loads from ascending memory
+ locations, and the function verifies that the register numbers are
+ themselves ascending. If CHECK_REGS is false, the register numbers
+ are stored in the order they are found in the operands. */
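+/* As an illustrative sketch only (hypothetical operands, assuming any
+ target-specific checks pass): for the pair "ldr r1, [r4]" and
+ "ldr r2, [r4, #4]", OPERANDS would be {r1, r2, [r4], [r4, #4]}; on
+ success REGS becomes {1, 2}, *BASE is set to 4 (the number of r4),
+ *LOAD_OFFSET to 0, and the return value is 1, i.e. the ldmia case. */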
+static int
+load_multiple_sequence (rtx *operands, int nops, int *regs, int *saved_order,
+ int *base, HOST_WIDE_INT *load_offset, bool check_regs)
{
int unsorted_regs[MAX_LDM_STM_OPS];
HOST_WIDE_INT unsorted_offsets[MAX_LDM_STM_OPS];
int order[MAX_LDM_STM_OPS];
+ rtx base_reg_rtx = NULL;
int base_reg = -1;
int i, ldm_case;
== CONST_INT)))
{
if (i == 0)
- base_reg = REGNO (reg);
- else
{
- if (base_reg != (int) REGNO (reg))
- /* Not addressed from the same base register. */
+ base_reg = REGNO (reg);
+ base_reg_rtx = reg;
+ if (TARGET_THUMB1 && base_reg > LAST_LO_REGNUM)
return 0;
}
+ else if (base_reg != (int) REGNO (reg))
+ /* Not addressed from the same base register. */
+ return 0;
+
unsorted_regs[i] = (GET_CODE (operands[i]) == REG
? REGNO (operands[i])
: REGNO (SUBREG_REG (operands[i])));
/* If it isn't an integer register, or if it overwrites the
base register but isn't the last insn in the list, then
we can't do this. */
- if (unsorted_regs[i] < 0 || unsorted_regs[i] > 14
+ if (unsorted_regs[i] < 0
+ || (TARGET_THUMB1 && unsorted_regs[i] > LAST_LO_REGNUM)
+ || unsorted_regs[i] > 14
|| (i != nops - 1 && unsorted_regs[i] == base_reg))
return 0;
order[0] has been set to the lowest offset in the list. Sort
the offsets into order, verifying that they are adjacent, and
check that the register numbers are ascending. */
- if (!compute_offset_order (nops, unsorted_offsets, order, unsorted_regs))
+ if (!compute_offset_order (nops, unsorted_offsets, order,
+ check_regs ? unsorted_regs : NULL))
return 0;
+ if (saved_order)
+ memcpy (saved_order, order, sizeof order);
+
if (base)
{
*base = base_reg;
for (i = 0; i < nops; i++)
- regs[i] = unsorted_regs[order[i]];
+ regs[i] = unsorted_regs[check_regs ? order[i] : i];
*load_offset = unsorted_offsets[order[0]];
}
+ if (TARGET_THUMB1
+ && !peep2_reg_dead_p (nops, base_reg_rtx))
+ return 0;
+
if (unsorted_offsets[order[0]] == 0)
ldm_case = 1; /* ldmia */
else if (TARGET_ARM && unsorted_offsets[order[0]] == 4)
ldm_case = 2; /* ldmib */
else if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0)
ldm_case = 3; /* ldmda */
- else if (unsorted_offsets[order[nops - 1]] == -4)
+ else if (TARGET_32BIT && unsorted_offsets[order[nops - 1]] == -4)
ldm_case = 4; /* ldmdb */
else if (const_ok_for_arm (unsorted_offsets[order[0]])
|| const_ok_for_arm (-unsorted_offsets[order[0]]))
return ldm_case;
}
-const char *
-emit_ldm_seq (rtx *operands, int nops)
-{
- int regs[MAX_LDM_STM_OPS];
- int base_reg;
- HOST_WIDE_INT offset;
- char buf[100];
- int i;
-
- switch (load_multiple_sequence (operands, nops, regs, &base_reg, &offset))
- {
- case 1:
- strcpy (buf, "ldm%(ia%)\t");
- break;
-
- case 2:
- strcpy (buf, "ldm%(ib%)\t");
- break;
-
- case 3:
- strcpy (buf, "ldm%(da%)\t");
- break;
-
- case 4:
- strcpy (buf, "ldm%(db%)\t");
- break;
-
- case 5:
- if (offset >= 0)
- sprintf (buf, "add%%?\t%s%s, %s%s, #%ld", REGISTER_PREFIX,
- reg_names[regs[0]], REGISTER_PREFIX, reg_names[base_reg],
- (long) offset);
- else
- sprintf (buf, "sub%%?\t%s%s, %s%s, #%ld", REGISTER_PREFIX,
- reg_names[regs[0]], REGISTER_PREFIX, reg_names[base_reg],
- (long) -offset);
- output_asm_insn (buf, operands);
- base_reg = regs[0];
- strcpy (buf, "ldm%(ia%)\t");
- break;
-
- default:
- gcc_unreachable ();
- }
-
- sprintf (buf + strlen (buf), "%s%s, {%s%s", REGISTER_PREFIX,
- reg_names[base_reg], REGISTER_PREFIX, reg_names[regs[0]]);
-
- for (i = 1; i < nops; i++)
- sprintf (buf + strlen (buf), ", %s%s", REGISTER_PREFIX,
- reg_names[regs[i]]);
-
- strcat (buf, "}\t%@ phole ldm");
-
- output_asm_insn (buf, operands);
- return "";
-}
-
-int
-store_multiple_sequence (rtx *operands, int nops, int *regs, int *base,
- HOST_WIDE_INT * load_offset)
+/* Used to determine in a peephole whether a sequence of store instructions can
+ be changed into a store-multiple instruction.
+ NOPS is the number of separate store instructions we are examining.
+ NOPS_TOTAL is the total number of instructions recognized by the peephole
+ pattern.
+ The first NOPS entries in OPERANDS are the source registers, the next
+ NOPS entries are memory operands. If this function is successful, *BASE is
+ set to the common base register of the memory accesses; *LOAD_OFFSET is set
+ to the first memory location's offset from that base register. REGS is an
+ array filled in with the source register numbers, REG_RTXS (if nonnull) is
+ likewise filled with the corresponding rtx's.
+ SAVED_ORDER (if nonnull) is an array filled in with an order that maps insn
+ numbers to an ascending order of stores.
+ If CHECK_REGS is true, the sequence of registers in *REGS matches the stores
+ from ascending memory locations, and the function verifies that the register
+ numbers are themselves ascending. If CHECK_REGS is false, the register
+ numbers are stored in the order they are found in the operands. */
+static int
+store_multiple_sequence (rtx *operands, int nops, int nops_total,
+ int *regs, rtx *reg_rtxs, int *saved_order, int *base,
+ HOST_WIDE_INT *load_offset, bool check_regs)
{
int unsorted_regs[MAX_LDM_STM_OPS];
+ rtx unsorted_reg_rtxs[MAX_LDM_STM_OPS];
HOST_WIDE_INT unsorted_offsets[MAX_LDM_STM_OPS];
int order[MAX_LDM_STM_OPS];
int base_reg = -1;
+ rtx base_reg_rtx = NULL;
int i, stm_case;
/* Can only handle up to MAX_LDM_STM_OPS insns at present, though could be
&& (GET_CODE (offset = XEXP (XEXP (operands[nops + i], 0), 1))
== CONST_INT)))
{
- unsorted_regs[i] = (GET_CODE (operands[i]) == REG
- ? REGNO (operands[i])
- : REGNO (SUBREG_REG (operands[i])));
+ unsorted_reg_rtxs[i] = (GET_CODE (operands[i]) == REG
+ ? operands[i] : SUBREG_REG (operands[i]));
+ unsorted_regs[i] = REGNO (unsorted_reg_rtxs[i]);
+
if (i == 0)
- base_reg = REGNO (reg);
+ {
+ base_reg = REGNO (reg);
+ base_reg_rtx = reg;
+ if (TARGET_THUMB1 && base_reg > LAST_LO_REGNUM)
+ return 0;
+ }
else if (base_reg != (int) REGNO (reg))
/* Not addressed from the same base register. */
return 0;
/* If it isn't an integer register, then we can't do this. */
- if (unsorted_regs[i] < 0 || unsorted_regs[i] > 14)
+ if (unsorted_regs[i] < 0
+ || (TARGET_THUMB1 && unsorted_regs[i] > LAST_LO_REGNUM)
+ || (TARGET_THUMB2 && unsorted_regs[i] == base_reg)
+ || (TARGET_THUMB2 && unsorted_regs[i] == SP_REGNUM)
+ || unsorted_regs[i] > 14)
return 0;
unsorted_offsets[i] = INTVAL (offset);
order[0] has been set to the lowest offset in the list. Sort
the offsets into order, verifying that they are adjacent, and
check that the register numbers are ascending. */
- if (!compute_offset_order (nops, unsorted_offsets, order, unsorted_regs))
+ if (!compute_offset_order (nops, unsorted_offsets, order,
+ check_regs ? unsorted_regs : NULL))
return 0;
+ if (saved_order)
+ memcpy (saved_order, order, sizeof order);
+
if (base)
{
*base = base_reg;
for (i = 0; i < nops; i++)
- regs[i] = unsorted_regs[order[i]];
+ {
+ regs[i] = unsorted_regs[check_regs ? order[i] : i];
+ if (reg_rtxs)
+ reg_rtxs[i] = unsorted_reg_rtxs[check_regs ? order[i] : i];
+ }
*load_offset = unsorted_offsets[order[0]];
}
+ if (TARGET_THUMB1
+ && !peep2_reg_dead_p (nops_total, base_reg_rtx))
+ return 0;
+
if (unsorted_offsets[order[0]] == 0)
stm_case = 1; /* stmia */
else if (TARGET_ARM && unsorted_offsets[order[0]] == 4)
stm_case = 2; /* stmib */
else if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0)
stm_case = 3; /* stmda */
- else if (unsorted_offsets[order[nops - 1]] == -4)
+ else if (TARGET_32BIT && unsorted_offsets[order[nops - 1]] == -4)
stm_case = 4; /* stmdb */
else
return 0;
return stm_case;
}
+\f
+/* Routines for use in generating RTL. */
-const char *
-emit_stm_seq (rtx *operands, int nops)
+/* Generate a load-multiple instruction. COUNT is the number of loads in
+ the instruction; REGS and MEMS are arrays containing the operands.
+ BASEREG is the base register to be used in addressing the memory operands.
+ WBACK_OFFSET is nonzero if the instruction should update the base
+ register. */
+
+static rtx
+arm_gen_load_multiple_1 (int count, int *regs, rtx *mems, rtx basereg,
+ HOST_WIDE_INT wback_offset)
{
- int regs[MAX_LDM_STM_OPS];
- int base_reg;
- HOST_WIDE_INT offset;
- char buf[100];
- int i;
+ int i = 0, j;
+ rtx result;
- switch (store_multiple_sequence (operands, nops, regs, &base_reg, &offset))
+ if (!multiple_operation_profitable_p (false, count, 0))
{
- case 1:
- strcpy (buf, "stm%(ia%)\t");
- break;
+ rtx seq;
- case 2:
- strcpy (buf, "stm%(ib%)\t");
- break;
+ start_sequence ();
- case 3:
- strcpy (buf, "stm%(da%)\t");
- break;
+ for (i = 0; i < count; i++)
+ emit_move_insn (gen_rtx_REG (SImode, regs[i]), mems[i]);
- case 4:
- strcpy (buf, "stm%(db%)\t");
- break;
+ if (wback_offset != 0)
+ emit_move_insn (basereg, plus_constant (basereg, wback_offset));
- default:
- gcc_unreachable ();
- }
+ seq = get_insns ();
+ end_sequence ();
- sprintf (buf + strlen (buf), "%s%s, {%s%s", REGISTER_PREFIX,
- reg_names[base_reg], REGISTER_PREFIX, reg_names[regs[0]]);
+ return seq;
+ }
- for (i = 1; i < nops; i++)
- sprintf (buf + strlen (buf), ", %s%s", REGISTER_PREFIX,
- reg_names[regs[i]]);
+ result = gen_rtx_PARALLEL (VOIDmode,
+ rtvec_alloc (count + (wback_offset != 0 ? 1 : 0)));
+ if (wback_offset != 0)
+ {
+ XVECEXP (result, 0, 0)
+ = gen_rtx_SET (VOIDmode, basereg,
+ plus_constant (basereg, wback_offset));
+ i = 1;
+ count++;
+ }
- strcat (buf, "}\t%@ phole stm");
+ for (j = 0; i < count; i++, j++)
+ XVECEXP (result, 0, i)
+ = gen_rtx_SET (VOIDmode, gen_rtx_REG (SImode, regs[j]), mems[j]);
- output_asm_insn (buf, operands);
- return "";
+ return result;
}
-\f
-/* Routines for use in generating RTL. */
-rtx
-arm_gen_load_multiple (int base_regno, int count, rtx from, int up,
- int write_back, rtx basemem, HOST_WIDE_INT *offsetp)
+/* Generate a store-multiple instruction. COUNT is the number of stores in
+ the instruction; REGS and MEMS are arrays containing the operands.
+ BASEREG is the base register to be used in addressing the memory operands.
+ WBACK_OFFSET is nonzero if the instruction should update the base
+ register. */
+
+static rtx
+arm_gen_store_multiple_1 (int count, int *regs, rtx *mems, rtx basereg,
+ HOST_WIDE_INT wback_offset)
{
- HOST_WIDE_INT offset = *offsetp;
int i = 0, j;
rtx result;
- int sign = up ? 1 : -1;
- rtx mem, addr;
- /* XScale has load-store double instructions, but they have stricter
- alignment requirements than load-store multiple, so we cannot
- use them.
+ if (GET_CODE (basereg) == PLUS)
+ basereg = XEXP (basereg, 0);
- For XScale ldm requires 2 + NREGS cycles to complete and blocks
- the pipeline until completion.
+ if (!multiple_operation_profitable_p (false, count, 0))
+ {
+ rtx seq;
- NREGS CYCLES
- 1 3
- 2 4
- 3 5
- 4 6
+ start_sequence ();
- An ldr instruction takes 1-3 cycles, but does not block the
- pipeline.
+ for (i = 0; i < count; i++)
+ emit_move_insn (mems[i], gen_rtx_REG (SImode, regs[i]));
- NREGS CYCLES
- 1 1-3
- 2 2-6
- 3 3-9
- 4 4-12
+ if (wback_offset != 0)
+ emit_move_insn (basereg, plus_constant (basereg, wback_offset));
- Best case ldr will always win. However, the more ldr instructions
- we issue, the less likely we are to be able to schedule them well.
- Using ldr instructions also increases code size.
+ seq = get_insns ();
+ end_sequence ();
- As a compromise, we use ldr for counts of 1 or 2 regs, and ldm
- for counts of 3 or 4 regs. */
- if (arm_tune_xscale && count <= 2 && ! optimize_size)
+ return seq;
+ }
+
+ result = gen_rtx_PARALLEL (VOIDmode,
+ rtvec_alloc (count + (wback_offset != 0 ? 1 : 0)));
+ if (wback_offset != 0)
{
- rtx seq;
+ XVECEXP (result, 0, 0)
+ = gen_rtx_SET (VOIDmode, basereg,
+ plus_constant (basereg, wback_offset));
+ i = 1;
+ count++;
+ }
- start_sequence ();
+ for (j = 0; i < count; i++, j++)
+ XVECEXP (result, 0, i)
+ = gen_rtx_SET (VOIDmode, mems[j], gen_rtx_REG (SImode, regs[j]));
- for (i = 0; i < count; i++)
+ return result;
+}
+
+/* Generate either a load-multiple or a store-multiple instruction. This
+ function can be used in situations where we can start with a single MEM
+ rtx and adjust its address upwards.
+ COUNT is the number of operations in the instruction, not counting a
+ possible update of the base register. REGS is an array containing the
+ register operands.
+ BASEREG is the base register to be used in addressing the memory operands,
+ which are constructed from BASEMEM.
+ WRITE_BACK specifies whether the generated instruction should include an
+ update of the base register.
+ OFFSETP is used to pass an offset to and from this function; this offset
+ is not used when constructing the address (instead BASEMEM should have an
+ appropriate offset in its address), it is used only for setting
+ MEM_OFFSET. It is updated only if WRITE_BACK is true. */
+
+static rtx
+arm_gen_multiple_op (bool is_load, int *regs, int count, rtx basereg,
+ bool write_back, rtx basemem, HOST_WIDE_INT *offsetp)
+{
+ rtx mems[MAX_LDM_STM_OPS];
+ HOST_WIDE_INT offset = *offsetp;
+ int i;
+
+ gcc_assert (count <= MAX_LDM_STM_OPS);
+
+ if (GET_CODE (basereg) == PLUS)
+ basereg = XEXP (basereg, 0);
+
+ for (i = 0; i < count; i++)
+ {
+ rtx addr = plus_constant (basereg, i * 4);
+ mems[i] = adjust_automodify_address_nv (basemem, SImode, addr, offset);
+ offset += 4;
+ }
+
+ if (write_back)
+ *offsetp = offset;
+
+ if (is_load)
+ return arm_gen_load_multiple_1 (count, regs, mems, basereg,
+ write_back ? 4 * count : 0);
+ else
+ return arm_gen_store_multiple_1 (count, regs, mems, basereg,
+ write_back ? 4 * count : 0);
+}
+
+rtx
+arm_gen_load_multiple (int *regs, int count, rtx basereg, int write_back,
+ rtx basemem, HOST_WIDE_INT *offsetp)
+{
+ return arm_gen_multiple_op (TRUE, regs, count, basereg, write_back, basemem,
+ offsetp);
+}
+
+rtx
+arm_gen_store_multiple (int *regs, int count, rtx basereg, int write_back,
+ rtx basemem, HOST_WIDE_INT *offsetp)
+{
+ return arm_gen_multiple_op (FALSE, regs, count, basereg, write_back, basemem,
+ offsetp);
+}
+
+/* Called from a peephole2 expander to turn a sequence of loads into an
+ LDM instruction. OPERANDS are the operands found by the peephole matcher;
+ NOPS indicates how many separate loads we are trying to combine. SORT_REGS
+ is true if we can reorder the registers because they are subsequently
+ used commutatively.
+ Returns true iff we could generate a new instruction. */
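+/* For illustration only (register numbers chosen arbitrarily): a pair of
+ loads such as "ldr r1, [r3]" followed by "ldr r2, [r3, #4]" can be
+ combined into a single load-multiple, output as "ldmia r3, {r1, r2}",
+ provided the register numbers ascend with the memory locations (or
+ SORT_REGS allows them to be reordered). */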
+
+bool
+gen_ldm_seq (rtx *operands, int nops, bool sort_regs)
+{
+ int regs[MAX_LDM_STM_OPS], mem_order[MAX_LDM_STM_OPS];
+ rtx mems[MAX_LDM_STM_OPS];
+ int i, j, base_reg;
+ rtx base_reg_rtx;
+ HOST_WIDE_INT offset;
+ int write_back = FALSE;
+ int ldm_case;
+ rtx addr;
+
+ ldm_case = load_multiple_sequence (operands, nops, regs, mem_order,
+ &base_reg, &offset, !sort_regs);
+
+ if (ldm_case == 0)
+ return false;
+
+ if (sort_regs)
+ for (i = 0; i < nops - 1; i++)
+ for (j = i + 1; j < nops; j++)
+ if (regs[i] > regs[j])
+ {
+ int t = regs[i];
+ regs[i] = regs[j];
+ regs[j] = t;
+ }
+ base_reg_rtx = gen_rtx_REG (Pmode, base_reg);
+
+ if (TARGET_THUMB1)
+ {
+ gcc_assert (peep2_reg_dead_p (nops, base_reg_rtx));
+ gcc_assert (ldm_case == 1 || ldm_case == 5);
+ write_back = TRUE;
+ }
+
+ if (ldm_case == 5)
+ {
+ rtx newbase = TARGET_THUMB1 ? base_reg_rtx : gen_rtx_REG (SImode, regs[0]);
+ emit_insn (gen_addsi3 (newbase, base_reg_rtx, GEN_INT (offset)));
+ offset = 0;
+ if (!TARGET_THUMB1)
{
- addr = plus_constant (from, i * 4 * sign);
- mem = adjust_automodify_address (basemem, SImode, addr, offset);
- emit_move_insn (gen_rtx_REG (SImode, base_regno + i), mem);
- offset += 4 * sign;
+ base_reg = regs[0];
+ base_reg_rtx = newbase;
}
+ }
+
+ for (i = 0; i < nops; i++)
+ {
+ addr = plus_constant (base_reg_rtx, offset + i * 4);
+ mems[i] = adjust_automodify_address_nv (operands[nops + mem_order[i]],
+ SImode, addr, 0);
+ }
+ emit_insn (arm_gen_load_multiple_1 (nops, regs, mems, base_reg_rtx,
+ write_back ? offset + i * 4 : 0));
+ return true;
+}
+
+/* Called from a peephole2 expander to turn a sequence of stores into an
+ STM instruction. OPERANDS are the operands found by the peephole matcher;
+ NOPS indicates how many separate stores we are trying to combine.
+ Returns true iff we could generate a new instruction. */
+
+bool
+gen_stm_seq (rtx *operands, int nops)
+{
+ int i;
+ int regs[MAX_LDM_STM_OPS], mem_order[MAX_LDM_STM_OPS];
+ rtx mems[MAX_LDM_STM_OPS];
+ int base_reg;
+ rtx base_reg_rtx;
+ HOST_WIDE_INT offset;
+ int write_back = FALSE;
+ int stm_case;
+ rtx addr;
+ bool base_reg_dies;
- if (write_back)
- {
- emit_move_insn (from, plus_constant (from, count * 4 * sign));
- *offsetp = offset;
- }
+ stm_case = store_multiple_sequence (operands, nops, nops, regs, NULL,
+ mem_order, &base_reg, &offset, true);
- seq = get_insns ();
- end_sequence ();
+ if (stm_case == 0)
+ return false;
- return seq;
- }
+ base_reg_rtx = gen_rtx_REG (Pmode, base_reg);
- result = gen_rtx_PARALLEL (VOIDmode,
- rtvec_alloc (count + (write_back ? 1 : 0)));
- if (write_back)
+ base_reg_dies = peep2_reg_dead_p (nops, base_reg_rtx);
+ if (TARGET_THUMB1)
{
- XVECEXP (result, 0, 0)
- = gen_rtx_SET (VOIDmode, from, plus_constant (from, count * 4 * sign));
- i = 1;
- count++;
+ gcc_assert (base_reg_dies);
+ write_back = TRUE;
}
- for (j = 0; i < count; i++, j++)
+ if (stm_case == 5)
{
- addr = plus_constant (from, j * 4 * sign);
- mem = adjust_automodify_address_nv (basemem, SImode, addr, offset);
- XVECEXP (result, 0, i)
- = gen_rtx_SET (VOIDmode, gen_rtx_REG (SImode, base_regno + j), mem);
- offset += 4 * sign;
+ gcc_assert (base_reg_dies);
+ emit_insn (gen_addsi3 (base_reg_rtx, base_reg_rtx, GEN_INT (offset)));
+ offset = 0;
}
- if (write_back)
- *offsetp = offset;
+ addr = plus_constant (base_reg_rtx, offset);
- return result;
+ for (i = 0; i < nops; i++)
+ {
+ addr = plus_constant (base_reg_rtx, offset + i * 4);
+ mems[i] = adjust_automodify_address_nv (operands[nops + mem_order[i]],
+ SImode, addr, 0);
+ }
+ emit_insn (arm_gen_store_multiple_1 (nops, regs, mems, base_reg_rtx,
+ write_back ? offset + i * 4 : 0));
+ return true;
}
-rtx
-arm_gen_store_multiple (int base_regno, int count, rtx to, int up,
- int write_back, rtx basemem, HOST_WIDE_INT *offsetp)
+/* Called from a peephole2 expander to turn a sequence of stores that are
+ preceded by constant loads into an STM instruction. OPERANDS are the
+ operands found by the peephole matcher; NOPS indicates how many
+ separate stores we are trying to combine; there are 2 * NOPS
+ instructions in the peephole.
+ Returns true iff we could generate a new instruction. */
+
+bool
+gen_const_stm_seq (rtx *operands, int nops)
{
- HOST_WIDE_INT offset = *offsetp;
- int i = 0, j;
- rtx result;
- int sign = up ? 1 : -1;
- rtx mem, addr;
+ int regs[MAX_LDM_STM_OPS], sorted_regs[MAX_LDM_STM_OPS];
+ int reg_order[MAX_LDM_STM_OPS], mem_order[MAX_LDM_STM_OPS];
+ rtx reg_rtxs[MAX_LDM_STM_OPS], orig_reg_rtxs[MAX_LDM_STM_OPS];
+ rtx mems[MAX_LDM_STM_OPS];
+ int base_reg;
+ rtx base_reg_rtx;
+ HOST_WIDE_INT offset;
+ int write_back = FALSE;
+ int stm_case;
+ rtx addr;
+ bool base_reg_dies;
+ int i, j;
+ HARD_REG_SET allocated;
- /* See arm_gen_load_multiple for discussion of
- the pros/cons of ldm/stm usage for XScale. */
- if (arm_tune_xscale && count <= 2 && ! optimize_size)
- {
- rtx seq;
+ stm_case = store_multiple_sequence (operands, nops, 2 * nops, regs, reg_rtxs,
+ mem_order, &base_reg, &offset, false);
- start_sequence ();
+ if (stm_case == 0)
+ return false;
- for (i = 0; i < count; i++)
- {
- addr = plus_constant (to, i * 4 * sign);
- mem = adjust_automodify_address (basemem, SImode, addr, offset);
- emit_move_insn (mem, gen_rtx_REG (SImode, base_regno + i));
- offset += 4 * sign;
- }
+ memcpy (orig_reg_rtxs, reg_rtxs, sizeof orig_reg_rtxs);
- if (write_back)
- {
- emit_move_insn (to, plus_constant (to, count * 4 * sign));
- *offsetp = offset;
- }
+ /* If the same register is used more than once, try to find a free
+ register. */
+ CLEAR_HARD_REG_SET (allocated);
+ for (i = 0; i < nops; i++)
+ {
+ for (j = i + 1; j < nops; j++)
+ if (regs[i] == regs[j])
+ {
+ rtx t = peep2_find_free_register (0, nops * 2,
+ TARGET_THUMB1 ? "l" : "r",
+ SImode, &allocated);
+ if (t == NULL_RTX)
+ return false;
+ reg_rtxs[i] = t;
+ regs[i] = REGNO (t);
+ }
+ }
- seq = get_insns ();
- end_sequence ();
+ /* Compute an ordering that maps the register numbers to an ascending
+ sequence. */
+ reg_order[0] = 0;
+ for (i = 0; i < nops; i++)
+ if (regs[i] < regs[reg_order[0]])
+ reg_order[0] = i;
- return seq;
+ for (i = 1; i < nops; i++)
+ {
+ int this_order = reg_order[i - 1];
+ for (j = 0; j < nops; j++)
+ if (regs[j] > regs[reg_order[i - 1]]
+ && (this_order == reg_order[i - 1]
+ || regs[j] < regs[this_order]))
+ this_order = j;
+ reg_order[i] = this_order;
}
- result = gen_rtx_PARALLEL (VOIDmode,
- rtvec_alloc (count + (write_back ? 1 : 0)));
- if (write_back)
+ /* Ensure that registers that must be live after the instruction end
+ up with the correct value. */
+ for (i = 0; i < nops; i++)
{
- XVECEXP (result, 0, 0)
- = gen_rtx_SET (VOIDmode, to,
- plus_constant (to, count * 4 * sign));
- i = 1;
- count++;
+ int this_order = reg_order[i];
+ if ((this_order != mem_order[i]
+ || orig_reg_rtxs[this_order] != reg_rtxs[this_order])
+ && !peep2_reg_dead_p (nops * 2, orig_reg_rtxs[this_order]))
+ return false;
}
- for (j = 0; i < count; i++, j++)
+ /* Load the constants. */
+ for (i = 0; i < nops; i++)
{
- addr = plus_constant (to, j * 4 * sign);
- mem = adjust_automodify_address_nv (basemem, SImode, addr, offset);
- XVECEXP (result, 0, i)
- = gen_rtx_SET (VOIDmode, mem, gen_rtx_REG (SImode, base_regno + j));
- offset += 4 * sign;
+ rtx op = operands[2 * nops + mem_order[i]];
+ sorted_regs[i] = regs[reg_order[i]];
+ emit_move_insn (reg_rtxs[reg_order[i]], op);
}
- if (write_back)
- *offsetp = offset;
+ base_reg_rtx = gen_rtx_REG (Pmode, base_reg);
- return result;
+ base_reg_dies = peep2_reg_dead_p (nops * 2, base_reg_rtx);
+ if (TARGET_THUMB1)
+ {
+ gcc_assert (base_reg_dies);
+ write_back = TRUE;
+ }
+
+ if (stm_case == 5)
+ {
+ gcc_assert (base_reg_dies);
+ emit_insn (gen_addsi3 (base_reg_rtx, base_reg_rtx, GEN_INT (offset)));
+ offset = 0;
+ }
+
+ addr = plus_constant (base_reg_rtx, offset);
+
+ for (i = 0; i < nops; i++)
+ {
+ addr = plus_constant (base_reg_rtx, offset + i * 4);
+ mems[i] = adjust_automodify_address_nv (operands[nops + mem_order[i]],
+ SImode, addr, 0);
+ }
+ emit_insn (arm_gen_store_multiple_1 (nops, sorted_regs, mems, base_reg_rtx,
+ write_back ? offset + i * 4 : 0));
+ return true;
}
int
for (i = 0; in_words_to_go >= 2; i+=4)
{
if (in_words_to_go > 4)
- emit_insn (arm_gen_load_multiple (0, 4, src, TRUE, TRUE,
- srcbase, &srcoffset));
+ emit_insn (arm_gen_load_multiple (arm_regs_in_sequence, 4, src,
+ TRUE, srcbase, &srcoffset));
else
- emit_insn (arm_gen_load_multiple (0, in_words_to_go, src, TRUE,
- FALSE, srcbase, &srcoffset));
+ emit_insn (arm_gen_load_multiple (arm_regs_in_sequence, in_words_to_go,
+ src, FALSE, srcbase,
+ &srcoffset));
if (out_words_to_go)
{
if (out_words_to_go > 4)
- emit_insn (arm_gen_store_multiple (0, 4, dst, TRUE, TRUE,
- dstbase, &dstoffset));
+ emit_insn (arm_gen_store_multiple (arm_regs_in_sequence, 4, dst,
+ TRUE, dstbase, &dstoffset));
else if (out_words_to_go != 1)
- emit_insn (arm_gen_store_multiple (0, out_words_to_go,
- dst, TRUE,
+ emit_insn (arm_gen_store_multiple (arm_regs_in_sequence,
+ out_words_to_go, dst,
(last_bytes == 0
? FALSE : TRUE),
dstbase, &dstoffset));
{
if (GET_CODE (XEXP (operands[0], 0)) == PRE_MODIFY)
{
- output_asm_insn ("ldr%?\t%0, [%1, %2]!", otherops);
- output_asm_insn ("ldr%?\t%H0, [%1, #4]", otherops);
+ output_asm_insn ("str%?\t%0, [%1, %2]!", otherops);
+ output_asm_insn ("str%?\t%H0, [%1, #4]", otherops);
}
else
{
- output_asm_insn ("ldr%?\t%H0, [%1, #4]", otherops);
- output_asm_insn ("ldr%?\t%0, [%1], %2", otherops);
+ output_asm_insn ("str%?\t%H0, [%1, #4]", otherops);
+ output_asm_insn ("str%?\t%0, [%1], %2", otherops);
}
}
else if (GET_CODE (XEXP (operands[0], 0)) == PRE_MODIFY)
&& !crtl->tail_call_emit)
{
unsigned long mask;
- mask = (1 << (arm_size_return_regs() / 4)) - 1;
+ /* Preserve return values, of any size. */
+ mask = (1 << ((arm_size_return_regs() + 3) / 4)) - 1;
mask ^= 0xf;
mask &= ~saved_regs_mask;
reg = 0;
}
return;
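+ /* Print a memory operand as a simple register-indirect reference,
+ e.g. "[r3]"; used by the ldrex/strex output templates below. */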
+ case 'C':
+ {
+ rtx addr;
+
+ gcc_assert (GET_CODE (x) == MEM);
+ addr = XEXP (x, 0);
+ gcc_assert (GET_CODE (addr) == REG);
+ asm_fprintf (stream, "[%r]", REGNO (addr));
+ }
+ return;
+
/* Translate an S register number into a D register number and element index. */
case 'y':
{
/* Return to caller. */
asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr);
}
-
\f
+/* Scan INSN just before assembler is output for it.
+ For Thumb-1, we track the status of the condition codes; this
+ information is used in the cbranchsi4_insn pattern. */
void
thumb1_final_prescan_insn (rtx insn)
{
if (flag_print_asm_name)
asm_fprintf (asm_out_file, "%@ 0x%04x\n",
INSN_ADDRESSES (INSN_UID (insn)));
+ /* Don't overwrite the previous setter when we get to a cbranch. */
+ if (INSN_CODE (insn) != CODE_FOR_cbranchsi4_insn)
+ {
+ enum attr_conds conds;
+
+ if (cfun->machine->thumb1_cc_insn)
+ {
+ if (modified_in_p (cfun->machine->thumb1_cc_op0, insn)
+ || modified_in_p (cfun->machine->thumb1_cc_op1, insn))
+ CC_STATUS_INIT;
+ }
+ conds = get_attr_conds (insn);
+ if (conds == CONDS_SET)
+ {
+ rtx set = single_set (insn);
+ cfun->machine->thumb1_cc_insn = insn;
+ cfun->machine->thumb1_cc_op0 = SET_DEST (set);
+ cfun->machine->thumb1_cc_op1 = const0_rtx;
+ cfun->machine->thumb1_cc_mode = CC_NOOVmode;
+ if (INSN_CODE (insn) == CODE_FOR_thumb1_subsi3_insn)
+ {
+ rtx src1 = XEXP (SET_SRC (set), 1);
+ if (src1 == const0_rtx)
+ cfun->machine->thumb1_cc_mode = CCmode;
+ }
+ }
+ else if (conds != CONDS_NOCOND)
+ cfun->machine->thumb1_cc_insn = NULL_RTX;
+ }
}
int
{
case cortexr4:
case cortexr4f:
+ case cortexa5:
case cortexa8:
case cortexa9:
return 2;
return !TARGET_THUMB1;
}
+/* Legitimize a memory reference for sync primitive implemented using
+ ldrex / strex. We currently force the form of the reference to be
+ indirect without offset. We do not yet support the indirect offset
+ addressing supported by some ARM targets for these
+ instructions. */
+static rtx
+arm_legitimize_sync_memory (rtx memory)
+{
+ rtx addr = force_reg (Pmode, XEXP (memory, 0));
+ rtx legitimate_memory = gen_rtx_MEM (GET_MODE (memory), addr);
+
+ set_mem_alias_set (legitimate_memory, ALIAS_SET_MEMORY_BARRIER);
+ MEM_VOLATILE_P (legitimate_memory) = MEM_VOLATILE_P (memory);
+ return legitimate_memory;
+}
+
+/* An instruction emitter. */
+typedef void (* emit_f) (int label, const char *, rtx *);
+
+/* An instruction emitter that emits via the conventional
+ output_asm_insn. */
+static void
+arm_emit (int label ATTRIBUTE_UNUSED, const char *pattern, rtx *operands)
+{
+ output_asm_insn (pattern, operands);
+}
+
+/* Count the number of emitted synchronization instructions. */
+static unsigned arm_insn_count;
+
+/* An emitter that counts emitted instructions but does not actually
+ emit instructions into the instruction stream. */
+static void
+arm_count (int label,
+ const char *pattern ATTRIBUTE_UNUSED,
+ rtx *operands ATTRIBUTE_UNUSED)
+{
+ if (! label)
+ ++ arm_insn_count;
+}
+
+/* Construct a pattern using conventional output formatting and feed
+ it to output_asm_insn. Provides a mechanism to construct the
+ output pattern on the fly. Note the hard limit on the pattern
+ buffer size. */
+static void
+arm_output_asm_insn (emit_f emit, int label, rtx *operands,
+ const char *pattern, ...)
+{
+ va_list ap;
+ char buffer[256];
+
+ va_start (ap, pattern);
+ vsprintf (buffer, pattern, ap);
+ va_end (ap);
+ emit (label, buffer, operands);
+}
+
+/* Emit the memory barrier instruction, if any, provided by this
+ target to a specified emitter. */
+static void
+arm_process_output_memory_barrier (emit_f emit, rtx *operands)
+{
+ if (TARGET_HAVE_DMB)
+ {
+ /* Note we issue a system level barrier. We should consider
+ issuing an inner shareability zone barrier here instead, i.e.
+ "DMB ISH". */
+ emit (0, "dmb\tsy", operands);
+ return;
+ }
+
+ if (TARGET_HAVE_DMB_MCR)
+ {
+ emit (0, "mcr\tp15, 0, r0, c7, c10, 5", operands);
+ return;
+ }
+
+ gcc_unreachable ();
+}
+
+/* Emit the memory barrier instruction, if any, provided by this
+ target. */
+const char *
+arm_output_memory_barrier (rtx *operands)
+{
+ arm_process_output_memory_barrier (arm_emit, operands);
+ return "";
+}
+
+/* Helper to figure out the instruction suffix required on ldrex/strex
+ for operations on an object of the specified mode. */
+static const char *
+arm_ldrex_suffix (enum machine_mode mode)
+{
+ switch (mode)
+ {
+ case QImode: return "b";
+ case HImode: return "h";
+ case SImode: return "";
+ case DImode: return "d";
+ default:
+ gcc_unreachable ();
+ }
+ return "";
+}
+
+/* Emit an ldrex{b,h,d, } instruction appropriate for the specified
+ mode. */
+static void
+arm_output_ldrex (emit_f emit,
+ enum machine_mode mode,
+ rtx target,
+ rtx memory)
+{
+ const char *suffix = arm_ldrex_suffix (mode);
+ rtx operands[2];
+
+ operands[0] = target;
+ operands[1] = memory;
+ arm_output_asm_insn (emit, 0, operands, "ldrex%s\t%%0, %%C1", suffix);
+}
+
+/* Emit a strex{b,h,d, } instruction appropriate for the specified
+ mode. */
+static void
+arm_output_strex (emit_f emit,
+ enum machine_mode mode,
+ const char *cc,
+ rtx result,
+ rtx value,
+ rtx memory)
+{
+ const char *suffix = arm_ldrex_suffix (mode);
+ rtx operands[3];
+
+ operands[0] = result;
+ operands[1] = value;
+ operands[2] = memory;
+ arm_output_asm_insn (emit, 0, operands, "strex%s%s\t%%0, %%1, %%C2", suffix,
+ cc);
+}
+
+/* Helper to emit a two operand instruction. */
+static void
+arm_output_op2 (emit_f emit, const char *mnemonic, rtx d, rtx s)
+{
+ rtx operands[2];
+
+ operands[0] = d;
+ operands[1] = s;
+ arm_output_asm_insn (emit, 0, operands, "%s\t%%0, %%1", mnemonic);
+}
+
+/* Helper to emit a three operand instruction. */
+static void
+arm_output_op3 (emit_f emit, const char *mnemonic, rtx d, rtx a, rtx b)
+{
+ rtx operands[3];
+
+ operands[0] = d;
+ operands[1] = a;
+ operands[2] = b;
+ arm_output_asm_insn (emit, 0, operands, "%s\t%%0, %%1, %%2", mnemonic);
+}
+
+/* Emit a load store exclusive synchronization loop.
+
+ do
+ old_value = [mem]
+ if old_value != required_value
+ break;
+ t1 = sync_op (old_value, new_value)
+ [mem] = t1, t2 = [0|1]
+ while ! t2
+
+ Note:
+ t1 == t2 is not permitted
+ t1 == old_value is permitted
+
+ required_value:
+
+ RTX register or const_int representing the required old_value for
+ the modify to continue; if NULL, no comparison is performed. */
+static void
+arm_output_sync_loop (emit_f emit,
+ enum machine_mode mode,
+ rtx old_value,
+ rtx memory,
+ rtx required_value,
+ rtx new_value,
+ rtx t1,
+ rtx t2,
+ enum attr_sync_op sync_op,
+ int early_barrier_required)
+{
+ rtx operands[1];
+
+ gcc_assert (t1 != t2);
+
+ if (early_barrier_required)
+ arm_process_output_memory_barrier (emit, NULL);
+
+ arm_output_asm_insn (emit, 1, operands, "%sLSYT%%=:", LOCAL_LABEL_PREFIX);
+
+ arm_output_ldrex (emit, mode, old_value, memory);
+
+ if (required_value)
+ {
+ rtx operands[2];
+
+ operands[0] = old_value;
+ operands[1] = required_value;
+ arm_output_asm_insn (emit, 0, operands, "cmp\t%%0, %%1");
+ arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYB%%=", LOCAL_LABEL_PREFIX);
+ }
+
+ switch (sync_op)
+ {
+ case SYNC_OP_ADD:
+ arm_output_op3 (emit, "add", t1, old_value, new_value);
+ break;
+
+ case SYNC_OP_SUB:
+ arm_output_op3 (emit, "sub", t1, old_value, new_value);
+ break;
+
+ case SYNC_OP_IOR:
+ arm_output_op3 (emit, "orr", t1, old_value, new_value);
+ break;
+
+ case SYNC_OP_XOR:
+ arm_output_op3 (emit, "eor", t1, old_value, new_value);
+ break;
+
+ case SYNC_OP_AND:
+ arm_output_op3 (emit,"and", t1, old_value, new_value);
+ break;
+
+ case SYNC_OP_NAND:
+ arm_output_op3 (emit, "and", t1, old_value, new_value);
+ arm_output_op2 (emit, "mvn", t1, t1);
+ break;
+
+ case SYNC_OP_NONE:
+ t1 = new_value;
+ break;
+ }
+
+ arm_output_strex (emit, mode, "", t2, t1, memory);
+ operands[0] = t2;
+ arm_output_asm_insn (emit, 0, operands, "teq\t%%0, #0");
+ arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYT%%=", LOCAL_LABEL_PREFIX);
+
+ arm_process_output_memory_barrier (emit, NULL);
+ arm_output_asm_insn (emit, 1, operands, "%sLSYB%%=:", LOCAL_LABEL_PREFIX);
+}
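+/* As a rough illustration only (register numbers chosen arbitrarily):
+ for an SImode atomic add with no REQUIRED_VALUE, on a target with the
+ DMB instruction, the loop above expands to approximately
+
+ dmb sy
+ 1: ldrex r0, [r2]
+ add r1, r0, r3
+ strex r4, r1, [r2]
+ teq r4, #0
+ bne 1b
+ dmb sy
+
+ where the actual output uses the LSYT/LSYB local labels rather than
+ "1:". */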
+
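+/* Fetch the synchronization operand selected by INDEX; an INDEX of zero
+ means the operand is absent and DEFAULT_VALUE is returned instead.
+ Used via FETCH_SYNC_OPERAND below, where INDEX comes from the insn's
+ sync_* attributes. */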
+static rtx
+arm_get_sync_operand (rtx *operands, int index, rtx default_value)
+{
+ if (index > 0)
+ default_value = operands[index - 1];
+
+ return default_value;
+}
+
+#define FETCH_SYNC_OPERAND(NAME, DEFAULT) \
+ arm_get_sync_operand (operands, (int) get_attr_sync_##NAME (insn), DEFAULT);
+
+/* Extract the operands for a synchronization instruction from the
+ instruction's attributes and emit the instruction. */
+static void
+arm_process_output_sync_insn (emit_f emit, rtx insn, rtx *operands)
+{
+ rtx result, memory, required_value, new_value, t1, t2;
+ int early_barrier;
+ enum machine_mode mode;
+ enum attr_sync_op sync_op;
+
+ result = FETCH_SYNC_OPERAND(result, 0);
+ memory = FETCH_SYNC_OPERAND(memory, 0);
+ required_value = FETCH_SYNC_OPERAND(required_value, 0);
+ new_value = FETCH_SYNC_OPERAND(new_value, 0);
+ t1 = FETCH_SYNC_OPERAND(t1, 0);
+ t2 = FETCH_SYNC_OPERAND(t2, 0);
+ early_barrier =
+ get_attr_sync_release_barrier (insn) == SYNC_RELEASE_BARRIER_YES;
+ sync_op = get_attr_sync_op (insn);
+ mode = GET_MODE (memory);
+
+ arm_output_sync_loop (emit, mode, result, memory, required_value,
+ new_value, t1, t2, sync_op, early_barrier);
+}
+
+/* Emit a synchronization instruction loop. */
+const char *
+arm_output_sync_insn (rtx insn, rtx *operands)
+{
+ arm_process_output_sync_insn (arm_emit, insn, operands);
+ return "";
+}
+
+/* Count the number of machine instructions that will be emitted for a
+ synchronization instruction. Note that the emitter used does not
+ emit instructions; it just counts them, being careful not to count
+ labels. */
+unsigned int
+arm_sync_loop_insns (rtx insn, rtx *operands)
+{
+ arm_insn_count = 0;
+ arm_process_output_sync_insn (arm_count, insn, operands);
+ return arm_insn_count;
+}
+
+/* Helper to call a target sync instruction generator, dealing with
+ the variation in operands required by the different generators. */
+static rtx
+arm_call_generator (struct arm_sync_generator *generator, rtx old_value,
+ rtx memory, rtx required_value, rtx new_value)
+{
+ switch (generator->op)
+ {
+ case arm_sync_generator_omn:
+ gcc_assert (! required_value);
+ return generator->u.omn (old_value, memory, new_value);
+
+ case arm_sync_generator_omrn:
+ gcc_assert (required_value);
+ return generator->u.omrn (old_value, memory, required_value, new_value);
+ }
+
+ return NULL;
+}
+
+/* Expand a synchronization loop. The synchronization loop is expanded
+ as an opaque block of instructions in order to ensure that we do
+ not subsequently get extraneous memory accesses inserted within the
+ critical region. The exclusive access property of ldrex/strex is
+ only guaranteed if there are no intervening memory accesses. */
+void
+arm_expand_sync (enum machine_mode mode,
+ struct arm_sync_generator *generator,
+ rtx target, rtx memory, rtx required_value, rtx new_value)
+{
+ if (target == NULL)
+ target = gen_reg_rtx (mode);
+
+ memory = arm_legitimize_sync_memory (memory);
+ if (mode != SImode)
+ {
+ rtx load_temp = gen_reg_rtx (SImode);
+
+ if (required_value)
+ required_value = convert_modes (SImode, mode, required_value, true);
+
+ new_value = convert_modes (SImode, mode, new_value, true);
+ emit_insn (arm_call_generator (generator, load_temp, memory,
+ required_value, new_value));
+ emit_move_insn (target, gen_lowpart (mode, load_temp));
+ }
+ else
+ {
+ emit_insn (arm_call_generator (generator, target, memory, required_value,
+ new_value));
+ }
+}
+
#include "gt-arm.h"