X-Git-Url: http://git.sourceforge.jp/view?p=pf3gnuchains%2Fgcc-fork.git;a=blobdiff_plain;f=gcc%2Fconfig%2Farm%2Farm.c;h=440995ff420d6bac81996a18005ee46408bb07f0;hp=9f004167c04a35d0715ed32b8835b0cee1fdcbbc;hb=48b221d174d58a0c683fb8a838591d8bf28b0530;hpb=654d23578eb8fef8c9c6e6f41211b9318b9f96e5 diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 9f004167c04..440995ff420 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -56,6 +56,7 @@ #include "df.h" #include "intl.h" #include "libfuncs.h" +#include "params.h" /* Forward definitions of types. */ typedef struct minipool_node Mnode; @@ -163,6 +164,10 @@ static void emit_constant_insn (rtx cond, rtx pattern); static rtx emit_set_insn (rtx, rtx); static int arm_arg_partial_bytes (CUMULATIVE_ARGS *, enum machine_mode, tree, bool); +static rtx arm_function_arg (CUMULATIVE_ARGS *, enum machine_mode, + const_tree, bool); +static void arm_function_arg_advance (CUMULATIVE_ARGS *, enum machine_mode, + const_tree, bool); static rtx aapcs_allocate_return_reg (enum machine_mode, const_tree, const_tree); static int aapcs_select_return_coproc (const_tree, const_tree); @@ -227,6 +232,8 @@ static void arm_asm_trampoline_template (FILE *); static void arm_trampoline_init (rtx, tree, rtx); static rtx arm_trampoline_adjust_address (rtx); static rtx arm_pic_static_addr (rtx orig, rtx reg); +static bool cortex_a9_sched_adjust_cost (rtx, rtx, rtx, int *); +static bool xscale_sched_adjust_cost (rtx, rtx, rtx, int *); /* Table of machine attributes. */ @@ -374,6 +381,10 @@ static const struct attribute_spec arm_attribute_table[] = #define TARGET_PASS_BY_REFERENCE arm_pass_by_reference #undef TARGET_ARG_PARTIAL_BYTES #define TARGET_ARG_PARTIAL_BYTES arm_arg_partial_bytes +#undef TARGET_FUNCTION_ARG +#define TARGET_FUNCTION_ARG arm_function_arg +#undef TARGET_FUNCTION_ARG_ADVANCE +#define TARGET_FUNCTION_ARG_ADVANCE arm_function_arg_advance #undef TARGET_SETUP_INCOMING_VARARGS #define TARGET_SETUP_INCOMING_VARARGS arm_setup_incoming_varargs @@ -590,6 +601,7 @@ static int thumb_call_reg_needed; #define FL_NEON (1 << 20) /* Neon instructions. */ #define FL_ARCH7EM (1 << 21) /* Instructions present in the ARMv7E-M architecture. */ +#define FL_ARCH7 (1 << 22) /* Architecture 7. */ #define FL_IWMMXT (1 << 29) /* XScale v2 or "Intel Wireless MMX technology". */ @@ -614,7 +626,7 @@ static int thumb_call_reg_needed; #define FL_FOR_ARCH6ZK FL_FOR_ARCH6K #define FL_FOR_ARCH6T2 (FL_FOR_ARCH6 | FL_THUMB2) #define FL_FOR_ARCH6M (FL_FOR_ARCH6 & ~FL_NOTM) -#define FL_FOR_ARCH7 (FL_FOR_ARCH6T2 &~ FL_NOTM) +#define FL_FOR_ARCH7 ((FL_FOR_ARCH6T2 & ~FL_NOTM) | FL_ARCH7) #define FL_FOR_ARCH7A (FL_FOR_ARCH7 | FL_NOTM | FL_ARCH6K) #define FL_FOR_ARCH7R (FL_FOR_ARCH7A | FL_DIV) #define FL_FOR_ARCH7M (FL_FOR_ARCH7 | FL_DIV) @@ -652,6 +664,9 @@ int arm_arch6 = 0; /* Nonzero if this chip supports the ARM 6K extensions. */ int arm_arch6k = 0; +/* Nonzero if this chip supports the ARM 7 extensions. */ +int arm_arch7 = 0; + /* Nonzero if instructions not present in the 'M' profile can be used. */ int arm_arch_notm = 0; @@ -686,6 +701,9 @@ int arm_tune_cortex_a9 = 0; /* Nonzero if generating Thumb instructions. */ int thumb_code = 0; +/* Nonzero if generating Thumb-1 instructions. */ +int thumb1_code = 0; + /* Nonzero if we should define __THUMB_INTERWORK__ in the preprocessor. XXX This is a bit of a hack, it's intended to help work around @@ -711,12 +729,13 @@ unsigned arm_pic_register = INVALID_REGNUM; the next function. 
*/ static int after_arm_reorg = 0; -static enum arm_pcs arm_pcs_default; +enum arm_pcs arm_pcs_default; /* For an explanation of these variables, see final_prescan_insn below. */ int arm_ccfsm_state; /* arm_current_cc is also used for Thumb-2 cond_exec blocks. */ enum arm_cond_code arm_current_cc; + rtx arm_target_insn; int arm_target_label; /* The number of conditionally executed insns, including the current insn. */ @@ -734,6 +753,12 @@ static const char * const arm_condition_codes[] = "hi", "ls", "ge", "lt", "gt", "le", "al", "nv" }; +/* The register numbers in sequence, for passing to arm_gen_load_multiple. */ +int arm_regs_in_sequence[] = +{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +}; + #define ARM_LSL_NAME (TARGET_UNIFIED_ASM ? "lsl" : "asl") #define streq(string1, string2) (strcmp (string1, string2) == 0) @@ -755,27 +780,39 @@ struct processors const struct tune_params arm_slowmul_tune = { arm_slowmul_rtx_costs, + NULL, 3 }; const struct tune_params arm_fastmul_tune = { arm_fastmul_rtx_costs, + NULL, 1 }; const struct tune_params arm_xscale_tune = { arm_xscale_rtx_costs, + xscale_sched_adjust_cost, 2 }; const struct tune_params arm_9e_tune = { arm_9e_rtx_costs, + NULL, 1 }; +const struct tune_params arm_cortex_a9_tune = +{ + arm_9e_rtx_costs, + cortex_a9_sched_adjust_cost, + 1 +}; + + /* Not all of these give usefully different compilation alternatives, but there is no simple way of generalizing them. */ static const struct processors all_cores[] = @@ -1374,7 +1411,7 @@ arm_override_options (void) arm_selected_cpu = &all_cores[SUBTARGET_CPU_DEFAULT]; #endif /* Default to ARM6. */ - if (arm_selected_cpu->name) + if (!arm_selected_cpu->name) arm_selected_cpu = &all_cores[arm6]; } @@ -1516,7 +1553,7 @@ arm_override_options (void) /* Callee super interworking implies thumb interworking. Adding this to the flags here simplifies the logic elsewhere. */ if (TARGET_THUMB && TARGET_CALLEE_INTERWORKING) - target_flags |= MASK_INTERWORK; + target_flags |= MASK_INTERWORK; /* TARGET_BACKTRACE calls leaf_function_p, which causes a crash if done from here where no function is being compiled currently. 
*/ @@ -1526,9 +1563,6 @@ arm_override_options (void) if (TARGET_ARM && TARGET_CALLEE_INTERWORKING) warning (0, "enabling callee interworking support is only meaningful when compiling for the Thumb"); - if (TARGET_ARM && TARGET_CALLER_INTERWORKING) - warning (0, "enabling caller interworking support is only meaningful when compiling for the Thumb"); - if (TARGET_APCS_STACK && !TARGET_APCS_FRAME) { warning (0, "-mapcs-stack-check incompatible with -mno-apcs-frame"); @@ -1564,6 +1598,7 @@ arm_override_options (void) arm_arch6 = (insn_flags & FL_ARCH6) != 0; arm_arch6k = (insn_flags & FL_ARCH6K) != 0; arm_arch_notm = (insn_flags & FL_NOTM) != 0; + arm_arch7 = (insn_flags & FL_ARCH7) != 0; arm_arch7em = (insn_flags & FL_ARCH7EM) != 0; arm_arch_thumb2 = (insn_flags & FL_THUMB2) != 0; arm_arch_xscale = (insn_flags & FL_XSCALE) != 0; @@ -1571,7 +1606,8 @@ arm_override_options (void) arm_ld_sched = (tune_flags & FL_LDSCHED) != 0; arm_tune_strongarm = (tune_flags & FL_STRONG) != 0; - thumb_code = (TARGET_ARM == 0); + thumb_code = TARGET_ARM == 0; + thumb1_code = TARGET_THUMB1 != 0; arm_tune_wbuf = (tune_flags & FL_WBUF) != 0; arm_tune_xscale = (tune_flags & FL_XSCALE) != 0; arm_arch_iwmmxt = (insn_flags & FL_IWMMXT) != 0; @@ -1872,6 +1908,14 @@ arm_override_options (void) flag_reorder_blocks = 1; } + if (!PARAM_SET_P (PARAM_GCSE_UNRESTRICTED_COST) + && flag_pic) + /* Hoisting PIC address calculations more aggressively provides a small, + but measurable, size reduction for PIC code. Therefore, we decrease + the bar for unrestricted expression hoisting to the cost of PIC address + calculation, which is 2 instructions. */ + set_param_value ("gcse-unrestricted-cost", 2); + /* Register global variables with the garbage collector. */ arm_add_gc_roots (); } @@ -3683,9 +3727,7 @@ arm_get_pcs_model (const_tree type, const_tree decl) /* Detect varargs functions. These always use the base rules (no argument is ever a candidate for a co-processor register). */ - bool base_rules = (TYPE_ARG_TYPES (type) != 0 - && (TREE_VALUE (tree_last (TYPE_ARG_TYPES (type))) - != void_type_node)); + bool base_rules = stdarg_p (type); if (user_convention) { @@ -4144,7 +4186,7 @@ static struct static int aapcs_select_call_coproc (CUMULATIVE_ARGS *pcum, enum machine_mode mode, - tree type) + const_tree type) { int i; @@ -4256,7 +4298,7 @@ aapcs_libcall_value (enum machine_mode mode) numbers referred to here are those in the AAPCS. */ static void aapcs_layout_arg (CUMULATIVE_ARGS *pcum, enum machine_mode mode, - tree type, int named) + const_tree type, bool named) { int nregs, nregs2; int ncrn; @@ -4421,7 +4463,7 @@ arm_init_cumulative_args (CUMULATIVE_ARGS *pcum, tree fntype, /* Return true if mode/type need doubleword alignment. */ bool -arm_needs_doubleword_align (enum machine_mode mode, tree type) +arm_needs_doubleword_align (enum machine_mode mode, const_tree type) { return (GET_MODE_ALIGNMENT (mode) > PARM_BOUNDARY || (type && TYPE_ALIGN (type) > PARM_BOUNDARY)); @@ -4439,11 +4481,17 @@ arm_needs_doubleword_align (enum machine_mode mode, tree type) CUM is a variable of type CUMULATIVE_ARGS which gives info about the preceding args and about the function being called. NAMED is nonzero if this argument is a named parameter - (otherwise it is an extra parameter matching an ellipsis). */ + (otherwise it is an extra parameter matching an ellipsis). -rtx + On the ARM, normally the first 16 bytes are passed in registers r0-r3; all + other arguments are passed on the stack. 
If (NAMED == 0) (which happens + only in assign_parms, since TARGET_SETUP_INCOMING_VARARGS is + defined), say it is passed in the stack (function_prologue will + indeed make it pass in the stack if necessary). */ + +static rtx arm_function_arg (CUMULATIVE_ARGS *pcum, enum machine_mode mode, - tree type, int named) + const_tree type, bool named) { int nregs; @@ -4479,10 +4527,6 @@ arm_function_arg (CUMULATIVE_ARGS *pcum, enum machine_mode mode, && arm_needs_doubleword_align (mode, type)) pcum->nregs++; - if (mode == VOIDmode) - /* Pick an arbitrary value for operand 2 of the call insn. */ - return const0_rtx; - /* Only allow splitting an arg between regs and memory if all preceding args were allocated to regs. For args passed by reference we only count the reference pointer. */ @@ -4520,9 +4564,13 @@ arm_arg_partial_bytes (CUMULATIVE_ARGS *pcum, enum machine_mode mode, return 0; } -void +/* Update the data in PCUM to advance over an argument + of mode MODE and data type TYPE. + (TYPE is null for libcalls where that information may not be available.) */ + +static void arm_function_arg_advance (CUMULATIVE_ARGS *pcum, enum machine_mode mode, - tree type, bool named) + const_tree type, bool named) { if (pcum->pcs_variant <= ARM_PCS_AAPCS_LOCAL) { @@ -6288,9 +6336,6 @@ arm_cannot_force_const_mem (rtx x) #define REG_OR_SUBREG_RTX(X) \ (GET_CODE (X) == REG ? (X) : SUBREG_REG (X)) -#ifndef COSTS_N_INSNS -#define COSTS_N_INSNS(N) ((N) * 4 - 2) -#endif static inline int thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer) { @@ -6633,12 +6678,10 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed) since then they might not be moved outside of loops. As a compromise we allow integration with ops that have a constant as their second operand. */ - if ((REG_OR_SUBREG_REG (XEXP (x, 0)) - && ARM_FRAME_RTX (REG_OR_SUBREG_RTX (XEXP (x, 0))) - && GET_CODE (XEXP (x, 1)) != CONST_INT) - || (REG_OR_SUBREG_REG (XEXP (x, 0)) - && ARM_FRAME_RTX (REG_OR_SUBREG_RTX (XEXP (x, 0))))) - *total = 4; + if (REG_OR_SUBREG_REG (XEXP (x, 0)) + && ARM_FRAME_RTX (REG_OR_SUBREG_RTX (XEXP (x, 0))) + && GET_CODE (XEXP (x, 1)) != CONST_INT) + *total = COSTS_N_INSNS (1); if (mode == DImode) { @@ -7681,15 +7724,13 @@ arm_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED) return TARGET_32BIT ? arm_arm_address_cost (x) : arm_thumb_address_cost (x); } -static int -arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost) +/* Adjust cost hook for XScale. */ +static bool +xscale_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost) { - rtx i_pat, d_pat; - /* Some true dependencies can have a higher cost depending on precisely how certain input operands are used. */ - if (arm_tune_xscale - && REG_NOTE_KIND (link) == 0 + if (REG_NOTE_KIND(link) == 0 && recog_memoized (insn) >= 0 && recog_memoized (dep) >= 0) { @@ -7723,10 +7764,116 @@ arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost) if (reg_overlap_mentioned_p (recog_data.operand[opno], shifted_operand)) - return 2; + { + *cost = 2; + return false; + } } } } + return true; +} + +/* Adjust cost hook for Cortex A9. 
*/ +static bool +cortex_a9_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost) +{ + switch (REG_NOTE_KIND (link)) + { + case REG_DEP_ANTI: + *cost = 0; + return false; + + case REG_DEP_TRUE: + case REG_DEP_OUTPUT: + if (recog_memoized (insn) >= 0 + && recog_memoized (dep) >= 0) + { + if (GET_CODE (PATTERN (insn)) == SET) + { + if (GET_MODE_CLASS + (GET_MODE (SET_DEST (PATTERN (insn)))) == MODE_FLOAT + || GET_MODE_CLASS + (GET_MODE (SET_SRC (PATTERN (insn)))) == MODE_FLOAT) + { + enum attr_type attr_type_insn = get_attr_type (insn); + enum attr_type attr_type_dep = get_attr_type (dep); + + /* By default all dependencies of the form + s0 = s0 s1 + s0 = s0 s2 + have an extra latency of 1 cycle because + of the input and output dependency in this + case. However this gets modeled as an true + dependency and hence all these checks. */ + if (REG_P (SET_DEST (PATTERN (insn))) + && REG_P (SET_DEST (PATTERN (dep))) + && reg_overlap_mentioned_p (SET_DEST (PATTERN (insn)), + SET_DEST (PATTERN (dep)))) + { + /* FMACS is a special case where the dependant + instruction can be issued 3 cycles before + the normal latency in case of an output + dependency. */ + if ((attr_type_insn == TYPE_FMACS + || attr_type_insn == TYPE_FMACD) + && (attr_type_dep == TYPE_FMACS + || attr_type_dep == TYPE_FMACD)) + { + if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT) + *cost = insn_default_latency (dep) - 3; + else + *cost = insn_default_latency (dep); + return false; + } + else + { + if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT) + *cost = insn_default_latency (dep) + 1; + else + *cost = insn_default_latency (dep); + } + return false; + } + } + } + } + break; + + default: + gcc_unreachable (); + } + + return true; +} + +/* This function implements the target macro TARGET_SCHED_ADJUST_COST. + It corrects the value of COST based on the relationship between + INSN and DEP through the dependence LINK. It returns the new + value. There is a per-core adjust_cost hook to adjust scheduler costs + and the per-core hook can choose to completely override the generic + adjust_cost function. Only put bits of code into arm_adjust_cost that + are common across all cores. */ +static int +arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost) +{ + rtx i_pat, d_pat; + + /* When generating Thumb-1 code, we want to place flag-setting operations + close to a conditional branch which depends on them, so that we can + omit the comparison. */ + if (TARGET_THUMB1 + && REG_NOTE_KIND (link) == 0 + && recog_memoized (insn) == CODE_FOR_cbranchsi4_insn + && recog_memoized (dep) >= 0 + && get_attr_conds (dep) == CONDS_SET) + return 0; + + if (current_tune->sched_adjust_cost != NULL) + { + if (!current_tune->sched_adjust_cost (insn, link, dep, &cost)) + return cost; + } /* XXX This is not strictly true for the FPA. */ if (REG_NOTE_KIND (link) == REG_DEP_ANTI @@ -7749,7 +7896,8 @@ arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost) constant pool are cached, and that others will miss. This is a hack. 
*/ - if ((GET_CODE (src_mem) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (src_mem)) + if ((GET_CODE (src_mem) == SYMBOL_REF + && CONSTANT_POOL_ADDRESS_P (src_mem)) || reg_mentioned_p (stack_pointer_rtx, src_mem) || reg_mentioned_p (frame_pointer_rtx, src_mem) || reg_mentioned_p (hard_frame_pointer_rtx, src_mem)) @@ -9161,6 +9309,36 @@ multiple_operation_profitable_p (bool is_store ATTRIBUTE_UNUSED, if (nops == 2 && arm_ld_sched && add_offset != 0) return false; + /* XScale has load-store double instructions, but they have stricter + alignment requirements than load-store multiple, so we cannot + use them. + + For XScale ldm requires 2 + NREGS cycles to complete and blocks + the pipeline until completion. + + NREGS CYCLES + 1 3 + 2 4 + 3 5 + 4 6 + + An ldr instruction takes 1-3 cycles, but does not block the + pipeline. + + NREGS CYCLES + 1 1-3 + 2 2-6 + 3 3-9 + 4 4-12 + + Best case ldr will always win. However, the more ldr instructions + we issue, the less likely we are to be able to schedule them well. + Using ldr instructions also increases code size. + + As a compromise, we use ldr for counts of 1 or 2 regs, and ldm + for counts of 3 or 4 regs. */ + if (nops <= 2 && arm_tune_xscale && !optimize_size) + return false; return true; } @@ -9204,13 +9382,29 @@ compute_offset_order (int nops, HOST_WIDE_INT *unsorted_offsets, int *order, return true; } -int -load_multiple_sequence (rtx *operands, int nops, int *regs, int *base, - HOST_WIDE_INT *load_offset) +/* Used to determine in a peephole whether a sequence of load + instructions can be changed into a load-multiple instruction. + NOPS is the number of separate load instructions we are examining. The + first NOPS entries in OPERANDS are the destination registers, the + next NOPS entries are memory operands. If this function is + successful, *BASE is set to the common base register of the memory + accesses; *LOAD_OFFSET is set to the first memory location's offset + from that base register. + REGS is an array filled in with the destination register numbers. + SAVED_ORDER (if nonnull), is an array filled in with an order that maps + insn numbers to to an ascending order of stores. If CHECK_REGS is true, + the sequence of registers in REGS matches the loads from ascending memory + locations, and the function verifies that the register numbers are + themselves ascending. If CHECK_REGS is false, the register numbers + are stored in the order they are found in the operands. */ +static int +load_multiple_sequence (rtx *operands, int nops, int *regs, int *saved_order, + int *base, HOST_WIDE_INT *load_offset, bool check_regs) { int unsorted_regs[MAX_LDM_STM_OPS]; HOST_WIDE_INT unsorted_offsets[MAX_LDM_STM_OPS]; int order[MAX_LDM_STM_OPS]; + rtx base_reg_rtx = NULL; int base_reg = -1; int i, ldm_case; @@ -9254,13 +9448,16 @@ load_multiple_sequence (rtx *operands, int nops, int *regs, int *base, == CONST_INT))) { if (i == 0) - base_reg = REGNO (reg); - else { - if (base_reg != (int) REGNO (reg)) - /* Not addressed from the same base register. */ + base_reg = REGNO (reg); + base_reg_rtx = reg; + if (TARGET_THUMB1 && base_reg > LAST_LO_REGNUM) return 0; } + else if (base_reg != (int) REGNO (reg)) + /* Not addressed from the same base register. */ + return 0; + unsorted_regs[i] = (GET_CODE (operands[i]) == REG ? 
REGNO (operands[i]) : REGNO (SUBREG_REG (operands[i]))); @@ -9268,7 +9465,9 @@ load_multiple_sequence (rtx *operands, int nops, int *regs, int *base, /* If it isn't an integer register, or if it overwrites the base register but isn't the last insn in the list, then we can't do this. */ - if (unsorted_regs[i] < 0 || unsorted_regs[i] > 14 + if (unsorted_regs[i] < 0 + || (TARGET_THUMB1 && unsorted_regs[i] > LAST_LO_REGNUM) + || unsorted_regs[i] > 14 || (i != nops - 1 && unsorted_regs[i] == base_reg)) return 0; @@ -9286,26 +9485,34 @@ load_multiple_sequence (rtx *operands, int nops, int *regs, int *base, order[0] has been set to the lowest offset in the list. Sort the offsets into order, verifying that they are adjacent, and check that the register numbers are ascending. */ - if (!compute_offset_order (nops, unsorted_offsets, order, unsorted_regs)) + if (!compute_offset_order (nops, unsorted_offsets, order, + check_regs ? unsorted_regs : NULL)) return 0; + if (saved_order) + memcpy (saved_order, order, sizeof order); + if (base) { *base = base_reg; for (i = 0; i < nops; i++) - regs[i] = unsorted_regs[order[i]]; + regs[i] = unsorted_regs[check_regs ? order[i] : i]; *load_offset = unsorted_offsets[order[0]]; } + if (TARGET_THUMB1 + && !peep2_reg_dead_p (nops, base_reg_rtx)) + return 0; + if (unsorted_offsets[order[0]] == 0) ldm_case = 1; /* ldmia */ else if (TARGET_ARM && unsorted_offsets[order[0]] == 4) ldm_case = 2; /* ldmib */ else if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0) ldm_case = 3; /* ldmda */ - else if (unsorted_offsets[order[nops - 1]] == -4) + else if (TARGET_32BIT && unsorted_offsets[order[nops - 1]] == -4) ldm_case = 4; /* ldmdb */ else if (const_ok_for_arm (unsorted_offsets[order[0]]) || const_ok_for_arm (-unsorted_offsets[order[0]])) @@ -9321,72 +9528,34 @@ load_multiple_sequence (rtx *operands, int nops, int *regs, int *base, return ldm_case; } -const char * -emit_ldm_seq (rtx *operands, int nops) -{ - int regs[MAX_LDM_STM_OPS]; - int base_reg; - HOST_WIDE_INT offset; - char buf[100]; - int i; - - switch (load_multiple_sequence (operands, nops, regs, &base_reg, &offset)) - { - case 1: - strcpy (buf, "ldm%(ia%)\t"); - break; - - case 2: - strcpy (buf, "ldm%(ib%)\t"); - break; - - case 3: - strcpy (buf, "ldm%(da%)\t"); - break; - - case 4: - strcpy (buf, "ldm%(db%)\t"); - break; - - case 5: - if (offset >= 0) - sprintf (buf, "add%%?\t%s%s, %s%s, #%ld", REGISTER_PREFIX, - reg_names[regs[0]], REGISTER_PREFIX, reg_names[base_reg], - (long) offset); - else - sprintf (buf, "sub%%?\t%s%s, %s%s, #%ld", REGISTER_PREFIX, - reg_names[regs[0]], REGISTER_PREFIX, reg_names[base_reg], - (long) -offset); - output_asm_insn (buf, operands); - base_reg = regs[0]; - strcpy (buf, "ldm%(ia%)\t"); - break; - - default: - gcc_unreachable (); - } - - sprintf (buf + strlen (buf), "%s%s, {%s%s", REGISTER_PREFIX, - reg_names[base_reg], REGISTER_PREFIX, reg_names[regs[0]]); - - for (i = 1; i < nops; i++) - sprintf (buf + strlen (buf), ", %s%s", REGISTER_PREFIX, - reg_names[regs[i]]); - - strcat (buf, "}\t%@ phole ldm"); - - output_asm_insn (buf, operands); - return ""; -} - -int -store_multiple_sequence (rtx *operands, int nops, int *regs, int *base, - HOST_WIDE_INT * load_offset) +/* Used to determine in a peephole whether a sequence of store instructions can + be changed into a store-multiple instruction. + NOPS is the number of separate store instructions we are examining. + NOPS_TOTAL is the total number of instructions recognized by the peephole + pattern. 
+ The first NOPS entries in OPERANDS are the source registers, the next + NOPS entries are memory operands. If this function is successful, *BASE is + set to the common base register of the memory accesses; *LOAD_OFFSET is set + to the first memory location's offset from that base register. REGS is an + array filled in with the source register numbers, REG_RTXS (if nonnull) is + likewise filled with the corresponding rtx's. + SAVED_ORDER (if nonnull), is an array filled in with an order that maps insn + numbers to to an ascending order of stores. + If CHECK_REGS is true, the sequence of registers in *REGS matches the stores + from ascending memory locations, and the function verifies that the register + numbers are themselves ascending. If CHECK_REGS is false, the register + numbers are stored in the order they are found in the operands. */ +static int +store_multiple_sequence (rtx *operands, int nops, int nops_total, + int *regs, rtx *reg_rtxs, int *saved_order, int *base, + HOST_WIDE_INT *load_offset, bool check_regs) { int unsorted_regs[MAX_LDM_STM_OPS]; + rtx unsorted_reg_rtxs[MAX_LDM_STM_OPS]; HOST_WIDE_INT unsorted_offsets[MAX_LDM_STM_OPS]; int order[MAX_LDM_STM_OPS]; int base_reg = -1; + rtx base_reg_rtx = NULL; int i, stm_case; /* Can only handle up to MAX_LDM_STM_OPS insns at present, though could be @@ -9428,17 +9597,27 @@ store_multiple_sequence (rtx *operands, int nops, int *regs, int *base, && (GET_CODE (offset = XEXP (XEXP (operands[nops + i], 0), 1)) == CONST_INT))) { - unsorted_regs[i] = (GET_CODE (operands[i]) == REG - ? REGNO (operands[i]) - : REGNO (SUBREG_REG (operands[i]))); + unsorted_reg_rtxs[i] = (GET_CODE (operands[i]) == REG + ? operands[i] : SUBREG_REG (operands[i])); + unsorted_regs[i] = REGNO (unsorted_reg_rtxs[i]); + if (i == 0) - base_reg = REGNO (reg); + { + base_reg = REGNO (reg); + base_reg_rtx = reg; + if (TARGET_THUMB1 && base_reg > LAST_LO_REGNUM) + return 0; + } else if (base_reg != (int) REGNO (reg)) /* Not addressed from the same base register. */ return 0; /* If it isn't an integer register, then we can't do this. */ - if (unsorted_regs[i] < 0 || unsorted_regs[i] > 14) + if (unsorted_regs[i] < 0 + || (TARGET_THUMB1 && unsorted_regs[i] > LAST_LO_REGNUM) + || (TARGET_THUMB2 && unsorted_regs[i] == base_reg) + || (TARGET_THUMB2 && unsorted_regs[i] == SP_REGNUM) + || unsorted_regs[i] > 14) return 0; unsorted_offsets[i] = INTVAL (offset); @@ -9455,26 +9634,38 @@ store_multiple_sequence (rtx *operands, int nops, int *regs, int *base, order[0] has been set to the lowest offset in the list. Sort the offsets into order, verifying that they are adjacent, and check that the register numbers are ascending. */ - if (!compute_offset_order (nops, unsorted_offsets, order, unsorted_regs)) + if (!compute_offset_order (nops, unsorted_offsets, order, + check_regs ? unsorted_regs : NULL)) return 0; + if (saved_order) + memcpy (saved_order, order, sizeof order); + if (base) { *base = base_reg; for (i = 0; i < nops; i++) - regs[i] = unsorted_regs[order[i]]; + { + regs[i] = unsorted_regs[check_regs ? order[i] : i]; + if (reg_rtxs) + reg_rtxs[i] = unsorted_reg_rtxs[check_regs ? 
order[i] : i]; + } *load_offset = unsorted_offsets[order[0]]; } + if (TARGET_THUMB1 + && !peep2_reg_dead_p (nops_total, base_reg_rtx)) + return 0; + if (unsorted_offsets[order[0]] == 0) stm_case = 1; /* stmia */ else if (TARGET_ARM && unsorted_offsets[order[0]] == 4) stm_case = 2; /* stmib */ else if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0) stm_case = 3; /* stmda */ - else if (unsorted_offsets[order[nops - 1]] == -4) + else if (TARGET_32BIT && unsorted_offsets[order[nops - 1]] == -4) stm_case = 4; /* stmdb */ else return 0; @@ -9484,204 +9675,406 @@ store_multiple_sequence (rtx *operands, int nops, int *regs, int *base, return stm_case; } + +/* Routines for use in generating RTL. */ -const char * -emit_stm_seq (rtx *operands, int nops) +/* Generate a load-multiple instruction. COUNT is the number of loads in + the instruction; REGS and MEMS are arrays containing the operands. + BASEREG is the base register to be used in addressing the memory operands. + WBACK_OFFSET is nonzero if the instruction should update the base + register. */ + +static rtx +arm_gen_load_multiple_1 (int count, int *regs, rtx *mems, rtx basereg, + HOST_WIDE_INT wback_offset) { - int regs[MAX_LDM_STM_OPS]; - int base_reg; - HOST_WIDE_INT offset; - char buf[100]; - int i; + int i = 0, j; + rtx result; - switch (store_multiple_sequence (operands, nops, regs, &base_reg, &offset)) + if (!multiple_operation_profitable_p (false, count, 0)) { - case 1: - strcpy (buf, "stm%(ia%)\t"); - break; + rtx seq; - case 2: - strcpy (buf, "stm%(ib%)\t"); - break; + start_sequence (); - case 3: - strcpy (buf, "stm%(da%)\t"); - break; + for (i = 0; i < count; i++) + emit_move_insn (gen_rtx_REG (SImode, regs[i]), mems[i]); - case 4: - strcpy (buf, "stm%(db%)\t"); - break; + if (wback_offset != 0) + emit_move_insn (basereg, plus_constant (basereg, wback_offset)); - default: - gcc_unreachable (); - } + seq = get_insns (); + end_sequence (); - sprintf (buf + strlen (buf), "%s%s, {%s%s", REGISTER_PREFIX, - reg_names[base_reg], REGISTER_PREFIX, reg_names[regs[0]]); + return seq; + } - for (i = 1; i < nops; i++) - sprintf (buf + strlen (buf), ", %s%s", REGISTER_PREFIX, - reg_names[regs[i]]); + result = gen_rtx_PARALLEL (VOIDmode, + rtvec_alloc (count + (wback_offset != 0 ? 1 : 0))); + if (wback_offset != 0) + { + XVECEXP (result, 0, 0) + = gen_rtx_SET (VOIDmode, basereg, + plus_constant (basereg, wback_offset)); + i = 1; + count++; + } - strcat (buf, "}\t%@ phole stm"); + for (j = 0; i < count; i++, j++) + XVECEXP (result, 0, i) + = gen_rtx_SET (VOIDmode, gen_rtx_REG (SImode, regs[j]), mems[j]); - output_asm_insn (buf, operands); - return ""; + return result; } - -/* Routines for use in generating RTL. */ -rtx -arm_gen_load_multiple (int base_regno, int count, rtx from, int up, - int write_back, rtx basemem, HOST_WIDE_INT *offsetp) +/* Generate a store-multiple instruction. COUNT is the number of stores in + the instruction; REGS and MEMS are arrays containing the operands. + BASEREG is the base register to be used in addressing the memory operands. + WBACK_OFFSET is nonzero if the instruction should update the base + register. */ + +static rtx +arm_gen_store_multiple_1 (int count, int *regs, rtx *mems, rtx basereg, + HOST_WIDE_INT wback_offset) { - HOST_WIDE_INT offset = *offsetp; int i = 0, j; rtx result; - int sign = up ? 1 : -1; - rtx mem, addr; - /* XScale has load-store double instructions, but they have stricter - alignment requirements than load-store multiple, so we cannot - use them. 
+ if (GET_CODE (basereg) == PLUS) + basereg = XEXP (basereg, 0); - For XScale ldm requires 2 + NREGS cycles to complete and blocks - the pipeline until completion. + if (!multiple_operation_profitable_p (false, count, 0)) + { + rtx seq; - NREGS CYCLES - 1 3 - 2 4 - 3 5 - 4 6 + start_sequence (); - An ldr instruction takes 1-3 cycles, but does not block the - pipeline. + for (i = 0; i < count; i++) + emit_move_insn (mems[i], gen_rtx_REG (SImode, regs[i])); - NREGS CYCLES - 1 1-3 - 2 2-6 - 3 3-9 - 4 4-12 + if (wback_offset != 0) + emit_move_insn (basereg, plus_constant (basereg, wback_offset)); - Best case ldr will always win. However, the more ldr instructions - we issue, the less likely we are to be able to schedule them well. - Using ldr instructions also increases code size. + seq = get_insns (); + end_sequence (); - As a compromise, we use ldr for counts of 1 or 2 regs, and ldm - for counts of 3 or 4 regs. */ - if (arm_tune_xscale && count <= 2 && ! optimize_size) + return seq; + } + + result = gen_rtx_PARALLEL (VOIDmode, + rtvec_alloc (count + (wback_offset != 0 ? 1 : 0))); + if (wback_offset != 0) { - rtx seq; + XVECEXP (result, 0, 0) + = gen_rtx_SET (VOIDmode, basereg, + plus_constant (basereg, wback_offset)); + i = 1; + count++; + } - start_sequence (); + for (j = 0; i < count; i++, j++) + XVECEXP (result, 0, i) + = gen_rtx_SET (VOIDmode, mems[j], gen_rtx_REG (SImode, regs[j])); - for (i = 0; i < count; i++) + return result; +} + +/* Generate either a load-multiple or a store-multiple instruction. This + function can be used in situations where we can start with a single MEM + rtx and adjust its address upwards. + COUNT is the number of operations in the instruction, not counting a + possible update of the base register. REGS is an array containing the + register operands. + BASEREG is the base register to be used in addressing the memory operands, + which are constructed from BASEMEM. + WRITE_BACK specifies whether the generated instruction should include an + update of the base register. + OFFSETP is used to pass an offset to and from this function; this offset + is not used when constructing the address (instead BASEMEM should have an + appropriate offset in its address), it is used only for setting + MEM_OFFSET. It is updated only if WRITE_BACK is true.*/ + +static rtx +arm_gen_multiple_op (bool is_load, int *regs, int count, rtx basereg, + bool write_back, rtx basemem, HOST_WIDE_INT *offsetp) +{ + rtx mems[MAX_LDM_STM_OPS]; + HOST_WIDE_INT offset = *offsetp; + int i; + + gcc_assert (count <= MAX_LDM_STM_OPS); + + if (GET_CODE (basereg) == PLUS) + basereg = XEXP (basereg, 0); + + for (i = 0; i < count; i++) + { + rtx addr = plus_constant (basereg, i * 4); + mems[i] = adjust_automodify_address_nv (basemem, SImode, addr, offset); + offset += 4; + } + + if (write_back) + *offsetp = offset; + + if (is_load) + return arm_gen_load_multiple_1 (count, regs, mems, basereg, + write_back ? 4 * count : 0); + else + return arm_gen_store_multiple_1 (count, regs, mems, basereg, + write_back ? 
4 * count : 0); +} + +rtx +arm_gen_load_multiple (int *regs, int count, rtx basereg, int write_back, + rtx basemem, HOST_WIDE_INT *offsetp) +{ + return arm_gen_multiple_op (TRUE, regs, count, basereg, write_back, basemem, + offsetp); +} + +rtx +arm_gen_store_multiple (int *regs, int count, rtx basereg, int write_back, + rtx basemem, HOST_WIDE_INT *offsetp) +{ + return arm_gen_multiple_op (FALSE, regs, count, basereg, write_back, basemem, + offsetp); +} + +/* Called from a peephole2 expander to turn a sequence of loads into an + LDM instruction. OPERANDS are the operands found by the peephole matcher; + NOPS indicates how many separate loads we are trying to combine. SORT_REGS + is true if we can reorder the registers because they are used commutatively + subsequently. + Returns true iff we could generate a new instruction. */ + +bool +gen_ldm_seq (rtx *operands, int nops, bool sort_regs) +{ + int regs[MAX_LDM_STM_OPS], mem_order[MAX_LDM_STM_OPS]; + rtx mems[MAX_LDM_STM_OPS]; + int i, j, base_reg; + rtx base_reg_rtx; + HOST_WIDE_INT offset; + int write_back = FALSE; + int ldm_case; + rtx addr; + + ldm_case = load_multiple_sequence (operands, nops, regs, mem_order, + &base_reg, &offset, !sort_regs); + + if (ldm_case == 0) + return false; + + if (sort_regs) + for (i = 0; i < nops - 1; i++) + for (j = i + 1; j < nops; j++) + if (regs[i] > regs[j]) + { + int t = regs[i]; + regs[i] = regs[j]; + regs[j] = t; + } + base_reg_rtx = gen_rtx_REG (Pmode, base_reg); + + if (TARGET_THUMB1) + { + gcc_assert (peep2_reg_dead_p (nops, base_reg_rtx)); + gcc_assert (ldm_case == 1 || ldm_case == 5); + write_back = TRUE; + } + + if (ldm_case == 5) + { + rtx newbase = TARGET_THUMB1 ? base_reg_rtx : gen_rtx_REG (SImode, regs[0]); + emit_insn (gen_addsi3 (newbase, base_reg_rtx, GEN_INT (offset))); + offset = 0; + if (!TARGET_THUMB1) { - addr = plus_constant (from, i * 4 * sign); - mem = adjust_automodify_address (basemem, SImode, addr, offset); - emit_move_insn (gen_rtx_REG (SImode, base_regno + i), mem); - offset += 4 * sign; + base_reg = regs[0]; + base_reg_rtx = newbase; } + } + + for (i = 0; i < nops; i++) + { + addr = plus_constant (base_reg_rtx, offset + i * 4); + mems[i] = adjust_automodify_address_nv (operands[nops + mem_order[i]], + SImode, addr, 0); + } + emit_insn (arm_gen_load_multiple_1 (nops, regs, mems, base_reg_rtx, + write_back ? offset + i * 4 : 0)); + return true; +} + +/* Called from a peephole2 expander to turn a sequence of stores into an + STM instruction. OPERANDS are the operands found by the peephole matcher; + NOPS indicates how many separate stores we are trying to combine. + Returns true iff we could generate a new instruction. */ + +bool +gen_stm_seq (rtx *operands, int nops) +{ + int i; + int regs[MAX_LDM_STM_OPS], mem_order[MAX_LDM_STM_OPS]; + rtx mems[MAX_LDM_STM_OPS]; + int base_reg; + rtx base_reg_rtx; + HOST_WIDE_INT offset; + int write_back = FALSE; + int stm_case; + rtx addr; + bool base_reg_dies; - if (write_back) - { - emit_move_insn (from, plus_constant (from, count * 4 * sign)); - *offsetp = offset; - } + stm_case = store_multiple_sequence (operands, nops, nops, regs, NULL, + mem_order, &base_reg, &offset, true); - seq = get_insns (); - end_sequence (); + if (stm_case == 0) + return false; - return seq; - } + base_reg_rtx = gen_rtx_REG (Pmode, base_reg); - result = gen_rtx_PARALLEL (VOIDmode, - rtvec_alloc (count + (write_back ? 
1 : 0))); - if (write_back) + base_reg_dies = peep2_reg_dead_p (nops, base_reg_rtx); + if (TARGET_THUMB1) { - XVECEXP (result, 0, 0) - = gen_rtx_SET (VOIDmode, from, plus_constant (from, count * 4 * sign)); - i = 1; - count++; + gcc_assert (base_reg_dies); + write_back = TRUE; } - for (j = 0; i < count; i++, j++) + if (stm_case == 5) { - addr = plus_constant (from, j * 4 * sign); - mem = adjust_automodify_address_nv (basemem, SImode, addr, offset); - XVECEXP (result, 0, i) - = gen_rtx_SET (VOIDmode, gen_rtx_REG (SImode, base_regno + j), mem); - offset += 4 * sign; + gcc_assert (base_reg_dies); + emit_insn (gen_addsi3 (base_reg_rtx, base_reg_rtx, GEN_INT (offset))); + offset = 0; } - if (write_back) - *offsetp = offset; + addr = plus_constant (base_reg_rtx, offset); - return result; + for (i = 0; i < nops; i++) + { + addr = plus_constant (base_reg_rtx, offset + i * 4); + mems[i] = adjust_automodify_address_nv (operands[nops + mem_order[i]], + SImode, addr, 0); + } + emit_insn (arm_gen_store_multiple_1 (nops, regs, mems, base_reg_rtx, + write_back ? offset + i * 4 : 0)); + return true; } -rtx -arm_gen_store_multiple (int base_regno, int count, rtx to, int up, - int write_back, rtx basemem, HOST_WIDE_INT *offsetp) +/* Called from a peephole2 expander to turn a sequence of stores that are + preceded by constant loads into an STM instruction. OPERANDS are the + operands found by the peephole matcher; NOPS indicates how many + separate stores we are trying to combine; there are 2 * NOPS + instructions in the peephole. + Returns true iff we could generate a new instruction. */ + +bool +gen_const_stm_seq (rtx *operands, int nops) { - HOST_WIDE_INT offset = *offsetp; - int i = 0, j; - rtx result; - int sign = up ? 1 : -1; - rtx mem, addr; + int regs[MAX_LDM_STM_OPS], sorted_regs[MAX_LDM_STM_OPS]; + int reg_order[MAX_LDM_STM_OPS], mem_order[MAX_LDM_STM_OPS]; + rtx reg_rtxs[MAX_LDM_STM_OPS], orig_reg_rtxs[MAX_LDM_STM_OPS]; + rtx mems[MAX_LDM_STM_OPS]; + int base_reg; + rtx base_reg_rtx; + HOST_WIDE_INT offset; + int write_back = FALSE; + int stm_case; + rtx addr; + bool base_reg_dies; + int i, j; + HARD_REG_SET allocated; - /* See arm_gen_load_multiple for discussion of - the pros/cons of ldm/stm usage for XScale. */ - if (arm_tune_xscale && count <= 2 && ! optimize_size) - { - rtx seq; + stm_case = store_multiple_sequence (operands, nops, 2 * nops, regs, reg_rtxs, + mem_order, &base_reg, &offset, false); - start_sequence (); + if (stm_case == 0) + return false; - for (i = 0; i < count; i++) - { - addr = plus_constant (to, i * 4 * sign); - mem = adjust_automodify_address (basemem, SImode, addr, offset); - emit_move_insn (mem, gen_rtx_REG (SImode, base_regno + i)); - offset += 4 * sign; - } + memcpy (orig_reg_rtxs, reg_rtxs, sizeof orig_reg_rtxs); - if (write_back) - { - emit_move_insn (to, plus_constant (to, count * 4 * sign)); - *offsetp = offset; - } + /* If the same register is used more than once, try to find a free + register. */ + CLEAR_HARD_REG_SET (allocated); + for (i = 0; i < nops; i++) + { + for (j = i + 1; j < nops; j++) + if (regs[i] == regs[j]) + { + rtx t = peep2_find_free_register (0, nops * 2, + TARGET_THUMB1 ? "l" : "r", + SImode, &allocated); + if (t == NULL_RTX) + return false; + reg_rtxs[i] = t; + regs[i] = REGNO (t); + } + } - seq = get_insns (); - end_sequence (); + /* Compute an ordering that maps the register numbers to an ascending + sequence. 
*/ + reg_order[0] = 0; + for (i = 0; i < nops; i++) + if (regs[i] < regs[reg_order[0]]) + reg_order[0] = i; - return seq; + for (i = 1; i < nops; i++) + { + int this_order = reg_order[i - 1]; + for (j = 0; j < nops; j++) + if (regs[j] > regs[reg_order[i - 1]] + && (this_order == reg_order[i - 1] + || regs[j] < regs[this_order])) + this_order = j; + reg_order[i] = this_order; } - result = gen_rtx_PARALLEL (VOIDmode, - rtvec_alloc (count + (write_back ? 1 : 0))); - if (write_back) + /* Ensure that registers that must be live after the instruction end + up with the correct value. */ + for (i = 0; i < nops; i++) { - XVECEXP (result, 0, 0) - = gen_rtx_SET (VOIDmode, to, - plus_constant (to, count * 4 * sign)); - i = 1; - count++; + int this_order = reg_order[i]; + if ((this_order != mem_order[i] + || orig_reg_rtxs[this_order] != reg_rtxs[this_order]) + && !peep2_reg_dead_p (nops * 2, orig_reg_rtxs[this_order])) + return false; } - for (j = 0; i < count; i++, j++) + /* Load the constants. */ + for (i = 0; i < nops; i++) { - addr = plus_constant (to, j * 4 * sign); - mem = adjust_automodify_address_nv (basemem, SImode, addr, offset); - XVECEXP (result, 0, i) - = gen_rtx_SET (VOIDmode, mem, gen_rtx_REG (SImode, base_regno + j)); - offset += 4 * sign; + rtx op = operands[2 * nops + mem_order[i]]; + sorted_regs[i] = regs[reg_order[i]]; + emit_move_insn (reg_rtxs[reg_order[i]], op); } - if (write_back) - *offsetp = offset; + base_reg_rtx = gen_rtx_REG (Pmode, base_reg); - return result; + base_reg_dies = peep2_reg_dead_p (nops * 2, base_reg_rtx); + if (TARGET_THUMB1) + { + gcc_assert (base_reg_dies); + write_back = TRUE; + } + + if (stm_case == 5) + { + gcc_assert (base_reg_dies); + emit_insn (gen_addsi3 (base_reg_rtx, base_reg_rtx, GEN_INT (offset))); + offset = 0; + } + + addr = plus_constant (base_reg_rtx, offset); + + for (i = 0; i < nops; i++) + { + addr = plus_constant (base_reg_rtx, offset + i * 4); + mems[i] = adjust_automodify_address_nv (operands[nops + mem_order[i]], + SImode, addr, 0); + } + emit_insn (arm_gen_store_multiple_1 (nops, sorted_regs, mems, base_reg_rtx, + write_back ? offset + i * 4 : 0)); + return true; } int @@ -9717,20 +10110,21 @@ arm_gen_movmemqi (rtx *operands) for (i = 0; in_words_to_go >= 2; i+=4) { if (in_words_to_go > 4) - emit_insn (arm_gen_load_multiple (0, 4, src, TRUE, TRUE, - srcbase, &srcoffset)); + emit_insn (arm_gen_load_multiple (arm_regs_in_sequence, 4, src, + TRUE, srcbase, &srcoffset)); else - emit_insn (arm_gen_load_multiple (0, in_words_to_go, src, TRUE, - FALSE, srcbase, &srcoffset)); + emit_insn (arm_gen_load_multiple (arm_regs_in_sequence, in_words_to_go, + src, FALSE, srcbase, + &srcoffset)); if (out_words_to_go) { if (out_words_to_go > 4) - emit_insn (arm_gen_store_multiple (0, 4, dst, TRUE, TRUE, - dstbase, &dstoffset)); + emit_insn (arm_gen_store_multiple (arm_regs_in_sequence, 4, dst, + TRUE, dstbase, &dstoffset)); else if (out_words_to_go != 1) - emit_insn (arm_gen_store_multiple (0, out_words_to_go, - dst, TRUE, + emit_insn (arm_gen_store_multiple (arm_regs_in_sequence, + out_words_to_go, dst, (last_bytes == 0 ? 
FALSE : TRUE), dstbase, &dstoffset)); @@ -12565,13 +12959,13 @@ output_move_double (rtx *operands) { if (GET_CODE (XEXP (operands[0], 0)) == PRE_MODIFY) { - output_asm_insn ("ldr%?\t%0, [%1, %2]!", otherops); - output_asm_insn ("ldr%?\t%H0, [%1, #4]", otherops); + output_asm_insn ("str%?\t%0, [%1, %2]!", otherops); + output_asm_insn ("str%?\t%H0, [%1, #4]", otherops); } else { - output_asm_insn ("ldr%?\t%H0, [%1, #4]", otherops); - output_asm_insn ("ldr%?\t%0, [%1], %2", otherops); + output_asm_insn ("str%?\t%H0, [%1, #4]", otherops); + output_asm_insn ("str%?\t%0, [%1], %2", otherops); } } else if (GET_CODE (XEXP (operands[0], 0)) == PRE_MODIFY) @@ -14088,7 +14482,8 @@ arm_output_epilogue (rtx sibling) && !crtl->tail_call_emit) { unsigned long mask; - mask = (1 << (arm_size_return_regs() / 4)) - 1; + /* Preserve return values, of any size. */ + mask = (1 << ((arm_size_return_regs() + 3) / 4)) - 1; mask ^= 0xf; mask &= ~saved_regs_mask; reg = 0; @@ -15873,6 +16268,17 @@ arm_print_operand (FILE *stream, rtx x, int code) } return; + case 'C': + { + rtx addr; + + gcc_assert (GET_CODE (x) == MEM); + addr = XEXP (x, 0); + gcc_assert (GET_CODE (addr) == REG); + asm_fprintf (stream, "[%r]", REGNO (addr)); + } + return; + /* Translate an S register number into a D register number and element index. */ case 'y': { @@ -19472,14 +19878,45 @@ thumb_exit (FILE *f, int reg_containing_return_addr) /* Return to caller. */ asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr); } - +/* Scan INSN just before assembler is output for it. + For Thumb-1, we track the status of the condition codes; this + information is used in the cbranchsi4_insn pattern. */ void thumb1_final_prescan_insn (rtx insn) { if (flag_print_asm_name) asm_fprintf (asm_out_file, "%@ 0x%04x\n", INSN_ADDRESSES (INSN_UID (insn))); + /* Don't overwrite the previous setter when we get to a cbranch. */ + if (INSN_CODE (insn) != CODE_FOR_cbranchsi4_insn) + { + enum attr_conds conds; + + if (cfun->machine->thumb1_cc_insn) + { + if (modified_in_p (cfun->machine->thumb1_cc_op0, insn) + || modified_in_p (cfun->machine->thumb1_cc_op1, insn)) + CC_STATUS_INIT; + } + conds = get_attr_conds (insn); + if (conds == CONDS_SET) + { + rtx set = single_set (insn); + cfun->machine->thumb1_cc_insn = insn; + cfun->machine->thumb1_cc_op0 = SET_DEST (set); + cfun->machine->thumb1_cc_op1 = const0_rtx; + cfun->machine->thumb1_cc_mode = CC_NOOVmode; + if (INSN_CODE (insn) == CODE_FOR_thumb1_subsi3_insn) + { + rtx src1 = XEXP (SET_SRC (set), 1); + if (src1 == const0_rtx) + cfun->machine->thumb1_cc_mode = CCmode; + } + } + else if (conds != CONDS_NOCOND) + cfun->machine->thumb1_cc_insn = NULL_RTX; + } } int @@ -22024,6 +22461,7 @@ arm_issue_rate (void) { case cortexr4: case cortexr4f: + case cortexa5: case cortexa8: case cortexa9: return 2; @@ -22170,4 +22608,372 @@ arm_have_conditional_execution (void) return !TARGET_THUMB1; } +/* Legitimize a memory reference for sync primitive implemented using + ldrex / strex. We currently force the form of the reference to be + indirect without offset. We do not yet support the indirect offset + addressing supported by some ARM targets for these + instructions. */ +static rtx +arm_legitimize_sync_memory (rtx memory) +{ + rtx addr = force_reg (Pmode, XEXP (memory, 0)); + rtx legitimate_memory = gen_rtx_MEM (GET_MODE (memory), addr); + + set_mem_alias_set (legitimate_memory, ALIAS_SET_MEMORY_BARRIER); + MEM_VOLATILE_P (legitimate_memory) = MEM_VOLATILE_P (memory); + return legitimate_memory; +} + +/* An instruction emitter. 
*/ +typedef void (* emit_f) (int label, const char *, rtx *); + +/* An instruction emitter that emits via the conventional + output_asm_insn. */ +static void +arm_emit (int label ATTRIBUTE_UNUSED, const char *pattern, rtx *operands) +{ + output_asm_insn (pattern, operands); +} + +/* Count the number of emitted synchronization instructions. */ +static unsigned arm_insn_count; + +/* An emitter that counts emitted instructions but does not actually + emit instruction into the the instruction stream. */ +static void +arm_count (int label, + const char *pattern ATTRIBUTE_UNUSED, + rtx *operands ATTRIBUTE_UNUSED) +{ + if (! label) + ++ arm_insn_count; +} + +/* Construct a pattern using conventional output formatting and feed + it to output_asm_insn. Provides a mechanism to construct the + output pattern on the fly. Note the hard limit on the pattern + buffer size. */ +static void +arm_output_asm_insn (emit_f emit, int label, rtx *operands, + const char *pattern, ...) +{ + va_list ap; + char buffer[256]; + + va_start (ap, pattern); + vsprintf (buffer, pattern, ap); + va_end (ap); + emit (label, buffer, operands); +} + +/* Emit the memory barrier instruction, if any, provided by this + target to a specified emitter. */ +static void +arm_process_output_memory_barrier (emit_f emit, rtx *operands) +{ + if (TARGET_HAVE_DMB) + { + /* Note we issue a system level barrier. We should consider + issuing a inner shareabilty zone barrier here instead, ie. + "DMB ISH". */ + emit (0, "dmb\tsy", operands); + return; + } + + if (TARGET_HAVE_DMB_MCR) + { + emit (0, "mcr\tp15, 0, r0, c7, c10, 5", operands); + return; + } + + gcc_unreachable (); +} + +/* Emit the memory barrier instruction, if any, provided by this + target. */ +const char * +arm_output_memory_barrier (rtx *operands) +{ + arm_process_output_memory_barrier (arm_emit, operands); + return ""; +} + +/* Helper to figure out the instruction suffix required on ldrex/strex + for operations on an object of the specified mode. */ +static const char * +arm_ldrex_suffix (enum machine_mode mode) +{ + switch (mode) + { + case QImode: return "b"; + case HImode: return "h"; + case SImode: return ""; + case DImode: return "d"; + default: + gcc_unreachable (); + } + return ""; +} + +/* Emit an ldrex{b,h,d, } instruction appropriate for the specified + mode. */ +static void +arm_output_ldrex (emit_f emit, + enum machine_mode mode, + rtx target, + rtx memory) +{ + const char *suffix = arm_ldrex_suffix (mode); + rtx operands[2]; + + operands[0] = target; + operands[1] = memory; + arm_output_asm_insn (emit, 0, operands, "ldrex%s\t%%0, %%C1", suffix); +} + +/* Emit a strex{b,h,d, } instruction appropriate for the specified + mode. */ +static void +arm_output_strex (emit_f emit, + enum machine_mode mode, + const char *cc, + rtx result, + rtx value, + rtx memory) +{ + const char *suffix = arm_ldrex_suffix (mode); + rtx operands[3]; + + operands[0] = result; + operands[1] = value; + operands[2] = memory; + arm_output_asm_insn (emit, 0, operands, "strex%s%s\t%%0, %%1, %%C2", suffix, + cc); +} + +/* Helper to emit a two operand instruction. */ +static void +arm_output_op2 (emit_f emit, const char *mnemonic, rtx d, rtx s) +{ + rtx operands[2]; + + operands[0] = d; + operands[1] = s; + arm_output_asm_insn (emit, 0, operands, "%s\t%%0, %%1", mnemonic); +} + +/* Helper to emit a three operand instruction. 
*/ +static void +arm_output_op3 (emit_f emit, const char *mnemonic, rtx d, rtx a, rtx b) +{ + rtx operands[3]; + + operands[0] = d; + operands[1] = a; + operands[2] = b; + arm_output_asm_insn (emit, 0, operands, "%s\t%%0, %%1, %%2", mnemonic); +} + +/* Emit a load store exclusive synchronization loop. + + do + old_value = [mem] + if old_value != required_value + break; + t1 = sync_op (old_value, new_value) + [mem] = t1, t2 = [0|1] + while ! t2 + + Note: + t1 == t2 is not permitted + t1 == old_value is permitted + + required_value: + + RTX register or const_int representing the required old_value for + the modify to continue, if NULL no comparsion is performed. */ +static void +arm_output_sync_loop (emit_f emit, + enum machine_mode mode, + rtx old_value, + rtx memory, + rtx required_value, + rtx new_value, + rtx t1, + rtx t2, + enum attr_sync_op sync_op, + int early_barrier_required) +{ + rtx operands[1]; + + gcc_assert (t1 != t2); + + if (early_barrier_required) + arm_process_output_memory_barrier (emit, NULL); + + arm_output_asm_insn (emit, 1, operands, "%sLSYT%%=:", LOCAL_LABEL_PREFIX); + + arm_output_ldrex (emit, mode, old_value, memory); + + if (required_value) + { + rtx operands[2]; + + operands[0] = old_value; + operands[1] = required_value; + arm_output_asm_insn (emit, 0, operands, "cmp\t%%0, %%1"); + arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYB%%=", LOCAL_LABEL_PREFIX); + } + + switch (sync_op) + { + case SYNC_OP_ADD: + arm_output_op3 (emit, "add", t1, old_value, new_value); + break; + + case SYNC_OP_SUB: + arm_output_op3 (emit, "sub", t1, old_value, new_value); + break; + + case SYNC_OP_IOR: + arm_output_op3 (emit, "orr", t1, old_value, new_value); + break; + + case SYNC_OP_XOR: + arm_output_op3 (emit, "eor", t1, old_value, new_value); + break; + + case SYNC_OP_AND: + arm_output_op3 (emit,"and", t1, old_value, new_value); + break; + + case SYNC_OP_NAND: + arm_output_op3 (emit, "and", t1, old_value, new_value); + arm_output_op2 (emit, "mvn", t1, t1); + break; + + case SYNC_OP_NONE: + t1 = new_value; + break; + } + + arm_output_strex (emit, mode, "", t2, t1, memory); + operands[0] = t2; + arm_output_asm_insn (emit, 0, operands, "teq\t%%0, #0"); + arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYT%%=", LOCAL_LABEL_PREFIX); + + arm_process_output_memory_barrier (emit, NULL); + arm_output_asm_insn (emit, 1, operands, "%sLSYB%%=:", LOCAL_LABEL_PREFIX); +} + +static rtx +arm_get_sync_operand (rtx *operands, int index, rtx default_value) +{ + if (index > 0) + default_value = operands[index - 1]; + + return default_value; +} + +#define FETCH_SYNC_OPERAND(NAME, DEFAULT) \ + arm_get_sync_operand (operands, (int) get_attr_sync_##NAME (insn), DEFAULT); + +/* Extract the operands for a synchroniztion instruction from the + instructions attributes and emit the instruction. 
*/ +static void +arm_process_output_sync_insn (emit_f emit, rtx insn, rtx *operands) +{ + rtx result, memory, required_value, new_value, t1, t2; + int early_barrier; + enum machine_mode mode; + enum attr_sync_op sync_op; + + result = FETCH_SYNC_OPERAND(result, 0); + memory = FETCH_SYNC_OPERAND(memory, 0); + required_value = FETCH_SYNC_OPERAND(required_value, 0); + new_value = FETCH_SYNC_OPERAND(new_value, 0); + t1 = FETCH_SYNC_OPERAND(t1, 0); + t2 = FETCH_SYNC_OPERAND(t2, 0); + early_barrier = + get_attr_sync_release_barrier (insn) == SYNC_RELEASE_BARRIER_YES; + sync_op = get_attr_sync_op (insn); + mode = GET_MODE (memory); + + arm_output_sync_loop (emit, mode, result, memory, required_value, + new_value, t1, t2, sync_op, early_barrier); +} + +/* Emit a synchronization instruction loop. */ +const char * +arm_output_sync_insn (rtx insn, rtx *operands) +{ + arm_process_output_sync_insn (arm_emit, insn, operands); + return ""; +} + +/* Count the number of machine instruction that will be emitted for a + synchronization instruction. Note that the emitter used does not + emit instructions, it just counts instructions being carefull not + to count labels. */ +unsigned int +arm_sync_loop_insns (rtx insn, rtx *operands) +{ + arm_insn_count = 0; + arm_process_output_sync_insn (arm_count, insn, operands); + return arm_insn_count; +} + +/* Helper to call a target sync instruction generator, dealing with + the variation in operands required by the different generators. */ +static rtx +arm_call_generator (struct arm_sync_generator *generator, rtx old_value, + rtx memory, rtx required_value, rtx new_value) +{ + switch (generator->op) + { + case arm_sync_generator_omn: + gcc_assert (! required_value); + return generator->u.omn (old_value, memory, new_value); + + case arm_sync_generator_omrn: + gcc_assert (required_value); + return generator->u.omrn (old_value, memory, required_value, new_value); + } + + return NULL; +} + +/* Expand a synchronization loop. The synchronization loop is expanded + as an opaque block of instructions in order to ensure that we do + not subsequently get extraneous memory accesses inserted within the + critical region. The exclusive access property of ldrex/strex is + only guaranteed in there are no intervening memory accesses. */ +void +arm_expand_sync (enum machine_mode mode, + struct arm_sync_generator *generator, + rtx target, rtx memory, rtx required_value, rtx new_value) +{ + if (target == NULL) + target = gen_reg_rtx (mode); + + memory = arm_legitimize_sync_memory (memory); + if (mode != SImode) + { + rtx load_temp = gen_reg_rtx (SImode); + + if (required_value) + required_value = convert_modes (SImode, mode, required_value, true); + + new_value = convert_modes (SImode, mode, new_value, true); + emit_insn (arm_call_generator (generator, load_temp, memory, + required_value, new_value)); + emit_move_insn (target, gen_lowpart (mode, load_temp)); + } + else + { + emit_insn (arm_call_generator (generator, target, memory, required_value, + new_value)); + } +} + #include "gt-arm.h"
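
A note on the params.h change earlier in this patch: when compiling position-independent code, the backend now lowers the expression-hoisting threshold to the cost of a PIC address calculation (two instructions) unless the user has already set it. The same knob stays available on the command line, so the new default can be confirmed or overridden explicitly; an illustrative invocation (the toolchain triplet is chosen arbitrarily):

  arm-none-linux-gnueabi-gcc -O2 -fpic --param gcse-unrestricted-cost=2 -c foo.c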
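The tune_params change threads a per-core sched_adjust_cost callback through the tuning tables; arm_adjust_cost keeps only the logic common to all cores and defers to the hook when one is installed (a hook returning false means it has fixed the cost, returning true means fall through to the generic code). A minimal sketch of how a further core could opt in; example_sched_adjust_cost and example_core_tune are hypothetical names, not part of this patch:

/* Hypothetical per-core cost hook, following the shape of
   cortex_a9_sched_adjust_cost above.  Returning false tells
   arm_adjust_cost that *COST is final; returning true lets the
   generic adjustments run.  */
static bool
example_sched_adjust_cost (rtx insn ATTRIBUTE_UNUSED, rtx link,
			   rtx dep ATTRIBUTE_UNUSED, int *cost)
{
  /* Pretend anti-dependencies are free on this imaginary core.  */
  if (REG_NOTE_KIND (link) == REG_DEP_ANTI)
    {
      *cost = 0;
      return false;
    }

  /* Otherwise defer to the common arm_adjust_cost handling.  */
  return true;
}

const struct tune_params example_core_tune =
{
  arm_9e_rtx_costs,		/* RTX costs.  */
  example_sched_adjust_cost,	/* Scheduler cost adjustment.  */
  1				/* Constant limit.  */
};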
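The rewritten load_multiple_sequence / store_multiple_sequence and the new gen_ldm_seq, gen_stm_seq and gen_const_stm_seq entry points are driven from peephole2 patterns in the machine description (the patterns themselves are not part of this hunk). Roughly, and with register choices picked only for illustration, this is the rewrite they enable:

/* Illustration only -- the kind of transformation the new peepholes
   perform:

     ldr  r0, [r4]
     ldr  r1, [r4, #4]        ===>     ldmia  r4, {r0, r1, r2, r3}
     ldr  r2, [r4, #8]
     ldr  r3, [r4, #12]

   guarded by multiple_operation_profitable_p (for example, two-register
   sequences are left alone when tuning for XScale and not optimizing
   for size), by the requirement that register numbers ascend with the
   memory addresses unless the caller is allowed to reorder them, and,
   for Thumb-1, by the base register dying so that the forced
   write-back of the low-register LDM/STM forms does no harm.  */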
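Finally, the new synchronization support expands each __sync_* builtin into a single opaque ldrex/strex retry loop, so that no memory access can later be scheduled into the critical region. As emitted by arm_output_sync_loop, a word-sized fetch-and-add comes out approximately as below; register names and label spellings are illustrative (the real labels are built from LOCAL_LABEL_PREFIX and %=):

/* Approximate shape of the emitted loop for a SImode fetch-and-add,
   assuming TARGET_HAVE_DMB:

	dmb	sy		@ release barrier, if required
   1:	ldrex	r0, [r3]	@ load the old value exclusively
	add	r1, r0, r2	@ sync_op == SYNC_OP_ADD
	strex	ip, r1, [r3]	@ try to store; ip == 0 on success
	teq	ip, #0
	bne	1b		@ lost the reservation, retry
	dmb	sy		@ acquire barrier
   2:				@ exit label, used as the branch target
				@ by the compare-and-swap variants

   For __sync_val_compare_and_swap the loop additionally compares the
   loaded value against the required value after the ldrex and branches
   to the exit label when they differ.  */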