X-Git-Url: http://git.sourceforge.jp/view?a=blobdiff_plain;f=gcc%2Fconfig%2Farm%2Farm.c;h=0d53896aee0837effa2c376b2ed7b160ac98e99a;hb=686e27690af4295ea88041c1d2d7fd31ee8f1cb7;hp=5df6a7722ed3be0f65e916d4a77bb46733fd9904;hpb=2359dba8edb140d03efcac93bddf28d0da97bf24;p=pf3gnuchains%2Fgcc-fork.git diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 5df6a7722ed..855f39e3858 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -1,6 +1,7 @@ /* Output routines for GCC for ARM. Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, - 2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. + 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 + Free Software Foundation, Inc. Contributed by Pieter `Tiggr' Schoenmakers (rcpieter@win.tue.nl) and Martin Simmons (@harleqn.co.uk). More major hacks by Richard Earnshaw (rearnsha@arm.com). @@ -30,7 +31,6 @@ #include "obstack.h" #include "regs.h" #include "hard-reg-set.h" -#include "real.h" #include "insn-config.h" #include "conditions.h" #include "output.h" @@ -40,11 +40,13 @@ #include "function.h" #include "expr.h" #include "optabs.h" +#include "diagnostic-core.h" #include "toplev.h" #include "recog.h" +#include "cgraph.h" #include "ggc.h" #include "except.h" -#include "c-pragma.h" +#include "c-family/c-pragma.h" /* ??? */ #include "integrate.h" #include "tm_p.h" #include "target.h" @@ -52,16 +54,18 @@ #include "debug.h" #include "langhooks.h" #include "df.h" +#include "intl.h" +#include "libfuncs.h" +#include "params.h" /* Forward definitions of types. */ typedef struct minipool_node Mnode; typedef struct minipool_fixup Mfix; -const struct attribute_spec arm_attribute_table[]; - void (*arm_lang_output_object_attributes_hook)(void); /* Forward function declarations. 
*/ +static int arm_compute_static_chain_stack_bytes (void); static arm_stack_offsets *arm_get_frame_offsets (void); static void arm_add_gc_roots (void); static int arm_gen_constant (enum rtx_code, enum machine_mode, rtx, @@ -71,13 +75,19 @@ static int arm_address_register_rtx_p (rtx, int); static int arm_legitimate_index_p (enum machine_mode, rtx, RTX_CODE, int); static int thumb2_legitimate_index_p (enum machine_mode, rtx, int); static int thumb1_base_register_rtx_p (rtx, enum machine_mode, int); +static rtx arm_legitimize_address (rtx, rtx, enum machine_mode); +static rtx thumb_legitimize_address (rtx, rtx, enum machine_mode); inline static int thumb1_index_register_rtx_p (rtx, int); +static bool arm_legitimate_address_p (enum machine_mode, rtx, bool); static int thumb_far_jump_used_p (void); static bool thumb_force_lr_save (void); static int const_ok_for_op (HOST_WIDE_INT, enum rtx_code); static rtx emit_sfm (int, int); static unsigned arm_size_return_regs (void); static bool arm_assemble_integer (rtx, unsigned int, int); +static void arm_print_operand (FILE *, rtx, int); +static void arm_print_operand_address (FILE *, rtx); +static bool arm_print_operand_punct_valid_p (unsigned char code); static const char *fp_const_from_val (REAL_VALUE_TYPE *); static arm_cc get_arm_condition_code (rtx); static HOST_WIDE_INT int_log2 (HOST_WIDE_INT); @@ -108,6 +118,7 @@ static unsigned long arm_compute_save_reg_mask (void); static unsigned long arm_isr_value (tree); static unsigned long arm_compute_func_type (void); static tree arm_handle_fndecl_attribute (tree *, tree, tree, int, bool *); +static tree arm_handle_pcs_attribute (tree *, tree, tree, int, bool *); static tree arm_handle_isr_attribute (tree *, tree, tree, int, bool *); #if TARGET_DLLIMPORT_DECL_ATTRIBUTES static tree arm_handle_notshared_attribute (tree *, tree, tree, int, bool *); @@ -121,21 +132,29 @@ static int arm_adjust_cost (rtx, rtx, rtx, int); static int count_insns_for_constant (HOST_WIDE_INT, int); static int arm_get_strip_length (int); static bool arm_function_ok_for_sibcall (tree, tree); +static enum machine_mode arm_promote_function_mode (const_tree, + enum machine_mode, int *, + const_tree, int); +static bool arm_return_in_memory (const_tree, const_tree); +static rtx arm_function_value (const_tree, const_tree, bool); +static rtx arm_libcall_value (enum machine_mode, const_rtx); + static void arm_internal_label (FILE *, const char *, unsigned long); static void arm_output_mi_thunk (FILE *, tree, HOST_WIDE_INT, HOST_WIDE_INT, tree); -static int arm_rtx_costs_1 (rtx, enum rtx_code, enum rtx_code); -static bool arm_size_rtx_costs (rtx, int, int, int *); -static bool arm_slowmul_rtx_costs (rtx, int, int, int *); -static bool arm_fastmul_rtx_costs (rtx, int, int, int *); -static bool arm_xscale_rtx_costs (rtx, int, int, int *); -static bool arm_9e_rtx_costs (rtx, int, int, int *); -static int arm_address_cost (rtx); +static bool arm_have_conditional_execution (void); +static bool arm_rtx_costs_1 (rtx, enum rtx_code, int*, bool); +static bool arm_size_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *); +static bool arm_slowmul_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool); +static bool arm_fastmul_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool); +static bool arm_xscale_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool); +static bool arm_9e_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool); +static bool arm_rtx_costs (rtx, int, int, int *, bool); +static int arm_address_cost (rtx, bool); 
static bool arm_memory_load_p (rtx); static bool arm_cirrus_insn_p (rtx); static void cirrus_reorg (rtx); static void arm_init_builtins (void); -static rtx arm_expand_builtin (tree, rtx, rtx, enum machine_mode, int); static void arm_init_iwmmxt_builtins (void); static rtx safe_vector_operand (rtx, enum machine_mode); static rtx arm_expand_binop_builtin (enum insn_code, tree, rtx); @@ -145,6 +164,13 @@ static void emit_constant_insn (rtx cond, rtx pattern); static rtx emit_set_insn (rtx, rtx); static int arm_arg_partial_bytes (CUMULATIVE_ARGS *, enum machine_mode, tree, bool); +static rtx arm_function_arg (CUMULATIVE_ARGS *, enum machine_mode, + const_tree, bool); +static void arm_function_arg_advance (CUMULATIVE_ARGS *, enum machine_mode, + const_tree, bool); +static rtx aapcs_allocate_return_reg (enum machine_mode, const_tree, + const_tree); +static int aapcs_select_return_coproc (const_tree, const_tree); #ifdef OBJECT_FORMAT_ELF static void arm_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED; @@ -166,11 +192,16 @@ static bool arm_default_short_enums (void); static bool arm_align_anon_bitfield (void); static bool arm_return_in_msb (const_tree); static bool arm_must_pass_in_stack (enum machine_mode, const_tree); -#ifdef TARGET_UNWIND_INFO +static bool arm_return_in_memory (const_tree, const_tree); +#if ARM_UNWIND_INFO static void arm_unwind_emit (FILE *, rtx); static bool arm_output_ttype (rtx); +static void arm_asm_emit_except_personality (rtx); +static void arm_asm_init_sections (void); #endif +static enum unwind_info_type arm_except_unwind_info (void); static void arm_dwarf_handle_frame_unspec (const char *, rtx, int); +static rtx arm_dwarf_register_span (rtx); static tree arm_cxx_guard_type (void); static bool arm_cxx_guard_mask_bit (void); @@ -182,6 +213,11 @@ static void arm_cxx_determine_class_data_visibility (tree); static bool arm_cxx_class_data_always_comdat (void); static bool arm_cxx_use_aeabi_atexit (void); static void arm_init_libfuncs (void); +static tree arm_build_builtin_va_list (void); +static void arm_expand_builtin_va_start (tree, rtx); +static tree arm_gimplify_va_arg_expr (tree, tree, gimple_seq *, gimple_seq *); +static void arm_option_override (void); +static void arm_option_optimization (int, int); static bool arm_handle_option (size_t, const char *, int); static void arm_target_help (void); static unsigned HOST_WIDE_INT arm_shift_truncation_mask (enum machine_mode); @@ -189,8 +225,67 @@ static bool arm_cannot_copy_insn_p (rtx); static bool arm_tls_symbol_p (rtx x); static int arm_issue_rate (void); static void arm_output_dwarf_dtprel (FILE *, int, rtx) ATTRIBUTE_UNUSED; +static bool arm_output_addr_const_extra (FILE *, rtx); static bool arm_allocate_stack_slots_for_args (void); +static const char *arm_invalid_parameter_type (const_tree t); +static const char *arm_invalid_return_type (const_tree t); +static tree arm_promoted_type (const_tree t); +static tree arm_convert_to_type (tree type, tree expr); +static bool arm_scalar_mode_supported_p (enum machine_mode); +static bool arm_frame_pointer_required (void); +static bool arm_can_eliminate (const int, const int); +static void arm_asm_trampoline_template (FILE *); +static void arm_trampoline_init (rtx, tree, rtx); +static rtx arm_trampoline_adjust_address (rtx); +static rtx arm_pic_static_addr (rtx orig, rtx reg); +static bool cortex_a9_sched_adjust_cost (rtx, rtx, rtx, int *); +static bool xscale_sched_adjust_cost (rtx, rtx, rtx, int *); +static enum machine_mode arm_preferred_simd_mode (enum machine_mode); +static 
bool arm_class_likely_spilled_p (reg_class_t); +static bool arm_vector_alignment_reachable (const_tree type, bool is_packed); +static bool arm_builtin_support_vector_misalignment (enum machine_mode mode, + const_tree type, + int misalignment, + bool is_packed); + + +/* Table of machine attributes. */ +static const struct attribute_spec arm_attribute_table[] = +{ + /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler } */ + /* Function calls made to this symbol must be done indirectly, because + it may lie outside of the 26 bit addressing range of a normal function + call. */ + { "long_call", 0, 0, false, true, true, NULL }, + /* Whereas these functions are always known to reside within the 26 bit + addressing range. */ + { "short_call", 0, 0, false, true, true, NULL }, + /* Specify the procedure call conventions for a function. */ + { "pcs", 1, 1, false, true, true, arm_handle_pcs_attribute }, + /* Interrupt Service Routines have special prologue and epilogue requirements. */ + { "isr", 0, 1, false, false, false, arm_handle_isr_attribute }, + { "interrupt", 0, 1, false, false, false, arm_handle_isr_attribute }, + { "naked", 0, 0, true, false, false, arm_handle_fndecl_attribute }, +#ifdef ARM_PE + /* ARM/PE has three new attributes: + interfacearm - ? + dllexport - for exporting a function/variable that will live in a dll + dllimport - for importing a function/variable from a dll + Microsoft allows multiple declspecs in one __declspec, separating + them with spaces. We do NOT support this. Instead, use __declspec + multiple times. + */ + { "dllimport", 0, 0, true, false, false, NULL }, + { "dllexport", 0, 0, true, false, false, NULL }, + { "interfacearm", 0, 0, true, false, false, arm_handle_fndecl_attribute }, +#elif TARGET_DLLIMPORT_DECL_ATTRIBUTES + { "dllimport", 0, 0, false, false, false, handle_dll_attribute }, + { "dllexport", 0, 0, false, false, false, handle_dll_attribute }, + { "notshared", 0, 0, false, true, false, arm_handle_notshared_attribute }, +#endif + { NULL, 0, 0, false, false, false, NULL } +}; /* Initialize the GCC target structure. 
*/ #if TARGET_DLLIMPORT_DECL_ATTRIBUTES @@ -198,6 +293,9 @@ static bool arm_allocate_stack_slots_for_args (void); #define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes #endif +#undef TARGET_LEGITIMIZE_ADDRESS +#define TARGET_LEGITIMIZE_ADDRESS arm_legitimize_address + #undef TARGET_ATTRIBUTE_TABLE #define TARGET_ATTRIBUTE_TABLE arm_attribute_table @@ -211,6 +309,16 @@ static bool arm_allocate_stack_slots_for_args (void); #undef TARGET_ASM_INTEGER #define TARGET_ASM_INTEGER arm_assemble_integer +#undef TARGET_PRINT_OPERAND +#define TARGET_PRINT_OPERAND arm_print_operand +#undef TARGET_PRINT_OPERAND_ADDRESS +#define TARGET_PRINT_OPERAND_ADDRESS arm_print_operand_address +#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P +#define TARGET_PRINT_OPERAND_PUNCT_VALID_P arm_print_operand_punct_valid_p + +#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA +#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA arm_output_addr_const_extra + #undef TARGET_ASM_FUNCTION_PROLOGUE #define TARGET_ASM_FUNCTION_PROLOGUE arm_output_function_prologue @@ -223,6 +331,10 @@ static bool arm_allocate_stack_slots_for_args (void); #define TARGET_HANDLE_OPTION arm_handle_option #undef TARGET_HELP #define TARGET_HELP arm_target_help +#undef TARGET_OPTION_OVERRIDE +#define TARGET_OPTION_OVERRIDE arm_option_override +#undef TARGET_OPTION_OPTIMIZATION +#define TARGET_OPTION_OPTIMIZATION arm_option_optimization #undef TARGET_COMP_TYPE_ATTRIBUTES #define TARGET_COMP_TYPE_ATTRIBUTES arm_comp_type_attributes @@ -249,14 +361,19 @@ static bool arm_allocate_stack_slots_for_args (void); #undef TARGET_FUNCTION_OK_FOR_SIBCALL #define TARGET_FUNCTION_OK_FOR_SIBCALL arm_function_ok_for_sibcall +#undef TARGET_FUNCTION_VALUE +#define TARGET_FUNCTION_VALUE arm_function_value + +#undef TARGET_LIBCALL_VALUE +#define TARGET_LIBCALL_VALUE arm_libcall_value + #undef TARGET_ASM_OUTPUT_MI_THUNK #define TARGET_ASM_OUTPUT_MI_THUNK arm_output_mi_thunk #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK #define TARGET_ASM_CAN_OUTPUT_MI_THUNK default_can_output_mi_thunk_no_vcall -/* This will be overridden in arm_override_options. 
*/ #undef TARGET_RTX_COSTS -#define TARGET_RTX_COSTS arm_slowmul_rtx_costs +#define TARGET_RTX_COSTS arm_rtx_costs #undef TARGET_ADDRESS_COST #define TARGET_ADDRESS_COST arm_address_cost @@ -264,6 +381,8 @@ static bool arm_allocate_stack_slots_for_args (void); #define TARGET_SHIFT_TRUNCATION_MASK arm_shift_truncation_mask #undef TARGET_VECTOR_MODE_SUPPORTED_P #define TARGET_VECTOR_MODE_SUPPORTED_P arm_vector_mode_supported_p +#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE +#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE arm_preferred_simd_mode #undef TARGET_MACHINE_DEPENDENT_REORG #define TARGET_MACHINE_DEPENDENT_REORG arm_reorg @@ -276,16 +395,18 @@ static bool arm_allocate_stack_slots_for_args (void); #undef TARGET_INIT_LIBFUNCS #define TARGET_INIT_LIBFUNCS arm_init_libfuncs -#undef TARGET_PROMOTE_FUNCTION_ARGS -#define TARGET_PROMOTE_FUNCTION_ARGS hook_bool_const_tree_true -#undef TARGET_PROMOTE_FUNCTION_RETURN -#define TARGET_PROMOTE_FUNCTION_RETURN hook_bool_const_tree_true +#undef TARGET_PROMOTE_FUNCTION_MODE +#define TARGET_PROMOTE_FUNCTION_MODE arm_promote_function_mode #undef TARGET_PROMOTE_PROTOTYPES #define TARGET_PROMOTE_PROTOTYPES arm_promote_prototypes #undef TARGET_PASS_BY_REFERENCE #define TARGET_PASS_BY_REFERENCE arm_pass_by_reference #undef TARGET_ARG_PARTIAL_BYTES #define TARGET_ARG_PARTIAL_BYTES arm_arg_partial_bytes +#undef TARGET_FUNCTION_ARG +#define TARGET_FUNCTION_ARG arm_function_arg +#undef TARGET_FUNCTION_ARG_ADVANCE +#define TARGET_FUNCTION_ARG_ADVANCE arm_function_arg_advance #undef TARGET_SETUP_INCOMING_VARARGS #define TARGET_SETUP_INCOMING_VARARGS arm_setup_incoming_varargs @@ -293,6 +414,13 @@ static bool arm_allocate_stack_slots_for_args (void); #undef TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS #define TARGET_ALLOCATE_STACK_SLOTS_FOR_ARGS arm_allocate_stack_slots_for_args +#undef TARGET_ASM_TRAMPOLINE_TEMPLATE +#define TARGET_ASM_TRAMPOLINE_TEMPLATE arm_asm_trampoline_template +#undef TARGET_TRAMPOLINE_INIT +#define TARGET_TRAMPOLINE_INIT arm_trampoline_init +#undef TARGET_TRAMPOLINE_ADJUST_ADDRESS +#define TARGET_TRAMPOLINE_ADJUST_ADDRESS arm_trampoline_adjust_address + #undef TARGET_DEFAULT_SHORT_ENUMS #define TARGET_DEFAULT_SHORT_ENUMS arm_default_short_enums @@ -333,12 +461,15 @@ static bool arm_allocate_stack_slots_for_args (void); #undef TARGET_RETURN_IN_MSB #define TARGET_RETURN_IN_MSB arm_return_in_msb +#undef TARGET_RETURN_IN_MEMORY +#define TARGET_RETURN_IN_MEMORY arm_return_in_memory + #undef TARGET_MUST_PASS_IN_STACK #define TARGET_MUST_PASS_IN_STACK arm_must_pass_in_stack -#ifdef TARGET_UNWIND_INFO -#undef TARGET_UNWIND_EMIT -#define TARGET_UNWIND_EMIT arm_unwind_emit +#if ARM_UNWIND_INFO +#undef TARGET_ASM_UNWIND_EMIT +#define TARGET_ASM_UNWIND_EMIT arm_unwind_emit /* EABI unwinding tables use a different format for the typeinfo tables. 
*/ #undef TARGET_ASM_TTYPE @@ -346,11 +477,23 @@ static bool arm_allocate_stack_slots_for_args (void); #undef TARGET_ARM_EABI_UNWINDER #define TARGET_ARM_EABI_UNWINDER true -#endif /* TARGET_UNWIND_INFO */ + +#undef TARGET_ASM_EMIT_EXCEPT_PERSONALITY +#define TARGET_ASM_EMIT_EXCEPT_PERSONALITY arm_asm_emit_except_personality + +#undef TARGET_ASM_INIT_SECTIONS +#define TARGET_ASM_INIT_SECTIONS arm_asm_init_sections +#endif /* ARM_UNWIND_INFO */ + +#undef TARGET_EXCEPT_UNWIND_INFO +#define TARGET_EXCEPT_UNWIND_INFO arm_except_unwind_info #undef TARGET_DWARF_HANDLE_FRAME_UNSPEC #define TARGET_DWARF_HANDLE_FRAME_UNSPEC arm_dwarf_handle_frame_unspec +#undef TARGET_DWARF_REGISTER_SPAN +#define TARGET_DWARF_REGISTER_SPAN arm_dwarf_register_span + #undef TARGET_CANNOT_COPY_INSN_P #define TARGET_CANNOT_COPY_INSN_P arm_cannot_copy_insn_p @@ -359,20 +502,74 @@ static bool arm_allocate_stack_slots_for_args (void); #define TARGET_HAVE_TLS true #endif +#undef TARGET_HAVE_CONDITIONAL_EXECUTION +#define TARGET_HAVE_CONDITIONAL_EXECUTION arm_have_conditional_execution + #undef TARGET_CANNOT_FORCE_CONST_MEM #define TARGET_CANNOT_FORCE_CONST_MEM arm_cannot_force_const_mem +#undef TARGET_MAX_ANCHOR_OFFSET +#define TARGET_MAX_ANCHOR_OFFSET 4095 + +/* The minimum is set such that the total size of the block + for a particular anchor is -4088 + 1 + 4095 bytes, which is + divisible by eight, ensuring natural spacing of anchors. */ +#undef TARGET_MIN_ANCHOR_OFFSET +#define TARGET_MIN_ANCHOR_OFFSET -4088 + #undef TARGET_SCHED_ISSUE_RATE #define TARGET_SCHED_ISSUE_RATE arm_issue_rate #undef TARGET_MANGLE_TYPE #define TARGET_MANGLE_TYPE arm_mangle_type +#undef TARGET_BUILD_BUILTIN_VA_LIST +#define TARGET_BUILD_BUILTIN_VA_LIST arm_build_builtin_va_list +#undef TARGET_EXPAND_BUILTIN_VA_START +#define TARGET_EXPAND_BUILTIN_VA_START arm_expand_builtin_va_start +#undef TARGET_GIMPLIFY_VA_ARG_EXPR +#define TARGET_GIMPLIFY_VA_ARG_EXPR arm_gimplify_va_arg_expr + #ifdef HAVE_AS_TLS #undef TARGET_ASM_OUTPUT_DWARF_DTPREL #define TARGET_ASM_OUTPUT_DWARF_DTPREL arm_output_dwarf_dtprel #endif +#undef TARGET_LEGITIMATE_ADDRESS_P +#define TARGET_LEGITIMATE_ADDRESS_P arm_legitimate_address_p + +#undef TARGET_INVALID_PARAMETER_TYPE +#define TARGET_INVALID_PARAMETER_TYPE arm_invalid_parameter_type + +#undef TARGET_INVALID_RETURN_TYPE +#define TARGET_INVALID_RETURN_TYPE arm_invalid_return_type + +#undef TARGET_PROMOTED_TYPE +#define TARGET_PROMOTED_TYPE arm_promoted_type + +#undef TARGET_CONVERT_TO_TYPE +#define TARGET_CONVERT_TO_TYPE arm_convert_to_type + +#undef TARGET_SCALAR_MODE_SUPPORTED_P +#define TARGET_SCALAR_MODE_SUPPORTED_P arm_scalar_mode_supported_p + +#undef TARGET_FRAME_POINTER_REQUIRED +#define TARGET_FRAME_POINTER_REQUIRED arm_frame_pointer_required + +#undef TARGET_CAN_ELIMINATE +#define TARGET_CAN_ELIMINATE arm_can_eliminate + +#undef TARGET_CLASS_LIKELY_SPILLED_P +#define TARGET_CLASS_LIKELY_SPILLED_P arm_class_likely_spilled_p + +#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE +#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \ + arm_vector_alignment_reachable + +#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT +#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \ + arm_builtin_support_vector_misalignment + struct gcc_target targetm = TARGET_INITIALIZER; /* Obstack for minipool constant handling. */ @@ -388,28 +585,24 @@ extern FILE * asm_out_file; /* True if we are currently building a constant table. */ int making_const_table; -/* Define the information needed to generate branch insns. 
This is - stored from the compare operation. */ -rtx arm_compare_op0, arm_compare_op1; - /* The processor for which instructions should be scheduled. */ enum processor_type arm_tune = arm_none; -/* The default processor used if not overridden by commandline. */ -static enum processor_type arm_default_cpu = arm_none; - -/* Which floating point model to use. */ -enum arm_fp_model arm_fp_model; - -/* Which floating point hardware is available. */ -enum fputype arm_fpu_arch; +/* The current tuning set. */ +const struct tune_params *current_tune; /* Which floating point hardware to schedule for. */ -enum fputype arm_fpu_tune; +int arm_fpu_attr; + +/* Which floating point hardware to use. */ +const struct arm_fpu_desc *arm_fpu_desc; /* Whether to use floating point hardware. */ enum float_abi_type arm_float_abi; +/* Which __fp16 format to use. */ +enum arm_fp16_format_type arm_fp16_format; + /* Which ABI to use. */ enum arm_abi_type arm_abi; @@ -448,9 +641,16 @@ static int thumb_call_reg_needed; #define FL_DIV (1 << 18) /* Hardware divide. */ #define FL_VFPV3 (1 << 19) /* Vector Floating Point V3. */ #define FL_NEON (1 << 20) /* Neon instructions. */ +#define FL_ARCH7EM (1 << 21) /* Instructions present in the ARMv7E-M + architecture. */ +#define FL_ARCH7 (1 << 22) /* Architecture 7. */ #define FL_IWMMXT (1 << 29) /* XScale v2 or "Intel Wireless MMX technology". */ +/* Flags that only affect tuning, not available instructions. */ +#define FL_TUNE (FL_WBUF | FL_VFPV2 | FL_STRONG | FL_LDSCHED \ + | FL_CO_PROC) + #define FL_FOR_ARCH2 FL_NOTM #define FL_FOR_ARCH3 (FL_FOR_ARCH2 | FL_MODE32) #define FL_FOR_ARCH3M (FL_FOR_ARCH3 | FL_ARCH3M) @@ -468,10 +668,11 @@ static int thumb_call_reg_needed; #define FL_FOR_ARCH6ZK FL_FOR_ARCH6K #define FL_FOR_ARCH6T2 (FL_FOR_ARCH6 | FL_THUMB2) #define FL_FOR_ARCH6M (FL_FOR_ARCH6 & ~FL_NOTM) -#define FL_FOR_ARCH7 (FL_FOR_ARCH6T2 &~ FL_NOTM) -#define FL_FOR_ARCH7A (FL_FOR_ARCH7 | FL_NOTM) +#define FL_FOR_ARCH7 ((FL_FOR_ARCH6T2 & ~FL_NOTM) | FL_ARCH7) +#define FL_FOR_ARCH7A (FL_FOR_ARCH7 | FL_NOTM | FL_ARCH6K) #define FL_FOR_ARCH7R (FL_FOR_ARCH7A | FL_DIV) #define FL_FOR_ARCH7M (FL_FOR_ARCH7 | FL_DIV) +#define FL_FOR_ARCH7EM (FL_FOR_ARCH7M | FL_ARCH7EM) /* The bits in this mask specify which instructions we are allowed to generate. */ @@ -505,9 +706,15 @@ int arm_arch6 = 0; /* Nonzero if this chip supports the ARM 6K extensions. */ int arm_arch6k = 0; +/* Nonzero if this chip supports the ARM 7 extensions. */ +int arm_arch7 = 0; + /* Nonzero if instructions not present in the 'M' profile can be used. */ int arm_arch_notm = 0; +/* Nonzero if instructions present in ARMv7E-M can be used. */ +int arm_arch7em = 0; + /* Nonzero if this chip can benefit from load scheduling. */ int arm_ld_sched = 0; @@ -530,9 +737,15 @@ int arm_tune_xscale = 0; This typically means an ARM6 or ARM7 with MMU or MPU. */ int arm_tune_wbuf = 0; +/* Nonzero if tuning for Cortex-A9. */ +int arm_tune_cortex_a9 = 0; + /* Nonzero if generating Thumb instructions. */ int thumb_code = 0; +/* Nonzero if generating Thumb-1 instructions. */ +int thumb1_code = 0; + /* Nonzero if we should define __THUMB_INTERWORK__ in the preprocessor. XXX This is a bit of a hack, it's intended to help work around @@ -546,29 +759,25 @@ int arm_arch_thumb2; /* Nonzero if chip supports integer division instruction. */ int arm_arch_hwdiv; -/* In case of a PRE_INC, POST_INC, PRE_DEC, POST_DEC memory reference, we - must report the mode of the memory reference from PRINT_OPERAND to - PRINT_OPERAND_ADDRESS. 
*/ +/* In case of a PRE_INC, POST_INC, PRE_DEC, POST_DEC memory reference, + we must report the mode of the memory reference from + TARGET_PRINT_OPERAND to TARGET_PRINT_OPERAND_ADDRESS. */ enum machine_mode output_memory_reference_mode; /* The register number to be used for the PIC offset register. */ unsigned arm_pic_register = INVALID_REGNUM; -/* Set to 1 when a return insn is output, this means that the epilogue - is not needed. */ -int return_used_this_function; - /* Set to 1 after arm_reorg has started. Reset to start at the start of the next function. */ static int after_arm_reorg = 0; -/* The maximum number of insns to be used when loading a constant. */ -static int arm_constant_limit = 3; +enum arm_pcs arm_pcs_default; /* For an explanation of these variables, see final_prescan_insn below. */ int arm_ccfsm_state; /* arm_current_cc is also used for Thumb-2 cond_exec blocks. */ enum arm_cond_code arm_current_cc; + rtx arm_target_insn; int arm_target_label; /* The number of conditionally executed insns, including the current insn. */ @@ -586,6 +795,12 @@ static const char * const arm_condition_codes[] = "hi", "ls", "ge", "lt", "gt", "le", "al", "nv" }; +/* The register numbers in sequence, for passing to arm_gen_load_multiple. */ +int arm_regs_in_sequence[] = +{ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +}; + #define ARM_LSL_NAME (TARGET_UNIFIED_ASM ? "lsl" : "asl") #define streq(string1, string2) (strcmp (string1, string2) == 0) @@ -601,16 +816,52 @@ struct processors enum processor_type core; const char *arch; const unsigned long flags; - bool (* rtx_costs) (rtx, int, int, int *); + const struct tune_params *const tune; }; +const struct tune_params arm_slowmul_tune = +{ + arm_slowmul_rtx_costs, + NULL, + 3 +}; + +const struct tune_params arm_fastmul_tune = +{ + arm_fastmul_rtx_costs, + NULL, + 1 +}; + +const struct tune_params arm_xscale_tune = +{ + arm_xscale_rtx_costs, + xscale_sched_adjust_cost, + 2 +}; + +const struct tune_params arm_9e_tune = +{ + arm_9e_rtx_costs, + NULL, + 1 +}; + +const struct tune_params arm_cortex_a9_tune = +{ + arm_9e_rtx_costs, + cortex_a9_sched_adjust_cost, + 1 +}; + + /* Not all of these give usefully different compilation alternatives, but there is no simple way of generalizing them. */ static const struct processors all_cores[] = { /* ARM Cores */ #define ARM_CORE(NAME, IDENT, ARCH, FLAGS, COSTS) \ - {NAME, arm_none, #ARCH, FLAGS | FL_FOR_ARCH##ARCH, arm_##COSTS##_rtx_costs}, + {NAME, IDENT, #ARCH, FLAGS | FL_FOR_ARCH##ARCH, &arm_##COSTS##_tune}, #include "arm-cores.def" #undef ARM_CORE {NULL, arm_none, NULL, 0, NULL} @@ -619,7 +870,7 @@ static const struct processors all_cores[] = static const struct processors all_architectures[] = { /* ARM Architectures */ - /* We don't specify rtx_costs here as it will be figured out + /* We don't specify tuning costs here as it will be figured out from the core. 
*/ {"armv2", arm2, "2", FL_CO_PROC | FL_MODE26 | FL_FOR_ARCH2, NULL}, @@ -645,74 +896,47 @@ static const struct processors all_architectures[] = {"armv7-a", cortexa8, "7A", FL_CO_PROC | FL_FOR_ARCH7A, NULL}, {"armv7-r", cortexr4, "7R", FL_CO_PROC | FL_FOR_ARCH7R, NULL}, {"armv7-m", cortexm3, "7M", FL_CO_PROC | FL_FOR_ARCH7M, NULL}, + {"armv7e-m", cortexm4, "7EM", FL_CO_PROC | FL_FOR_ARCH7EM, NULL}, {"ep9312", ep9312, "4T", FL_LDSCHED | FL_CIRRUS | FL_FOR_ARCH4, NULL}, {"iwmmxt", iwmmxt, "5TE", FL_LDSCHED | FL_STRONG | FL_FOR_ARCH5TE | FL_XSCALE | FL_IWMMXT , NULL}, + {"iwmmxt2", iwmmxt2, "5TE", FL_LDSCHED | FL_STRONG | FL_FOR_ARCH5TE | FL_XSCALE | FL_IWMMXT , NULL}, {NULL, arm_none, NULL, 0 , NULL} }; -struct arm_cpu_select -{ - const char * string; - const char * name; - const struct processors * processors; -}; - -/* This is a magic structure. The 'string' field is magically filled in - with a pointer to the value specified by the user on the command line - assuming that the user has specified such a value. */ - -static struct arm_cpu_select arm_select[] = -{ - /* string name processors */ - { NULL, "-mcpu=", all_cores }, - { NULL, "-march=", all_architectures }, - { NULL, "-mtune=", all_cores } -}; -/* Defines representing the indexes into the above table. */ -#define ARM_OPT_SET_CPU 0 -#define ARM_OPT_SET_ARCH 1 -#define ARM_OPT_SET_TUNE 2 +/* These are populated as commandline arguments are processed, or NULL + if not specified. */ +static const struct processors *arm_selected_arch; +static const struct processors *arm_selected_cpu; +static const struct processors *arm_selected_tune; /* The name of the preprocessor macro to define for this architecture. */ char arm_arch_name[] = "__ARM_ARCH_0UNK__"; -struct fpu_desc -{ - const char * name; - enum fputype fpu; -}; - - /* Available values for -mfpu=. */ -static const struct fpu_desc all_fpus[] = -{ - {"fpa", FPUTYPE_FPA}, - {"fpe2", FPUTYPE_FPA_EMU2}, - {"fpe3", FPUTYPE_FPA_EMU2}, - {"maverick", FPUTYPE_MAVERICK}, - {"vfp", FPUTYPE_VFP}, - {"vfp3", FPUTYPE_VFP3}, - {"neon", FPUTYPE_NEON} -}; - - -/* Floating point models used by the different hardware. - See fputype in arm.h. */ - -static const enum fputype fp_model_for_fpu[] = -{ - /* No FP hardware. 
*/ - ARM_FP_MODEL_UNKNOWN, /* FPUTYPE_NONE */ - ARM_FP_MODEL_FPA, /* FPUTYPE_FPA */ - ARM_FP_MODEL_FPA, /* FPUTYPE_FPA_EMU2 */ - ARM_FP_MODEL_FPA, /* FPUTYPE_FPA_EMU3 */ - ARM_FP_MODEL_MAVERICK, /* FPUTYPE_MAVERICK */ - ARM_FP_MODEL_VFP, /* FPUTYPE_VFP */ - ARM_FP_MODEL_VFP, /* FPUTYPE_VFP3 */ - ARM_FP_MODEL_VFP /* FPUTYPE_NEON */ +static const struct arm_fpu_desc all_fpus[] = +{ + {"fpa", ARM_FP_MODEL_FPA, 0, VFP_NONE, false, false}, + {"fpe2", ARM_FP_MODEL_FPA, 2, VFP_NONE, false, false}, + {"fpe3", ARM_FP_MODEL_FPA, 3, VFP_NONE, false, false}, + {"maverick", ARM_FP_MODEL_MAVERICK, 0, VFP_NONE, false, false}, + {"vfp", ARM_FP_MODEL_VFP, 2, VFP_REG_D16, false, false}, + {"vfpv3", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, false, false}, + {"vfpv3-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, false, true}, + {"vfpv3-d16", ARM_FP_MODEL_VFP, 3, VFP_REG_D16, false, false}, + {"vfpv3-d16-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_D16, false, true}, + {"vfpv3xd", ARM_FP_MODEL_VFP, 3, VFP_REG_SINGLE, false, false}, + {"vfpv3xd-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_SINGLE, false, true}, + {"neon", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, true , false}, + {"neon-fp16", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, true , true }, + {"vfpv4", ARM_FP_MODEL_VFP, 4, VFP_REG_D32, false, true}, + {"vfpv4-d16", ARM_FP_MODEL_VFP, 4, VFP_REG_D16, false, true}, + {"fpv4-sp-d16", ARM_FP_MODEL_VFP, 4, VFP_REG_SINGLE, false, true}, + {"neon-vfpv4", ARM_FP_MODEL_VFP, 4, VFP_REG_D32, true, true}, + /* Compatibility aliases. */ + {"vfp3", ARM_FP_MODEL_VFP, 3, VFP_REG_D32, false, false}, }; @@ -733,6 +957,23 @@ static const struct float_abi all_float_abis[] = }; +struct fp16_format +{ + const char *name; + enum arm_fp16_format_type fp16_format_type; +}; + + +/* Available values for -mfp16-format=. */ + +static const struct fp16_format all_fp16_formats[] = +{ + {"none", ARM_FP16_FORMAT_NONE}, + {"ieee", ARM_FP16_FORMAT_IEEE}, + {"alternative", ARM_FP16_FORMAT_ALTERNATIVE} +}; + + struct abi_name { const char *name; @@ -761,6 +1002,13 @@ enum tls_reloc { TLS_LE32 }; +/* The maximum number of insns to be used when loading a constant. */ +inline static int +arm_constant_limit (bool size_p) +{ + return size_p ? 1 : current_tune->constant_limit; +} + /* Emit an insn that's a simple single-set. Both the operands must be known to be valid. */ inline static rtx @@ -890,6 +1138,154 @@ arm_init_libfuncs (void) set_optab_libfunc (umod_optab, DImode, NULL); set_optab_libfunc (smod_optab, SImode, NULL); set_optab_libfunc (umod_optab, SImode, NULL); + + /* Half-precision float operations. The compiler handles all operations + with NULL libfuncs by converting the SFmode. */ + switch (arm_fp16_format) + { + case ARM_FP16_FORMAT_IEEE: + case ARM_FP16_FORMAT_ALTERNATIVE: + + /* Conversions. */ + set_conv_libfunc (trunc_optab, HFmode, SFmode, + (arm_fp16_format == ARM_FP16_FORMAT_IEEE + ? "__gnu_f2h_ieee" + : "__gnu_f2h_alternative")); + set_conv_libfunc (sext_optab, SFmode, HFmode, + (arm_fp16_format == ARM_FP16_FORMAT_IEEE + ? "__gnu_h2f_ieee" + : "__gnu_h2f_alternative")); + + /* Arithmetic. */ + set_optab_libfunc (add_optab, HFmode, NULL); + set_optab_libfunc (sdiv_optab, HFmode, NULL); + set_optab_libfunc (smul_optab, HFmode, NULL); + set_optab_libfunc (neg_optab, HFmode, NULL); + set_optab_libfunc (sub_optab, HFmode, NULL); + + /* Comparisons. 
*/ + set_optab_libfunc (eq_optab, HFmode, NULL); + set_optab_libfunc (ne_optab, HFmode, NULL); + set_optab_libfunc (lt_optab, HFmode, NULL); + set_optab_libfunc (le_optab, HFmode, NULL); + set_optab_libfunc (ge_optab, HFmode, NULL); + set_optab_libfunc (gt_optab, HFmode, NULL); + set_optab_libfunc (unord_optab, HFmode, NULL); + break; + + default: + break; + } + + if (TARGET_AAPCS_BASED) + synchronize_libfunc = init_one_libfunc ("__sync_synchronize"); +} + +/* On AAPCS systems, this is the "struct __va_list". */ +static GTY(()) tree va_list_type; + +/* Return the type to use as __builtin_va_list. */ +static tree +arm_build_builtin_va_list (void) +{ + tree va_list_name; + tree ap_field; + + if (!TARGET_AAPCS_BASED) + return std_build_builtin_va_list (); + + /* AAPCS \S 7.1.4 requires that va_list be a typedef for a type + defined as: + + struct __va_list + { + void *__ap; + }; + + The C Library ABI further reinforces this definition in \S + 4.1. + + We must follow this definition exactly. The structure tag + name is visible in C++ mangled names, and thus forms a part + of the ABI. The field name may be used by people who + #include <stdarg.h>. */ + /* Create the type. */ + va_list_type = lang_hooks.types.make_type (RECORD_TYPE); + /* Give it the required name. */ + va_list_name = build_decl (BUILTINS_LOCATION, + TYPE_DECL, + get_identifier ("__va_list"), + va_list_type); + DECL_ARTIFICIAL (va_list_name) = 1; + TYPE_NAME (va_list_type) = va_list_name; + /* Create the __ap field. */ + ap_field = build_decl (BUILTINS_LOCATION, + FIELD_DECL, + get_identifier ("__ap"), + ptr_type_node); + DECL_ARTIFICIAL (ap_field) = 1; + DECL_FIELD_CONTEXT (ap_field) = va_list_type; + TYPE_FIELDS (va_list_type) = ap_field; + /* Compute its layout. */ + layout_type (va_list_type); + + return va_list_type; +} + +/* Return an expression of type "void *" pointing to the next + available argument in a variable-argument list. VALIST is the + user-level va_list object, of type __builtin_va_list. */ +static tree +arm_extract_valist_ptr (tree valist) +{ + if (TREE_TYPE (valist) == error_mark_node) + return error_mark_node; + + /* On an AAPCS target, the pointer is stored within "struct + va_list". */ + if (TARGET_AAPCS_BASED) + { + tree ap_field = TYPE_FIELDS (TREE_TYPE (valist)); + valist = build3 (COMPONENT_REF, TREE_TYPE (ap_field), + valist, ap_field, NULL_TREE); + } + + return valist; +} + +/* Implement TARGET_EXPAND_BUILTIN_VA_START. */ +static void +arm_expand_builtin_va_start (tree valist, rtx nextarg) +{ + valist = arm_extract_valist_ptr (valist); + std_expand_builtin_va_start (valist, nextarg); +} + +/* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */ +static tree +arm_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, + gimple_seq *post_p) +{ + valist = arm_extract_valist_ptr (valist); + return std_gimplify_va_arg_expr (valist, type, pre_p, post_p); +} + +/* Lookup NAME in SEL. */ + +static const struct processors * +arm_find_cpu (const char *name, const struct processors *sel, const char *desc) +{ + if (!(name && *name)) + return NULL; + + for (; sel->name != NULL; sel++) + { + if (streq (name, sel->name)) + return sel; + } + + error ("bad value (%s) for %s switch", name, desc); + return NULL; } /* Implement TARGET_HANDLE_OPTION. 
*/ @@ -900,11 +1296,11 @@ arm_handle_option (size_t code, const char *arg, int value ATTRIBUTE_UNUSED) switch (code) { case OPT_march_: - arm_select[1].string = arg; + arm_selected_arch = arm_find_cpu(arg, all_architectures, "-march"); return true; case OPT_mcpu_: - arm_select[0].string = arg; + arm_selected_cpu = arm_find_cpu(arg, all_cores, "-mcpu"); return true; case OPT_mhard_float: @@ -916,7 +1312,7 @@ arm_handle_option (size_t code, const char *arg, int value ATTRIBUTE_UNUSED) return true; case OPT_mtune_: - arm_select[2].string = arg; + arm_selected_tune = arm_find_cpu(arg, all_cores, "-mtune"); return true; default: @@ -1010,91 +1406,61 @@ arm_target_help (void) } -/* Fix up any incompatible options that the user has specified. - This has now turned into a maze. */ -void -arm_override_options (void) +/* Fix up any incompatible options that the user has specified. */ +static void +arm_option_override (void) { unsigned i; - enum processor_type target_arch_cpu = arm_none; - - /* Set up the flags based on the cpu/architecture selected by the user. */ - for (i = ARRAY_SIZE (arm_select); i--;) - { - struct arm_cpu_select * ptr = arm_select + i; - - if (ptr->string != NULL && ptr->string[0] != '\0') - { - const struct processors * sel; - - for (sel = ptr->processors; sel->name != NULL; sel++) - if (streq (ptr->string, sel->name)) - { - /* Set the architecture define. */ - if (i != ARM_OPT_SET_TUNE) - sprintf (arm_arch_name, "__ARM_ARCH_%s__", sel->arch); - - /* Determine the processor core for which we should - tune code-generation. */ - if (/* -mcpu= is a sensible default. */ - i == ARM_OPT_SET_CPU - /* -mtune= overrides -mcpu= and -march=. */ - || i == ARM_OPT_SET_TUNE) - arm_tune = (enum processor_type) (sel - ptr->processors); - - /* Remember the CPU associated with this architecture. - If no other option is used to set the CPU type, - we'll use this to guess the most suitable tuning - options. */ - if (i == ARM_OPT_SET_ARCH) - target_arch_cpu = sel->core; - - if (i != ARM_OPT_SET_TUNE) - { - /* If we have been given an architecture and a processor - make sure that they are compatible. We only generate - a warning though, and we prefer the CPU over the - architecture. */ - if (insn_flags != 0 && (insn_flags ^ sel->flags)) - warning (0, "switch -mcpu=%s conflicts with -march= switch", - ptr->string); - - insn_flags = sel->flags; - } - break; - } +#ifdef SUBTARGET_OVERRIDE_OPTIONS + SUBTARGET_OVERRIDE_OPTIONS; +#endif - if (sel->name == NULL) - error ("bad value (%s) for %s switch", ptr->string, ptr->name); - } + if (arm_selected_arch) + { + if (arm_selected_cpu) + { + /* Check for conflict between mcpu and march. */ + if ((arm_selected_cpu->flags ^ arm_selected_arch->flags) & ~FL_TUNE) + { + warning (0, "switch -mcpu=%s conflicts with -march=%s switch", + arm_selected_cpu->name, arm_selected_arch->name); + /* -march wins for code generation. + -mcpu wins for default tuning. */ + if (!arm_selected_tune) + arm_selected_tune = arm_selected_cpu; + + arm_selected_cpu = arm_selected_arch; + } + else + /* -mcpu wins. */ + arm_selected_arch = NULL; + } + else + /* Pick a CPU based on the architecture. */ + arm_selected_cpu = arm_selected_arch; } - /* Guess the tuning options from the architecture if necessary. */ - if (arm_tune == arm_none) - arm_tune = target_arch_cpu; - /* If the user did not specify a processor, choose one for them. 
*/ - if (insn_flags == 0) + if (!arm_selected_cpu) { const struct processors * sel; unsigned int sought; - enum processor_type cpu; - cpu = TARGET_CPU_DEFAULT; - if (cpu == arm_none) + arm_selected_cpu = &all_cores[TARGET_CPU_DEFAULT]; + if (!arm_selected_cpu->name) { #ifdef SUBTARGET_CPU_DEFAULT /* Use the subtarget default CPU if none was specified by configure. */ - cpu = SUBTARGET_CPU_DEFAULT; + arm_selected_cpu = &all_cores[SUBTARGET_CPU_DEFAULT]; #endif /* Default to ARM6. */ - if (cpu == arm_none) - cpu = arm6; + if (!arm_selected_cpu->name) + arm_selected_cpu = &all_cores[arm6]; } - sel = &all_cores[cpu]; + sel = arm_selected_cpu; insn_flags = sel->flags; /* Now check to see if the user has specified some command line @@ -1155,30 +1521,63 @@ arm_override_options (void) sel = best_fit; } - insn_flags = sel->flags; + arm_selected_cpu = sel; } - sprintf (arm_arch_name, "__ARM_ARCH_%s__", sel->arch); - arm_default_cpu = (enum processor_type) (sel - all_cores); - if (arm_tune == arm_none) - arm_tune = arm_default_cpu; } - /* The processor for which we should tune should now have been - chosen. */ - gcc_assert (arm_tune != arm_none); + gcc_assert (arm_selected_cpu); + /* The selected cpu may be an architecture, so lookup tuning by core ID. */ + if (!arm_selected_tune) + arm_selected_tune = &all_cores[arm_selected_cpu->core]; - tune_flags = all_cores[(int)arm_tune].flags; - if (optimize_size) - targetm.rtx_costs = arm_size_rtx_costs; + sprintf (arm_arch_name, "__ARM_ARCH_%s__", arm_selected_cpu->arch); + insn_flags = arm_selected_cpu->flags; + + arm_tune = arm_selected_tune->core; + tune_flags = arm_selected_tune->flags; + current_tune = arm_selected_tune->tune; + + if (target_fp16_format_name) + { + for (i = 0; i < ARRAY_SIZE (all_fp16_formats); i++) + { + if (streq (all_fp16_formats[i].name, target_fp16_format_name)) + { + arm_fp16_format = all_fp16_formats[i].fp16_format_type; + break; + } + } + if (i == ARRAY_SIZE (all_fp16_formats)) + error ("invalid __fp16 format option: -mfp16-format=%s", + target_fp16_format_name); + } + else + arm_fp16_format = ARM_FP16_FORMAT_NONE; + + if (target_abi_name) + { + for (i = 0; i < ARRAY_SIZE (arm_all_abis); i++) + { + if (streq (arm_all_abis[i].name, target_abi_name)) + { + arm_abi = arm_all_abis[i].abi_type; + break; + } + } + if (i == ARRAY_SIZE (arm_all_abis)) + error ("invalid ABI option: -mabi=%s", target_abi_name); + } else - targetm.rtx_costs = all_cores[(int)arm_tune].rtx_costs; + arm_abi = ARM_DEFAULT_ABI; /* Make sure that the processor choice does not conflict with any of the other command line choices. */ if (TARGET_ARM && !(insn_flags & FL_NOTM)) error ("target CPU does not support ARM mode"); - if (TARGET_INTERWORK && !(insn_flags & FL_THUMB)) + /* BPABI targets use linker tricks to allow interworking on cores + without thumb support. */ + if (TARGET_INTERWORK && !((insn_flags & FL_THUMB) || TARGET_BPABI)) { warning (0, "target CPU does not support interworking" ); target_flags &= ~MASK_INTERWORK; @@ -1199,7 +1598,7 @@ arm_override_options (void) /* Callee super interworking implies thumb interworking. Adding this to the flags here simplifies the logic elsewhere. */ if (TARGET_THUMB && TARGET_CALLEE_INTERWORKING) - target_flags |= MASK_INTERWORK; + target_flags |= MASK_INTERWORK; /* TARGET_BACKTRACE calls leaf_function_p, which causes a crash if done from here where no function is being compiled currently. 
*/ @@ -1209,9 +1608,6 @@ arm_override_options (void) if (TARGET_ARM && TARGET_CALLEE_INTERWORKING) warning (0, "enabling callee interworking support is only meaningful when compiling for the Thumb"); - if (TARGET_ARM && TARGET_CALLER_INTERWORKING) - warning (0, "enabling caller interworking support is only meaningful when compiling for the Thumb"); - if (TARGET_APCS_STACK && !TARGET_APCS_FRAME) { warning (0, "-mapcs-stack-check incompatible with -mno-apcs-frame"); @@ -1247,17 +1643,42 @@ arm_override_options (void) arm_arch6 = (insn_flags & FL_ARCH6) != 0; arm_arch6k = (insn_flags & FL_ARCH6K) != 0; arm_arch_notm = (insn_flags & FL_NOTM) != 0; + arm_arch7 = (insn_flags & FL_ARCH7) != 0; + arm_arch7em = (insn_flags & FL_ARCH7EM) != 0; arm_arch_thumb2 = (insn_flags & FL_THUMB2) != 0; arm_arch_xscale = (insn_flags & FL_XSCALE) != 0; arm_arch_cirrus = (insn_flags & FL_CIRRUS) != 0; arm_ld_sched = (tune_flags & FL_LDSCHED) != 0; arm_tune_strongarm = (tune_flags & FL_STRONG) != 0; - thumb_code = (TARGET_ARM == 0); + thumb_code = TARGET_ARM == 0; + thumb1_code = TARGET_THUMB1 != 0; arm_tune_wbuf = (tune_flags & FL_WBUF) != 0; arm_tune_xscale = (tune_flags & FL_XSCALE) != 0; arm_arch_iwmmxt = (insn_flags & FL_IWMMXT) != 0; arm_arch_hwdiv = (insn_flags & FL_DIV) != 0; + arm_tune_cortex_a9 = (arm_tune == cortexa9) != 0; + + /* If we are not using the default (ARM mode) section anchor offset + ranges, then set the correct ranges now. */ + if (TARGET_THUMB1) + { + /* Thumb-1 LDR instructions cannot have negative offsets. + Permissible positive offset ranges are 5-bit (for byte loads), + 6-bit (for halfword loads), or 7-bit (for word loads). + Empirical results suggest a 7-bit anchor range gives the best + overall code size. */ + targetm.min_anchor_offset = 0; + targetm.max_anchor_offset = 127; + } + else if (TARGET_THUMB2) + { + /* The minimum is set such that the total size of the block + for a particular anchor is 248 + 1 + 4095 bytes, which is + divisible by eight, ensuring natural spacing of anchors. */ + targetm.min_anchor_offset = -248; + targetm.max_anchor_offset = 4095; + } /* V5 code we generate is completely interworking capable, so we turn off TARGET_INTERWORK here to avoid many tests later on. */ @@ -1270,29 +1691,12 @@ arm_override_options (void) if (arm_arch5) target_flags &= ~MASK_INTERWORK; - if (target_abi_name) - { - for (i = 0; i < ARRAY_SIZE (arm_all_abis); i++) - { - if (streq (arm_all_abis[i].name, target_abi_name)) - { - arm_abi = arm_all_abis[i].abi_type; - break; - } - } - if (i == ARRAY_SIZE (arm_all_abis)) - error ("invalid ABI option: -mabi=%s", target_abi_name); - } - else - arm_abi = ARM_DEFAULT_ABI; - if (TARGET_IWMMXT && !ARM_DOUBLEWORD_ALIGN) error ("iwmmxt requires an AAPCS compatible ABI for proper operation"); if (TARGET_IWMMXT_ABI && !TARGET_IWMMXT) error ("iwmmxt abi requires an iwmmxt capable cpu"); - arm_fp_model = ARM_FP_MODEL_UNKNOWN; if (target_fpu_name == NULL && target_fpe_name != NULL) { if (streq (target_fpe_name, "2")) @@ -1303,52 +1707,62 @@ arm_override_options (void) error ("invalid floating point emulation option: -mfpe=%s", target_fpe_name); } - if (target_fpu_name != NULL) - { - /* The user specified a FPU. 
*/ - for (i = 0; i < ARRAY_SIZE (all_fpus); i++) - { - if (streq (all_fpus[i].name, target_fpu_name)) - { - arm_fpu_arch = all_fpus[i].fpu; - arm_fpu_tune = arm_fpu_arch; - arm_fp_model = fp_model_for_fpu[arm_fpu_arch]; - break; - } - } - if (arm_fp_model == ARM_FP_MODEL_UNKNOWN) - error ("invalid floating point option: -mfpu=%s", target_fpu_name); - } - else + + if (target_fpu_name == NULL) { #ifdef FPUTYPE_DEFAULT - /* Use the default if it is specified for this platform. */ - arm_fpu_arch = FPUTYPE_DEFAULT; - arm_fpu_tune = FPUTYPE_DEFAULT; + target_fpu_name = FPUTYPE_DEFAULT; #else - /* Pick one based on CPU type. */ - /* ??? Some targets assume FPA is the default. - if ((insn_flags & FL_VFP) != 0) - arm_fpu_arch = FPUTYPE_VFP; - else - */ if (arm_arch_cirrus) - arm_fpu_arch = FPUTYPE_MAVERICK; + target_fpu_name = "maverick"; else - arm_fpu_arch = FPUTYPE_FPA_EMU2; + target_fpu_name = "fpe2"; #endif - if (tune_flags & FL_CO_PROC && arm_fpu_arch == FPUTYPE_FPA_EMU2) - arm_fpu_tune = FPUTYPE_FPA; - else - arm_fpu_tune = arm_fpu_arch; - arm_fp_model = fp_model_for_fpu[arm_fpu_arch]; - gcc_assert (arm_fp_model != ARM_FP_MODEL_UNKNOWN); } - if (target_float_abi_name != NULL) + arm_fpu_desc = NULL; + for (i = 0; i < ARRAY_SIZE (all_fpus); i++) { - /* The user specified a FP ABI. */ - for (i = 0; i < ARRAY_SIZE (all_float_abis); i++) + if (streq (all_fpus[i].name, target_fpu_name)) + { + arm_fpu_desc = &all_fpus[i]; + break; + } + } + + if (!arm_fpu_desc) + { + error ("invalid floating point option: -mfpu=%s", target_fpu_name); + return; + } + + switch (arm_fpu_desc->model) + { + case ARM_FP_MODEL_FPA: + if (arm_fpu_desc->rev == 2) + arm_fpu_attr = FPU_FPE2; + else if (arm_fpu_desc->rev == 3) + arm_fpu_attr = FPU_FPE3; + else + arm_fpu_attr = FPU_FPA; + break; + + case ARM_FP_MODEL_MAVERICK: + arm_fpu_attr = FPU_MAVERICK; + break; + + case ARM_FP_MODEL_VFP: + arm_fpu_attr = FPU_VFP; + break; + + default: + gcc_unreachable(); + } + + if (target_float_abi_name != NULL) + { + /* The user specified a FP ABI. */ + for (i = 0; i < ARRAY_SIZE (all_float_abis); i++) { if (streq (all_float_abis[i].name, target_float_abi_name)) { @@ -1363,8 +1777,18 @@ arm_override_options (void) else arm_float_abi = TARGET_DEFAULT_FLOAT_ABI; - if (arm_float_abi == ARM_FLOAT_ABI_HARD && TARGET_VFP) - sorry ("-mfloat-abi=hard and VFP"); + if (TARGET_AAPCS_BASED + && (arm_fpu_desc->model == ARM_FP_MODEL_FPA)) + error ("FPA is unsupported in the AAPCS"); + + if (TARGET_AAPCS_BASED) + { + if (TARGET_CALLER_INTERWORKING) + error ("AAPCS does not support -mcaller-super-interworking"); + else + if (TARGET_CALLEE_INTERWORKING) + error ("AAPCS does not support -mcallee-super-interworking"); + } /* FPA and iWMMXt are incompatible because the insn encodings overlap. VFP and iWMMXt can theoretically coexist, but it's unlikely such silicon @@ -1376,15 +1800,40 @@ arm_override_options (void) if (TARGET_THUMB2 && TARGET_IWMMXT) sorry ("Thumb-2 iWMMXt"); + /* __fp16 support currently assumes the core has ldrh. */ + if (!arm_arch4 && arm_fp16_format != ARM_FP16_FORMAT_NONE) + sorry ("__fp16 and no ldrh"); + /* If soft-float is specified then don't use FPU. 
*/ if (TARGET_SOFT_FLOAT) - arm_fpu_arch = FPUTYPE_NONE; + arm_fpu_attr = FPU_NONE; + + if (TARGET_AAPCS_BASED) + { + if (arm_abi == ARM_ABI_IWMMXT) + arm_pcs_default = ARM_PCS_AAPCS_IWMMXT; + else if (arm_float_abi == ARM_FLOAT_ABI_HARD + && TARGET_HARD_FLOAT + && TARGET_VFP) + arm_pcs_default = ARM_PCS_AAPCS_VFP; + else + arm_pcs_default = ARM_PCS_AAPCS; + } + else + { + if (arm_float_abi == ARM_FLOAT_ABI_HARD && TARGET_VFP) + sorry ("-mfloat-abi=hard and VFP"); + + if (arm_abi == ARM_ABI_APCS) + arm_pcs_default = ARM_PCS_APCS; + else + arm_pcs_default = ARM_PCS_ATPCS; + } /* For arm2/3 there is no need to do any scheduling if there is only a floating point emulator, or we are doing software floating-point. */ if ((TARGET_SOFT_FLOAT - || arm_fpu_tune == FPUTYPE_FPA_EMU2 - || arm_fpu_tune == FPUTYPE_FPA_EMU3) + || (TARGET_FPA && arm_fpu_desc->rev)) && (tune_flags & FL_MODE32) == 0) flag_schedule_insns = flag_schedule_insns_after_reload = 0; @@ -1403,7 +1852,7 @@ arm_override_options (void) /* Use the cp15 method if it is available. */ if (target_thread_pointer == TP_AUTO) { - if (arm_arch6k && !TARGET_THUMB) + if (arm_arch6k && !TARGET_THUMB1) target_thread_pointer = TP_CP15; else target_thread_pointer = TP_SOFT; @@ -1465,8 +1914,16 @@ arm_override_options (void) arm_pic_register = pic_register; } - /* ??? We might want scheduling for thumb2. */ - if (TARGET_THUMB && flag_schedule_insns) + /* Enable -mfix-cortex-m3-ldrd by default for Cortex-M3 cores. */ + if (fix_cm3_ldrd == 2) + { + if (arm_selected_cpu->core == cortexm3) + fix_cm3_ldrd = 1; + else + fix_cm3_ldrd = 0; + } + + if (TARGET_THUMB1 && flag_schedule_insns) { /* Don't warn since it's on by default in -O2. */ flag_schedule_insns = 0; @@ -1474,32 +1931,35 @@ arm_override_options (void) if (optimize_size) { - arm_constant_limit = 1; - /* If optimizing for size, bump the number of instructions that we are prepared to conditionally execute (even on a StrongARM). */ max_insns_skipped = 6; } else { - /* For processors with load scheduling, it never costs more than - 2 cycles to load a constant, and the load scheduler may well - reduce that to 1. */ - if (arm_ld_sched) - arm_constant_limit = 1; - - /* On XScale the longer latency of a load makes it more difficult - to achieve a good schedule, so it's faster to synthesize - constants that can be done in two insns. */ - if (arm_tune_xscale) - arm_constant_limit = 2; - /* StrongARM has early execution of branches, so a sequence that is worth skipping is shorter. */ if (arm_tune_strongarm) max_insns_skipped = 3; } + /* Hot/Cold partitioning is not currently supported, since we can't + handle literal pool placement in that case. */ + if (flag_reorder_blocks_and_partition) + { + inform (input_location, + "-freorder-blocks-and-partition not supported on this architecture"); + flag_reorder_blocks_and_partition = 0; + flag_reorder_blocks = 1; + } + + if (flag_pic) + /* Hoisting PIC address calculations more aggressively provides a small, + but measurable, size reduction for PIC code. Therefore, we decrease + the bar for unrestricted expression hoisting to the cost of PIC address + calculation, which is 2 instructions. */ + maybe_set_param_value (PARAM_GCSE_UNRESTRICTED_COST, 2); + /* Register global variables with the garbage collector. 
*/ arm_add_gc_roots (); } @@ -1588,7 +2048,7 @@ arm_compute_func_type (void) if (optimize > 0 && (TREE_NOTHROW (current_function_decl) || !(flag_unwind_tables - || (flag_exceptions && !USING_SJLJ_EXCEPTIONS))) + || (flag_exceptions && arm_except_unwind_info () != UI_SJLJ))) && TREE_THIS_VOLATILE (current_function_decl)) type |= ARM_FT_VOLATILE; @@ -1632,6 +2092,84 @@ arm_allocate_stack_slots_for_args (void) } +/* Output assembler code for a block containing the constant parts + of a trampoline, leaving space for the variable parts. + + On the ARM, (if r8 is the static chain regnum, and remembering that + referencing pc adds an offset of 8) the trampoline looks like: + ldr r8, [pc, #0] + ldr pc, [pc] + .word static chain value + .word function's address + XXX FIXME: When the trampoline returns, r8 will be clobbered. */ + +static void +arm_asm_trampoline_template (FILE *f) +{ + if (TARGET_ARM) + { + asm_fprintf (f, "\tldr\t%r, [%r, #0]\n", STATIC_CHAIN_REGNUM, PC_REGNUM); + asm_fprintf (f, "\tldr\t%r, [%r, #0]\n", PC_REGNUM, PC_REGNUM); + } + else if (TARGET_THUMB2) + { + /* The Thumb-2 trampoline is similar to the arm implementation. + Unlike 16-bit Thumb, we enter the stub in thumb mode. */ + asm_fprintf (f, "\tldr.w\t%r, [%r, #4]\n", + STATIC_CHAIN_REGNUM, PC_REGNUM); + asm_fprintf (f, "\tldr.w\t%r, [%r, #4]\n", PC_REGNUM, PC_REGNUM); + } + else + { + ASM_OUTPUT_ALIGN (f, 2); + fprintf (f, "\t.code\t16\n"); + fprintf (f, ".Ltrampoline_start:\n"); + asm_fprintf (f, "\tpush\t{r0, r1}\n"); + asm_fprintf (f, "\tldr\tr0, [%r, #8]\n", PC_REGNUM); + asm_fprintf (f, "\tmov\t%r, r0\n", STATIC_CHAIN_REGNUM); + asm_fprintf (f, "\tldr\tr0, [%r, #8]\n", PC_REGNUM); + asm_fprintf (f, "\tstr\tr0, [%r, #4]\n", SP_REGNUM); + asm_fprintf (f, "\tpop\t{r0, %r}\n", PC_REGNUM); + } + assemble_aligned_integer (UNITS_PER_WORD, const0_rtx); + assemble_aligned_integer (UNITS_PER_WORD, const0_rtx); +} + +/* Emit RTL insns to initialize the variable parts of a trampoline. */ + +static void +arm_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) +{ + rtx fnaddr, mem, a_tramp; + + emit_block_move (m_tramp, assemble_trampoline_template (), + GEN_INT (TRAMPOLINE_SIZE), BLOCK_OP_NORMAL); + + mem = adjust_address (m_tramp, SImode, TARGET_32BIT ? 8 : 12); + emit_move_insn (mem, chain_value); + + mem = adjust_address (m_tramp, SImode, TARGET_32BIT ? 12 : 16); + fnaddr = XEXP (DECL_RTL (fndecl), 0); + emit_move_insn (mem, fnaddr); + + a_tramp = XEXP (m_tramp, 0); + emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"), + LCT_NORMAL, VOIDmode, 2, a_tramp, Pmode, + plus_constant (a_tramp, TRAMPOLINE_SIZE), Pmode); +} + +/* Thumb trampolines should be entered in thumb mode, so set + the bottom bit of the address. */ + +static rtx +arm_trampoline_adjust_address (rtx addr) +{ + if (TARGET_THUMB) + addr = expand_simple_binop (Pmode, IOR, addr, const1_rtx, + NULL, 0, OPTAB_LIB_WIDEN); + return addr; +} + /* Return 1 if it is possible to return using a single instruction. If SIBLING is non-null, this is a test for a return before a sibling call. SIBLING is the call insn, so we can examine its register usage. 
*/ @@ -1830,11 +2368,33 @@ const_ok_for_op (HOST_WIDE_INT i, enum rtx_code code) switch (code) { case PLUS: + case COMPARE: + case EQ: + case NE: + case GT: + case LE: + case LT: + case GE: + case GEU: + case LTU: + case GTU: + case LEU: + case UNORDERED: + case ORDERED: + case UNEQ: + case UNGE: + case UNLT: + case UNGT: + case UNLE: return const_ok_for_arm (ARM_SIGN_EXTEND (-i)); case MINUS: /* Should only occur with (MINUS I reg) => rsb */ case XOR: + return 0; + case IOR: + if (TARGET_THUMB2) + return const_ok_for_arm (ARM_SIGN_EXTEND (~i)); return 0; case AND: @@ -1884,20 +2444,29 @@ arm_split_constant (enum rtx_code code, enum machine_mode mode, rtx insn, && !cond && (arm_gen_constant (code, mode, NULL_RTX, val, target, source, 1, 0) - > arm_constant_limit + (code != SET))) + > (arm_constant_limit (optimize_function_for_size_p (cfun)) + + (code != SET)))) { if (code == SET) { /* Currently SET is the only monadic value for CODE, all the rest are diadic. */ - emit_set_insn (target, GEN_INT (val)); + if (TARGET_USE_MOVT) + arm_emit_movpair (target, GEN_INT (val)); + else + emit_set_insn (target, GEN_INT (val)); + return 1; } else { rtx temp = subtargets ? gen_reg_rtx (mode) : target; - emit_set_insn (temp, GEN_INT (val)); + if (TARGET_USE_MOVT) + arm_emit_movpair (temp, GEN_INT (val)); + else + emit_set_insn (temp, GEN_INT (val)); + /* For MINUS, the value is subtracted from, since we never have subtraction of a constant. */ if (code == MINUS) @@ -1914,20 +2483,24 @@ arm_split_constant (enum rtx_code code, enum machine_mode mode, rtx insn, 1); } -/* Return the number of ARM instructions required to synthesize the given - constant. */ +/* Return the number of instructions required to synthesize the given + constant, if we start emitting them from bit-position I. */ static int count_insns_for_constant (HOST_WIDE_INT remainder, int i) { HOST_WIDE_INT temp1; + int step_size = TARGET_ARM ? 2 : 1; int num_insns = 0; + + gcc_assert (TARGET_ARM || i == 0); + do { int end; if (i <= 0) i += 32; - if (remainder & (3 << (i - 2))) + if (remainder & (((1 << step_size) - 1) << (i - step_size))) { end = i - 8; if (end < 0) @@ -1936,13 +2509,77 @@ count_insns_for_constant (HOST_WIDE_INT remainder, int i) | ((i < end) ? (0xff >> (32 - end)) : 0)); remainder &= ~temp1; num_insns++; - i -= 6; + i -= 8 - step_size; } - i -= 2; + i -= step_size; } while (remainder); return num_insns; } +static int +find_best_start (unsigned HOST_WIDE_INT remainder) +{ + int best_consecutive_zeros = 0; + int i; + int best_start = 0; + + /* If we aren't targetting ARM, the best place to start is always at + the bottom. */ + if (! TARGET_ARM) + return 0; + + for (i = 0; i < 32; i += 2) + { + int consecutive_zeros = 0; + + if (!(remainder & (3 << i))) + { + while ((i < 32) && !(remainder & (3 << i))) + { + consecutive_zeros += 2; + i += 2; + } + if (consecutive_zeros > best_consecutive_zeros) + { + best_consecutive_zeros = consecutive_zeros; + best_start = i - consecutive_zeros; + } + i -= 2; + } + } + + /* So long as it won't require any more insns to do so, it's + desirable to emit a small constant (in bits 0...9) in the last + insn. This way there is more chance that it can be combined with + a later addressing insn to form a pre-indexed load or store + operation. 
Consider: + + *((volatile int *)0xe0000100) = 1; + *((volatile int *)0xe0000110) = 2; + + We want this to wind up as: + + mov rA, #0xe0000000 + mov rB, #1 + str rB, [rA, #0x100] + mov rB, #2 + str rB, [rA, #0x110] + + rather than having to synthesize both large constants from scratch. + + Therefore, we calculate how many insns would be required to emit + the constant starting from `best_start', and also starting from + zero (i.e. with bit 31 first to be output). If `best_start' doesn't + yield a shorter sequence, we may as well use zero. */ + if (best_start != 0 + && ((((unsigned HOST_WIDE_INT) 1) << best_start) < remainder) + && (count_insns_for_constant (remainder, 0) <= + count_insns_for_constant (remainder, best_start))) + best_start = 0; + + return best_start; +} + /* Emit an instruction with the indicated PATTERN. If COND is non-NULL, conditionalize the execution of the instruction on COND being true. */ @@ -1966,8 +2603,8 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, { int can_invert = 0; int can_negate = 0; + int final_invert = 0; int can_negate_initial = 0; - int can_shift = 0; int i; int num_bits_set = 0; int set_sign_bit_copies = 0; @@ -1977,6 +2614,7 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, int insns = 0; unsigned HOST_WIDE_INT temp1, temp2; unsigned HOST_WIDE_INT remainder = val & 0xffffffff; + int step_size = TARGET_ARM ? 2 : 1; /* Find out which operations are safe for a given CODE. Also do a quick check for degenerate cases; these can occur when DImode operations @@ -1985,7 +2623,6 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, { case SET: can_invert = 1; - can_shift = 1; can_negate = 1; break; @@ -2003,15 +2640,20 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, GEN_INT (ARM_SIGN_EXTEND (val)))); return 1; } + if (remainder == 0) { if (reload_completed && rtx_equal_p (target, source)) return 0; + if (generate) emit_constant_insn (cond, gen_rtx_SET (VOIDmode, target, source)); return 1; } + + if (TARGET_THUMB2) + can_invert = 1; break; case AND: @@ -2045,14 +2687,15 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, return 1; } - /* We don't know how to handle other cases yet. */ - gcc_assert (remainder == 0xffffffff); - - if (generate) - emit_constant_insn (cond, - gen_rtx_SET (VOIDmode, target, - gen_rtx_NOT (mode, source))); - return 1; + if (remainder == 0xffffffff) + { + if (generate) + emit_constant_insn (cond, + gen_rtx_SET (VOIDmode, target, + gen_rtx_NOT (mode, source))); + return 1; + } + break; case MINUS: /* We treat MINUS as (val - source), since (source - val) is always @@ -2099,6 +2742,7 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, /* Calculate a few attributes that may be useful for specific optimizations. */ + /* Count number of leading zeros. */ for (i = 31; i >= 0; i--) { if ((remainder & (1 << i)) == 0) @@ -2107,6 +2751,7 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, break; } + /* Count number of leading 1's. */ for (i = 31; i >= 0; i--) { if ((remainder & (1 << i)) != 0) @@ -2115,6 +2760,7 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, break; } + /* Count number of trailing zero's. */ for (i = 0; i <= 31; i++) { if ((remainder & (1 << i)) == 0) @@ -2123,6 +2769,7 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, break; } + /* Count number of trailing 1's. 
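+     (e.g. the constant 0x0000ffff has sixteen trailing 1's, while
+     0xfff00000 has none).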
*/ for (i = 0; i <= 31; i++) { if ((remainder & (1 << i)) != 0) @@ -2310,6 +2957,17 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, if (code == XOR) break; + /* Convert. + x = y | constant ( which is composed of set_sign_bit_copies of leading 1s + and the remainder 0s for e.g. 0xfff00000) + x = ~(~(y ashift set_sign_bit_copies) lshiftrt set_sign_bit_copies) + + This can be done in 2 instructions by using shifts with mov or mvn. + e.g. for + x = x | 0xfff00000; + we generate. + mvn r0, r0, asl #12 + mvn r0, r0, lsr #12 */ if (set_sign_bit_copies > 8 && (val & (-1 << (32 - set_sign_bit_copies))) == val) { @@ -2335,6 +2993,16 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, return 2; } + /* Convert + x = y | constant (which has set_zero_bit_copies number of trailing ones). + to + x = ~((~y lshiftrt set_zero_bit_copies) ashift set_zero_bit_copies). + + For eg. r0 = r0 | 0xfff + mvn r0, r0, lsr #12 + mvn r0, r0, asl #12 + + */ if (set_zero_bit_copies > 8 && (remainder & ((1 << set_zero_bit_copies) - 1)) == remainder) { @@ -2360,6 +3028,13 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, return 2; } + /* This will never be reached for Thumb2 because orn is a valid + instruction. This is for Thumb1 and the ARM 32 bit cases. + + x = y | constant (such that ~constant is a valid constant) + Transform this to + x = ~(~y & ~constant). + */ if (const_ok_for_arm (temp1 = ARM_SIGN_EXTEND (~val))) { if (generate) @@ -2469,10 +3144,27 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, if (remainder & (1 << i)) num_bits_set++; - if (code == AND || (can_invert && num_bits_set > 16)) - remainder = (~remainder) & 0xffffffff; + if ((code == AND) + || (code != IOR && can_invert && num_bits_set > 16)) + remainder ^= 0xffffffff; else if (code == PLUS && num_bits_set > 16) remainder = (-remainder) & 0xffffffff; + + /* For XOR, if more than half the bits are set and there's a sequence + of more than 8 consecutive ones in the pattern then we can XOR by the + inverted constant and then invert the final result; this may save an + instruction and might also lead to the final mvn being merged with + some other operation. */ + else if (code == XOR && num_bits_set > 16 + && (count_insns_for_constant (remainder ^ 0xffffffff, + find_best_start + (remainder ^ 0xffffffff)) + < count_insns_for_constant (remainder, + find_best_start (remainder)))) + { + remainder ^= 0xffffffff; + final_invert = 1; + } else { can_invert = 0; @@ -2491,63 +3183,8 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, /* ??? Use thumb2 replicated constants when the high and low halfwords are the same. */ { - int best_start = 0; - if (!TARGET_THUMB2) - { - int best_consecutive_zeros = 0; - - for (i = 0; i < 32; i += 2) - { - int consecutive_zeros = 0; - - if (!(remainder & (3 << i))) - { - while ((i < 32) && !(remainder & (3 << i))) - { - consecutive_zeros += 2; - i += 2; - } - if (consecutive_zeros > best_consecutive_zeros) - { - best_consecutive_zeros = consecutive_zeros; - best_start = i - consecutive_zeros; - } - i -= 2; - } - } - - /* So long as it won't require any more insns to do so, it's - desirable to emit a small constant (in bits 0...9) in the last - insn. This way there is more chance that it can be combined with - a later addressing insn to form a pre-indexed load or store - operation. 
Consider: - - *((volatile int *)0xe0000100) = 1; - *((volatile int *)0xe0000110) = 2; - - We want this to wind up as: - - mov rA, #0xe0000000 - mov rB, #1 - str rB, [rA, #0x100] - mov rB, #2 - str rB, [rA, #0x110] - - rather than having to synthesize both large constants from scratch. - - Therefore, we calculate how many insns would be required to emit - the constant starting from `best_start', and also starting from - zero (i.e. with bit 31 first to be output). If `best_start' doesn't - yield a shorter sequence, we may as well use zero. */ - if (best_start != 0 - && ((((unsigned HOST_WIDE_INT) 1) << best_start) < remainder) - && (count_insns_for_constant (remainder, 0) <= - count_insns_for_constant (remainder, best_start))) - best_start = 0; - } - /* Now start emitting the insns. */ - i = best_start; + i = find_best_start (remainder); do { int end; @@ -2575,7 +3212,7 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, } else { - if (remainder && subtargets) + if ((final_invert || remainder) && subtargets) new_src = gen_reg_rtx (mode); else new_src = target; @@ -2610,21 +3247,23 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, code = PLUS; insns++; - if (TARGET_ARM) - i -= 6; - else - i -= 7; + i -= 8 - step_size; } /* Arm allows rotates by a multiple of two. Thumb-2 allows arbitrary shifts. */ - if (TARGET_ARM) - i -= 2; - else - i--; + i -= step_size; } while (remainder); } + if (final_invert) + { + if (generate) + emit_constant_insn (cond, gen_rtx_SET (VOIDmode, target, + gen_rtx_NOT (mode, source))); + insns++; + } + return insns; } @@ -2633,13 +3272,82 @@ arm_gen_constant (enum rtx_code code, enum machine_mode mode, rtx cond, immediate value easier to load. */ enum rtx_code -arm_canonicalize_comparison (enum rtx_code code, enum machine_mode mode, - rtx * op1) +arm_canonicalize_comparison (enum rtx_code code, rtx *op0, rtx *op1) { - unsigned HOST_WIDE_INT i = INTVAL (*op1); - unsigned HOST_WIDE_INT maxval; + enum machine_mode mode; + unsigned HOST_WIDE_INT i, maxval; + + mode = GET_MODE (*op0); + if (mode == VOIDmode) + mode = GET_MODE (*op1); + maxval = (((unsigned HOST_WIDE_INT) 1) << (GET_MODE_BITSIZE(mode) - 1)) - 1; + /* For DImode, we have GE/LT/GEU/LTU comparisons. In ARM mode + we can also use cmp/cmpeq for GTU/LEU. GT/LE must be either + reversed or (for constant OP1) adjusted to GE/LT. Similarly + for GTU/LEU in Thumb mode. */ + if (mode == DImode) + { + rtx tem; + + /* To keep things simple, always use the Cirrus cfcmp64 if it is + available. */ + if (TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK) + return code; + + if (code == GT || code == LE + || (!TARGET_ARM && (code == GTU || code == LEU))) + { + /* Missing comparison. First try to use an available + comparison. */ + if (GET_CODE (*op1) == CONST_INT) + { + i = INTVAL (*op1); + switch (code) + { + case GT: + case LE: + if (i != maxval + && arm_const_double_by_immediates (GEN_INT (i + 1))) + { + *op1 = GEN_INT (i + 1); + return code == GT ? GE : LT; + } + break; + case GTU: + case LEU: + if (i != ~((unsigned HOST_WIDE_INT) 0) + && arm_const_double_by_immediates (GEN_INT (i + 1))) + { + *op1 = GEN_INT (i + 1); + return code == GTU ? GEU : LTU; + } + break; + default: + gcc_unreachable (); + } + } + + /* If that did not work, reverse the condition. */ + tem = *op0; + *op0 = *op1; + *op1 = tem; + return swap_condition (code); + } + + return code; + } + + /* Comparisons smaller than DImode. Only adjust comparisons against + an out-of-range constant. 
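+     For example (GT x 4095) cannot use 4095 directly, since 0xfff is
+     not a valid ARM immediate, but it is equivalent to (GE x 4096),
+     whose constant is encodable; the switch below makes this kind of
+     plus/minus one adjustment for each comparison code.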
*/ + if (GET_CODE (*op1) != CONST_INT + || const_ok_for_arm (INTVAL (*op1)) + || const_ok_for_arm (- INTVAL (*op1))) + return code; + + i = INTVAL (*op1); + switch (code) { case EQ: @@ -2696,17 +3404,22 @@ arm_canonicalize_comparison (enum rtx_code code, enum machine_mode mode, /* Define how to find the value returned by a function. */ -rtx -arm_function_value(const_tree type, const_tree func ATTRIBUTE_UNUSED) +static rtx +arm_function_value(const_tree type, const_tree func, + bool outgoing ATTRIBUTE_UNUSED) { enum machine_mode mode; int unsignedp ATTRIBUTE_UNUSED; rtx r ATTRIBUTE_UNUSED; mode = TYPE_MODE (type); + + if (TARGET_AAPCS_BASED) + return aapcs_allocate_return_reg (mode, type, func); + /* Promote integer types. */ if (INTEGRAL_TYPE_P (type)) - PROMOTE_FUNCTION_MODE (mode, unsignedp, type); + mode = arm_promote_function_mode (type, mode, &unsignedp, func, 1); /* Promotes small structs returned in a register to full-word size for big-endian AAPCS. */ @@ -2720,7 +3433,88 @@ arm_function_value(const_tree type, const_tree func ATTRIBUTE_UNUSED) } } - return LIBCALL_VALUE(mode); + return LIBCALL_VALUE (mode); +} + +static int +libcall_eq (const void *p1, const void *p2) +{ + return rtx_equal_p ((const_rtx) p1, (const_rtx) p2); +} + +static hashval_t +libcall_hash (const void *p1) +{ + return hash_rtx ((const_rtx) p1, VOIDmode, NULL, NULL, FALSE); +} + +static void +add_libcall (htab_t htab, rtx libcall) +{ + *htab_find_slot (htab, libcall, INSERT) = libcall; +} + +static bool +arm_libcall_uses_aapcs_base (const_rtx libcall) +{ + static bool init_done = false; + static htab_t libcall_htab; + + if (!init_done) + { + init_done = true; + + libcall_htab = htab_create (31, libcall_hash, libcall_eq, + NULL); + add_libcall (libcall_htab, + convert_optab_libfunc (sfloat_optab, SFmode, SImode)); + add_libcall (libcall_htab, + convert_optab_libfunc (sfloat_optab, DFmode, SImode)); + add_libcall (libcall_htab, + convert_optab_libfunc (sfloat_optab, SFmode, DImode)); + add_libcall (libcall_htab, + convert_optab_libfunc (sfloat_optab, DFmode, DImode)); + + add_libcall (libcall_htab, + convert_optab_libfunc (ufloat_optab, SFmode, SImode)); + add_libcall (libcall_htab, + convert_optab_libfunc (ufloat_optab, DFmode, SImode)); + add_libcall (libcall_htab, + convert_optab_libfunc (ufloat_optab, SFmode, DImode)); + add_libcall (libcall_htab, + convert_optab_libfunc (ufloat_optab, DFmode, DImode)); + + add_libcall (libcall_htab, + convert_optab_libfunc (sext_optab, SFmode, HFmode)); + add_libcall (libcall_htab, + convert_optab_libfunc (trunc_optab, HFmode, SFmode)); + add_libcall (libcall_htab, + convert_optab_libfunc (sfix_optab, DImode, DFmode)); + add_libcall (libcall_htab, + convert_optab_libfunc (ufix_optab, DImode, DFmode)); + add_libcall (libcall_htab, + convert_optab_libfunc (sfix_optab, DImode, SFmode)); + add_libcall (libcall_htab, + convert_optab_libfunc (ufix_optab, DImode, SFmode)); + } + + return libcall && htab_find (libcall_htab, libcall) != NULL; +} + +rtx +arm_libcall_value (enum machine_mode mode, const_rtx libcall) +{ + if (TARGET_AAPCS_BASED && arm_pcs_default != ARM_PCS_AAPCS + && GET_MODE_CLASS (mode) == MODE_FLOAT) + { + /* The following libcalls return their result in integer registers, + even though they return a floating point value. 
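+	 For instance, on an EABI target the SImode to DFmode conversion
+	 helper (__aeabi_i2d) hands its result back in r0/r1 even under a
+	 VFP calling convention, because the RTABI helpers always follow
+	 the base AAPCS.  (Helper name given for illustration only; the
+	 exact set is whatever the optabs above resolve to.)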
*/ + if (arm_libcall_uses_aapcs_base (libcall)) + return gen_rtx_REG (mode, ARG_REGISTER(1)); + + } + + return LIBCALL_VALUE (mode); } /* Determine the amount of memory needed to store the possible return @@ -2730,10 +3524,12 @@ arm_apply_result_size (void) { int size = 16; - if (TARGET_ARM) + if (TARGET_32BIT) { if (TARGET_HARD_FLOAT_ABI) { + if (TARGET_VFP) + size += 32; if (TARGET_FPA) size += 12; if (TARGET_MAVERICK) @@ -2746,27 +3542,56 @@ arm_apply_result_size (void) return size; } -/* Decide whether a type should be returned in memory (true) - or in a register (false). This is called by the macro - TARGET_RETURN_IN_MEMORY. */ -bool -arm_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED) +/* Decide whether TYPE should be returned in memory (true) + or in a register (false). FNTYPE is the type of the function making + the call. */ +static bool +arm_return_in_memory (const_tree type, const_tree fntype) { HOST_WIDE_INT size; - size = int_size_in_bytes (type); + size = int_size_in_bytes (type); /* Negative if not fixed size. */ + + if (TARGET_AAPCS_BASED) + { + /* Simple, non-aggregate types (ie not including vectors and + complex) are always returned in a register (or registers). + We don't care about which register here, so we can short-cut + some of the detail. */ + if (!AGGREGATE_TYPE_P (type) + && TREE_CODE (type) != VECTOR_TYPE + && TREE_CODE (type) != COMPLEX_TYPE) + return false; + + /* Any return value that is no larger than one word can be + returned in r0. */ + if (((unsigned HOST_WIDE_INT) size) <= UNITS_PER_WORD) + return false; + + /* Check any available co-processors to see if they accept the + type as a register candidate (VFP, for example, can return + some aggregates in consecutive registers). These aren't + available if the call is variadic. */ + if (aapcs_select_return_coproc (type, fntype) >= 0) + return false; + + /* Vector values should be returned using ARM registers, not + memory (unless they're over 16 bytes, which will break since + we only have four call-clobbered registers to play with). */ + if (TREE_CODE (type) == VECTOR_TYPE) + return (size < 0 || size > (4 * UNITS_PER_WORD)); + + /* The rest go in memory. */ + return true; + } - /* Vector values should be returned using ARM registers, not memory (unless - they're over 16 bytes, which will break since we only have four - call-clobbered registers to play with). */ if (TREE_CODE (type) == VECTOR_TYPE) return (size < 0 || size > (4 * UNITS_PER_WORD)); if (!AGGREGATE_TYPE_P (type) && - !(TARGET_AAPCS_BASED && TREE_CODE (type) == COMPLEX_TYPE)) - /* All simple types are returned in registers. - For AAPCS, complex types are treated the same as aggregates. */ - return 0; + (TREE_CODE (type) != VECTOR_TYPE)) + /* All simple types are returned in registers. */ + return false; if (arm_abi != ARM_ABI_APCS) { @@ -2783,7 +3608,7 @@ arm_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED) the aggregate is either huge or of variable size, and in either case we will want to return it via memory and not in a register. */ if (size < 0 || size > UNITS_PER_WORD) - return 1; + return true; if (TREE_CODE (type) == RECORD_TYPE) { @@ -2799,37 +3624,37 @@ arm_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED) have been created by C++. */ for (field = TYPE_FIELDS (type); field && TREE_CODE (field) != FIELD_DECL; - field = TREE_CHAIN (field)) + field = DECL_CHAIN (field)) continue; if (field == NULL) - return 0; /* An empty structure. Allowed by an extension to ANSI C. 
*/ + return false; /* An empty structure. Allowed by an extension to ANSI C. */ /* Check that the first field is valid for returning in a register. */ /* ... Floats are not allowed */ if (FLOAT_TYPE_P (TREE_TYPE (field))) - return 1; + return true; /* ... Aggregates that are not themselves valid for returning in a register are not allowed. */ if (arm_return_in_memory (TREE_TYPE (field), NULL_TREE)) - return 1; + return true; /* Now check the remaining fields, if any. Only bitfields are allowed, since they are not addressable. */ - for (field = TREE_CHAIN (field); + for (field = DECL_CHAIN (field); field; - field = TREE_CHAIN (field)) + field = DECL_CHAIN (field)) { if (TREE_CODE (field) != FIELD_DECL) continue; if (!DECL_BIT_FIELD_TYPE (field)) - return 1; + return true; } - return 0; + return false; } if (TREE_CODE (type) == UNION_TYPE) @@ -2840,24 +3665,24 @@ arm_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED) integral, or can be returned in an integer register. */ for (field = TYPE_FIELDS (type); field; - field = TREE_CHAIN (field)) + field = DECL_CHAIN (field)) { if (TREE_CODE (field) != FIELD_DECL) continue; if (FLOAT_TYPE_P (TREE_TYPE (field))) - return 1; + return true; if (arm_return_in_memory (TREE_TYPE (field), NULL_TREE)) - return 1; + return true; } - return 0; + return false; } #endif /* not ARM_WINCE */ /* Return all other types in memory. */ - return 1; + return true; } /* Indicate whether or not words of a double are in big-endian order. */ @@ -2882,1831 +3707,3365 @@ arm_float_words_big_endian (void) return 1; } -/* Initialize a variable CUM of type CUMULATIVE_ARGS - for a call to a function whose data type is FNTYPE. - For a library call, FNTYPE is NULL. */ -void -arm_init_cumulative_args (CUMULATIVE_ARGS *pcum, tree fntype, - rtx libname ATTRIBUTE_UNUSED, - tree fndecl ATTRIBUTE_UNUSED) +const struct pcs_attribute_arg { - /* On the ARM, the offset starts at 0. */ - pcum->nregs = 0; - pcum->iwmmxt_nregs = 0; - pcum->can_split = true; + const char *arg; + enum arm_pcs value; +} pcs_attribute_args[] = + { + {"aapcs", ARM_PCS_AAPCS}, + {"aapcs-vfp", ARM_PCS_AAPCS_VFP}, +#if 0 + /* We could recognize these, but changes would be needed elsewhere + * to implement them. */ + {"aapcs-iwmmxt", ARM_PCS_AAPCS_IWMMXT}, + {"atpcs", ARM_PCS_ATPCS}, + {"apcs", ARM_PCS_APCS}, +#endif + {NULL, ARM_PCS_UNKNOWN} + }; - /* Varargs vectors are treated the same as long long. - named_count avoids having to change the way arm handles 'named' */ - pcum->named_count = 0; - pcum->nargs = 0; +static enum arm_pcs +arm_pcs_from_attribute (tree attr) +{ + const struct pcs_attribute_arg *ptr; + const char *arg; - if (TARGET_REALLY_IWMMXT && fntype) + /* Get the value of the argument. */ + if (TREE_VALUE (attr) == NULL_TREE + || TREE_CODE (TREE_VALUE (attr)) != STRING_CST) + return ARM_PCS_UNKNOWN; + + arg = TREE_STRING_POINTER (TREE_VALUE (attr)); + + /* Check it against the list of known arguments. */ + for (ptr = pcs_attribute_args; ptr->arg != NULL; ptr++) + if (streq (arg, ptr->arg)) + return ptr->value; + + /* An unrecognized interrupt type. */ + return ARM_PCS_UNKNOWN; +} + +/* Get the PCS variant to use for this call. TYPE is the function's type + specification, DECL is the specific declartion. DECL may be null if + the call could be indirect or if this is a library call. 
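+   For example (illustrative declaration only), with the base AAPCS in
+   force a prototype such as
+     double dot (double, double) __attribute__ ((pcs ("aapcs-vfp")));
+   requests the VFP variant for calls to that one function; the string
+   is decoded by arm_pcs_from_attribute above and overrides
+   arm_pcs_default below.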
*/ +static enum arm_pcs +arm_get_pcs_model (const_tree type, const_tree decl) +{ + bool user_convention = false; + enum arm_pcs user_pcs = arm_pcs_default; + tree attr; + + gcc_assert (type); + + attr = lookup_attribute ("pcs", TYPE_ATTRIBUTES (type)); + if (attr) { - tree fn_arg; + user_pcs = arm_pcs_from_attribute (TREE_VALUE (attr)); + user_convention = true; + } - for (fn_arg = TYPE_ARG_TYPES (fntype); - fn_arg; - fn_arg = TREE_CHAIN (fn_arg)) - pcum->named_count += 1; + if (TARGET_AAPCS_BASED) + { + /* Detect varargs functions. These always use the base rules + (no argument is ever a candidate for a co-processor + register). */ + bool base_rules = stdarg_p (type); + + if (user_convention) + { + if (user_pcs > ARM_PCS_AAPCS_LOCAL) + sorry ("Non-AAPCS derived PCS variant"); + else if (base_rules && user_pcs != ARM_PCS_AAPCS) + error ("Variadic functions must use the base AAPCS variant"); + } - if (! pcum->named_count) - pcum->named_count = INT_MAX; + if (base_rules) + return ARM_PCS_AAPCS; + else if (user_convention) + return user_pcs; + else if (decl && flag_unit_at_a_time) + { + /* Local functions never leak outside this compilation unit, + so we are free to use whatever conventions are + appropriate. */ + /* FIXME: remove CONST_CAST_TREE when cgraph is constified. */ + struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl)); + if (i && i->local) + return ARM_PCS_AAPCS_LOCAL; + } } + else if (user_convention && user_pcs != arm_pcs_default) + sorry ("PCS variant"); + + /* For everything else we use the target's default. */ + return arm_pcs_default; } -/* Return true if mode/type need doubleword alignment. */ -bool -arm_needs_doubleword_align (enum machine_mode mode, tree type) +static void +aapcs_vfp_cum_init (CUMULATIVE_ARGS *pcum ATTRIBUTE_UNUSED, + const_tree fntype ATTRIBUTE_UNUSED, + rtx libcall ATTRIBUTE_UNUSED, + const_tree fndecl ATTRIBUTE_UNUSED) { - return (GET_MODE_ALIGNMENT (mode) > PARM_BOUNDARY - || (type && TYPE_ALIGN (type) > PARM_BOUNDARY)); + /* Record the unallocated VFP registers. */ + pcum->aapcs_vfp_regs_free = (1 << NUM_VFP_ARG_REGS) - 1; + pcum->aapcs_vfp_reg_alloc = 0; } +/* Walk down the type tree of TYPE counting consecutive base elements. + If *MODEP is VOIDmode, then set it to the first valid floating point + type. If a non-floating point type is found, or if a floating point + type that doesn't match a non-VOIDmode *MODEP is found, then return -1, + otherwise return the count in the sub-tree. */ +static int +aapcs_vfp_sub_candidate (const_tree type, enum machine_mode *modep) +{ + enum machine_mode mode; + HOST_WIDE_INT size; + + switch (TREE_CODE (type)) + { + case REAL_TYPE: + mode = TYPE_MODE (type); + if (mode != DFmode && mode != SFmode) + return -1; -/* Determine where to put an argument to a function. - Value is zero to push the argument on the stack, - or a hard register in which to store the argument. + if (*modep == VOIDmode) + *modep = mode; - MODE is the argument's machine mode. - TYPE is the data type of the argument (as a tree). - This is null for libcalls where that information may - not be available. - CUM is a variable of type CUMULATIVE_ARGS which gives info about - the preceding args and about the function being called. - NAMED is nonzero if this argument is a named parameter - (otherwise it is an extra parameter matching an ellipsis). 
*/ + if (*modep == mode) + return 1; -rtx -arm_function_arg (CUMULATIVE_ARGS *pcum, enum machine_mode mode, - tree type, int named) -{ - int nregs; + break; - /* Varargs vectors are treated the same as long long. - named_count avoids having to change the way arm handles 'named' */ - if (TARGET_IWMMXT_ABI - && arm_vector_mode_supported_p (mode) - && pcum->named_count > pcum->nargs + 1) - { - if (pcum->iwmmxt_nregs <= 9) - return gen_rtx_REG (mode, pcum->iwmmxt_nregs + FIRST_IWMMXT_REGNUM); - else + case COMPLEX_TYPE: + mode = TYPE_MODE (TREE_TYPE (type)); + if (mode != DFmode && mode != SFmode) + return -1; + + if (*modep == VOIDmode) + *modep = mode; + + if (*modep == mode) + return 2; + + break; + + case VECTOR_TYPE: + /* Use V2SImode and V4SImode as representatives of all 64-bit + and 128-bit vector types, whether or not those modes are + supported with the present options. */ + size = int_size_in_bytes (type); + switch (size) { - pcum->can_split = false; - return NULL_RTX; + case 8: + mode = V2SImode; + break; + case 16: + mode = V4SImode; + break; + default: + return -1; } - } - /* Put doubleword aligned quantities in even register pairs. */ - if (pcum->nregs & 1 - && ARM_DOUBLEWORD_ALIGN - && arm_needs_doubleword_align (mode, type)) - pcum->nregs++; + if (*modep == VOIDmode) + *modep = mode; - if (mode == VOIDmode) - /* Pick an arbitrary value for operand 2 of the call insn. */ - return const0_rtx; + /* Vector modes are considered to be opaque: two vectors are + equivalent for the purposes of being homogeneous aggregates + if they are the same size. */ + if (*modep == mode) + return 1; - /* Only allow splitting an arg between regs and memory if all preceding - args were allocated to regs. For args passed by reference we only count - the reference pointer. */ - if (pcum->can_split) - nregs = 1; - else - nregs = ARM_NUM_REGS2 (mode, type); + break; - if (!named || pcum->nregs + nregs > NUM_ARG_REGS) - return NULL_RTX; + case ARRAY_TYPE: + { + int count; + tree index = TYPE_DOMAIN (type); + + /* Can't handle incomplete types. */ + if (!COMPLETE_TYPE_P(type)) + return -1; + + count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep); + if (count == -1 + || !index + || !TYPE_MAX_VALUE (index) + || !host_integerp (TYPE_MAX_VALUE (index), 1) + || !TYPE_MIN_VALUE (index) + || !host_integerp (TYPE_MIN_VALUE (index), 1) + || count < 0) + return -1; + + count *= (1 + tree_low_cst (TYPE_MAX_VALUE (index), 1) + - tree_low_cst (TYPE_MIN_VALUE (index), 1)); + + /* There must be no padding. */ + if (!host_integerp (TYPE_SIZE (type), 1) + || (tree_low_cst (TYPE_SIZE (type), 1) + != count * GET_MODE_BITSIZE (*modep))) + return -1; + + return count; + } + + case RECORD_TYPE: + { + int count = 0; + int sub_count; + tree field; - return gen_rtx_REG (mode, pcum->nregs); -} + /* Can't handle incomplete types. 
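+	   (A complete aggregate such as struct { float x, y, z; } yields
+	   count == 3 with *modep == SFmode and so remains a homogeneous
+	   aggregate candidate; a struct that has only been forward
+	   declared has no size yet and is rejected below.)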
*/ + if (!COMPLETE_TYPE_P(type)) + return -1; -static int -arm_arg_partial_bytes (CUMULATIVE_ARGS *pcum, enum machine_mode mode, - tree type, bool named ATTRIBUTE_UNUSED) -{ - int nregs = pcum->nregs; + for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) + { + if (TREE_CODE (field) != FIELD_DECL) + continue; - if (TARGET_IWMMXT_ABI && arm_vector_mode_supported_p (mode)) - return 0; + sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep); + if (sub_count < 0) + return -1; + count += sub_count; + } - if (NUM_ARG_REGS > nregs - && (NUM_ARG_REGS < nregs + ARM_NUM_REGS2 (mode, type)) - && pcum->can_split) - return (NUM_ARG_REGS - nregs) * UNITS_PER_WORD; + /* There must be no padding. */ + if (!host_integerp (TYPE_SIZE (type), 1) + || (tree_low_cst (TYPE_SIZE (type), 1) + != count * GET_MODE_BITSIZE (*modep))) + return -1; - return 0; -} + return count; + } -/* Variable sized types are passed by reference. This is a GCC - extension to the ARM ABI. */ + case UNION_TYPE: + case QUAL_UNION_TYPE: + { + /* These aren't very interesting except in a degenerate case. */ + int count = 0; + int sub_count; + tree field; -static bool -arm_pass_by_reference (CUMULATIVE_ARGS *cum ATTRIBUTE_UNUSED, - enum machine_mode mode ATTRIBUTE_UNUSED, - const_tree type, bool named ATTRIBUTE_UNUSED) -{ - return type && TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST; -} - -/* Encode the current state of the #pragma [no_]long_calls. */ -typedef enum -{ - OFF, /* No #pragma [no_]long_calls is in effect. */ - LONG, /* #pragma long_calls is in effect. */ - SHORT /* #pragma no_long_calls is in effect. */ -} arm_pragma_enum; + /* Can't handle incomplete types. */ + if (!COMPLETE_TYPE_P(type)) + return -1; -static arm_pragma_enum arm_pragma_long_calls = OFF; + for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) + { + if (TREE_CODE (field) != FIELD_DECL) + continue; -void -arm_pr_long_calls (struct cpp_reader * pfile ATTRIBUTE_UNUSED) -{ - arm_pragma_long_calls = LONG; -} + sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep); + if (sub_count < 0) + return -1; + count = count > sub_count ? count : sub_count; + } -void -arm_pr_no_long_calls (struct cpp_reader * pfile ATTRIBUTE_UNUSED) -{ - arm_pragma_long_calls = SHORT; -} + /* There must be no padding. */ + if (!host_integerp (TYPE_SIZE (type), 1) + || (tree_low_cst (TYPE_SIZE (type), 1) + != count * GET_MODE_BITSIZE (*modep))) + return -1; -void -arm_pr_long_calls_off (struct cpp_reader * pfile ATTRIBUTE_UNUSED) -{ - arm_pragma_long_calls = OFF; -} - -/* Table of machine attributes. */ -const struct attribute_spec arm_attribute_table[] = -{ - /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler } */ - /* Function calls made to this symbol must be done indirectly, because - it may lie outside of the 26 bit addressing range of a normal function - call. */ - { "long_call", 0, 0, false, true, true, NULL }, - /* Whereas these functions are always known to reside within the 26 bit - addressing range. */ - { "short_call", 0, 0, false, true, true, NULL }, - /* Interrupt Service Routines have special prologue and epilogue requirements. */ - { "isr", 0, 1, false, false, false, arm_handle_isr_attribute }, - { "interrupt", 0, 1, false, false, false, arm_handle_isr_attribute }, - { "naked", 0, 0, true, false, false, arm_handle_fndecl_attribute }, -#ifdef ARM_PE - /* ARM/PE has three new attributes: - interfacearm - ? 
- dllexport - for exporting a function/variable that will live in a dll - dllimport - for importing a function/variable from a dll + return count; + } - Microsoft allows multiple declspecs in one __declspec, separating - them with spaces. We do NOT support this. Instead, use __declspec - multiple times. - */ - { "dllimport", 0, 0, true, false, false, NULL }, - { "dllexport", 0, 0, true, false, false, NULL }, - { "interfacearm", 0, 0, true, false, false, arm_handle_fndecl_attribute }, -#elif TARGET_DLLIMPORT_DECL_ATTRIBUTES - { "dllimport", 0, 0, false, false, false, handle_dll_attribute }, - { "dllexport", 0, 0, false, false, false, handle_dll_attribute }, - { "notshared", 0, 0, false, true, false, arm_handle_notshared_attribute }, -#endif - { NULL, 0, 0, false, false, false, NULL } -}; + default: + break; + } -/* Handle an attribute requiring a FUNCTION_DECL; - arguments as in struct attribute_spec.handler. */ -static tree -arm_handle_fndecl_attribute (tree *node, tree name, tree args ATTRIBUTE_UNUSED, - int flags ATTRIBUTE_UNUSED, bool *no_add_attrs) + return -1; +} + +/* Return true if PCS_VARIANT should use VFP registers. */ +static bool +use_vfp_abi (enum arm_pcs pcs_variant, bool is_double) { - if (TREE_CODE (*node) != FUNCTION_DECL) + if (pcs_variant == ARM_PCS_AAPCS_VFP) { - warning (OPT_Wattributes, "%qs attribute only applies to functions", - IDENTIFIER_POINTER (name)); - *no_add_attrs = true; + static bool seen_thumb1_vfp = false; + + if (TARGET_THUMB1 && !seen_thumb1_vfp) + { + sorry ("Thumb-1 hard-float VFP ABI"); + /* sorry() is not immediately fatal, so only display this once. */ + seen_thumb1_vfp = true; + } + + return true; } - return NULL_TREE; + if (pcs_variant != ARM_PCS_AAPCS_LOCAL) + return false; + + return (TARGET_32BIT && TARGET_VFP && TARGET_HARD_FLOAT && + (TARGET_VFP_DOUBLE || !is_double)); } -/* Handle an "interrupt" or "isr" attribute; - arguments as in struct attribute_spec.handler. */ -static tree -arm_handle_isr_attribute (tree *node, tree name, tree args, int flags, - bool *no_add_attrs) +static bool +aapcs_vfp_is_call_or_return_candidate (enum arm_pcs pcs_variant, + enum machine_mode mode, const_tree type, + enum machine_mode *base_mode, int *count) { - if (DECL_P (*node)) + enum machine_mode new_mode = VOIDmode; + + if (GET_MODE_CLASS (mode) == MODE_FLOAT + || GET_MODE_CLASS (mode) == MODE_VECTOR_INT + || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT) { - if (TREE_CODE (*node) != FUNCTION_DECL) - { - warning (OPT_Wattributes, "%qs attribute only applies to functions", - IDENTIFIER_POINTER (name)); - *no_add_attrs = true; - } - /* FIXME: the argument if any is checked for type attributes; - should it be checked for decl ones? */ + *count = 1; + new_mode = mode; } - else + else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT) { - if (TREE_CODE (*node) == FUNCTION_TYPE - || TREE_CODE (*node) == METHOD_TYPE) - { - if (arm_isr_value (args) == ARM_FT_UNKNOWN) - { - warning (OPT_Wattributes, "%qs attribute ignored", - IDENTIFIER_POINTER (name)); - *no_add_attrs = true; - } - } - else if (TREE_CODE (*node) == POINTER_TYPE - && (TREE_CODE (TREE_TYPE (*node)) == FUNCTION_TYPE - || TREE_CODE (TREE_TYPE (*node)) == METHOD_TYPE) - && arm_isr_value (args) != ARM_FT_UNKNOWN) - { - *node = build_variant_type_copy (*node); - TREE_TYPE (*node) = build_type_attribute_variant - (TREE_TYPE (*node), - tree_cons (name, args, TYPE_ATTRIBUTES (TREE_TYPE (*node)))); - *no_add_attrs = true; - } + *count = 2; + new_mode = (mode == DCmode ? 
DFmode : SFmode); + } + else if (type && (mode == BLKmode || TREE_CODE (type) == VECTOR_TYPE)) + { + int ag_count = aapcs_vfp_sub_candidate (type, &new_mode); + + if (ag_count > 0 && ag_count <= 4) + *count = ag_count; else - { - /* Possibly pass this attribute on from the type to a decl. */ - if (flags & ((int) ATTR_FLAG_DECL_NEXT - | (int) ATTR_FLAG_FUNCTION_NEXT - | (int) ATTR_FLAG_ARRAY_NEXT)) - { - *no_add_attrs = true; - return tree_cons (name, args, NULL_TREE); - } - else - { - warning (OPT_Wattributes, "%qs attribute ignored", - IDENTIFIER_POINTER (name)); - } - } + return false; } + else + return false; - return NULL_TREE; + + if (!use_vfp_abi (pcs_variant, ARM_NUM_REGS (new_mode) > 1)) + return false; + + *base_mode = new_mode; + return true; } -#if TARGET_DLLIMPORT_DECL_ATTRIBUTES -/* Handle the "notshared" attribute. This attribute is another way of - requesting hidden visibility. ARM's compiler supports - "__declspec(notshared)"; we support the same thing via an - attribute. */ +static bool +aapcs_vfp_is_return_candidate (enum arm_pcs pcs_variant, + enum machine_mode mode, const_tree type) +{ + int count ATTRIBUTE_UNUSED; + enum machine_mode ag_mode ATTRIBUTE_UNUSED; -static tree -arm_handle_notshared_attribute (tree *node, - tree name ATTRIBUTE_UNUSED, - tree args ATTRIBUTE_UNUSED, - int flags ATTRIBUTE_UNUSED, - bool *no_add_attrs) + if (!use_vfp_abi (pcs_variant, false)) + return false; + return aapcs_vfp_is_call_or_return_candidate (pcs_variant, mode, type, + &ag_mode, &count); +} + +static bool +aapcs_vfp_is_call_candidate (CUMULATIVE_ARGS *pcum, enum machine_mode mode, + const_tree type) { - tree decl = TYPE_NAME (*node); + if (!use_vfp_abi (pcum->pcs_variant, false)) + return false; - if (decl) - { - DECL_VISIBILITY (decl) = VISIBILITY_HIDDEN; - DECL_VISIBILITY_SPECIFIED (decl) = 1; - *no_add_attrs = false; - } - return NULL_TREE; + return aapcs_vfp_is_call_or_return_candidate (pcum->pcs_variant, mode, type, + &pcum->aapcs_vfp_rmode, + &pcum->aapcs_vfp_rcount); } -#endif -/* Return 0 if the attributes for two types are incompatible, 1 if they - are compatible, and 2 if they are nearly compatible (which causes a - warning to be generated). */ -static int -arm_comp_type_attributes (const_tree type1, const_tree type2) +static bool +aapcs_vfp_allocate (CUMULATIVE_ARGS *pcum, enum machine_mode mode, + const_tree type ATTRIBUTE_UNUSED) { - int l1, l2, s1, s2; + int shift = GET_MODE_SIZE (pcum->aapcs_vfp_rmode) / GET_MODE_SIZE (SFmode); + unsigned mask = (1 << (shift * pcum->aapcs_vfp_rcount)) - 1; + int regno; + + for (regno = 0; regno < NUM_VFP_ARG_REGS; regno += shift) + if (((pcum->aapcs_vfp_regs_free >> regno) & mask) == mask) + { + pcum->aapcs_vfp_reg_alloc = mask << regno; + if (mode == BLKmode || (mode == TImode && !TARGET_NEON)) + { + int i; + int rcount = pcum->aapcs_vfp_rcount; + int rshift = shift; + enum machine_mode rmode = pcum->aapcs_vfp_rmode; + rtx par; + if (!TARGET_NEON) + { + /* Avoid using unsupported vector modes. */ + if (rmode == V2SImode) + rmode = DImode; + else if (rmode == V4SImode) + { + rmode = DImode; + rcount *= 2; + rshift /= 2; + } + } + par = gen_rtx_PARALLEL (mode, rtvec_alloc (rcount)); + for (i = 0; i < rcount; i++) + { + rtx tmp = gen_rtx_REG (rmode, + FIRST_VFP_REGNUM + regno + i * rshift); + tmp = gen_rtx_EXPR_LIST + (VOIDmode, tmp, + GEN_INT (i * GET_MODE_SIZE (rmode))); + XVECEXP (par, 0, i) = tmp; + } - /* Check for mismatch of non-default calling convention. 
*/ - if (TREE_CODE (type1) != FUNCTION_TYPE) - return 1; + pcum->aapcs_reg = par; + } + else + pcum->aapcs_reg = gen_rtx_REG (mode, FIRST_VFP_REGNUM + regno); + return true; + } + return false; +} - /* Check for mismatched call attributes. */ - l1 = lookup_attribute ("long_call", TYPE_ATTRIBUTES (type1)) != NULL; - l2 = lookup_attribute ("long_call", TYPE_ATTRIBUTES (type2)) != NULL; - s1 = lookup_attribute ("short_call", TYPE_ATTRIBUTES (type1)) != NULL; - s2 = lookup_attribute ("short_call", TYPE_ATTRIBUTES (type2)) != NULL; +static rtx +aapcs_vfp_allocate_return_reg (enum arm_pcs pcs_variant ATTRIBUTE_UNUSED, + enum machine_mode mode, + const_tree type ATTRIBUTE_UNUSED) +{ + if (!use_vfp_abi (pcs_variant, false)) + return false; - /* Only bother to check if an attribute is defined. */ - if (l1 | l2 | s1 | s2) + if (mode == BLKmode || (mode == TImode && !TARGET_NEON)) { - /* If one type has an attribute, the other must have the same attribute. */ - if ((l1 != l2) || (s1 != s2)) - return 0; + int count; + enum machine_mode ag_mode; + int i; + rtx par; + int shift; + + aapcs_vfp_is_call_or_return_candidate (pcs_variant, mode, type, + &ag_mode, &count); - /* Disallow mixed attributes. */ - if ((l1 & s2) || (l2 & s1)) - return 0; - } + if (!TARGET_NEON) + { + if (ag_mode == V2SImode) + ag_mode = DImode; + else if (ag_mode == V4SImode) + { + ag_mode = DImode; + count *= 2; + } + } + shift = GET_MODE_SIZE(ag_mode) / GET_MODE_SIZE(SFmode); + par = gen_rtx_PARALLEL (mode, rtvec_alloc (count)); + for (i = 0; i < count; i++) + { + rtx tmp = gen_rtx_REG (ag_mode, FIRST_VFP_REGNUM + i * shift); + tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, + GEN_INT (i * GET_MODE_SIZE (ag_mode))); + XVECEXP (par, 0, i) = tmp; + } - /* Check for mismatched ISR attribute. */ - l1 = lookup_attribute ("isr", TYPE_ATTRIBUTES (type1)) != NULL; - if (! l1) - l1 = lookup_attribute ("interrupt", TYPE_ATTRIBUTES (type1)) != NULL; - l2 = lookup_attribute ("isr", TYPE_ATTRIBUTES (type2)) != NULL; - if (! l2) - l1 = lookup_attribute ("interrupt", TYPE_ATTRIBUTES (type2)) != NULL; - if (l1 != l2) - return 0; + return par; + } - return 1; + return gen_rtx_REG (mode, FIRST_VFP_REGNUM); } -/* Assigns default attributes to newly defined type. This is used to - set short_call/long_call attributes for function types of - functions defined inside corresponding #pragma scopes. */ static void -arm_set_default_type_attributes (tree type) +aapcs_vfp_advance (CUMULATIVE_ARGS *pcum ATTRIBUTE_UNUSED, + enum machine_mode mode ATTRIBUTE_UNUSED, + const_tree type ATTRIBUTE_UNUSED) +{ + pcum->aapcs_vfp_regs_free &= ~pcum->aapcs_vfp_reg_alloc; + pcum->aapcs_vfp_reg_alloc = 0; + return; +} + +#define AAPCS_CP(X) \ + { \ + aapcs_ ## X ## _cum_init, \ + aapcs_ ## X ## _is_call_candidate, \ + aapcs_ ## X ## _allocate, \ + aapcs_ ## X ## _is_return_candidate, \ + aapcs_ ## X ## _allocate_return_reg, \ + aapcs_ ## X ## _advance \ + } + +/* Table of co-processors that can be used to pass arguments in + registers. Idealy no arugment should be a candidate for more than + one co-processor table entry, but the table is processed in order + and stops after the first match. If that entry then fails to put + the argument into a co-processor register, the argument will go on + the stack. */ +static struct +{ + /* Initialize co-processor related state in CUMULATIVE_ARGS structure. 
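+     Each slot's hook is invoked once from arm_init_cumulative_args,
+     before any argument has been laid out.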
*/ + void (*cum_init) (CUMULATIVE_ARGS *, const_tree, rtx, const_tree); + + /* Return true if an argument of mode MODE (or type TYPE if MODE is + BLKmode) is a candidate for this co-processor's registers; this + function should ignore any position-dependent state in + CUMULATIVE_ARGS and only use call-type dependent information. */ + bool (*is_call_candidate) (CUMULATIVE_ARGS *, enum machine_mode, const_tree); + + /* Return true if the argument does get a co-processor register; it + should set aapcs_reg to an RTX of the register allocated as is + required for a return from FUNCTION_ARG. */ + bool (*allocate) (CUMULATIVE_ARGS *, enum machine_mode, const_tree); + + /* Return true if a result of mode MODE (or type TYPE if MODE is + BLKmode) is can be returned in this co-processor's registers. */ + bool (*is_return_candidate) (enum arm_pcs, enum machine_mode, const_tree); + + /* Allocate and return an RTX element to hold the return type of a + call, this routine must not fail and will only be called if + is_return_candidate returned true with the same parameters. */ + rtx (*allocate_return_reg) (enum arm_pcs, enum machine_mode, const_tree); + + /* Finish processing this argument and prepare to start processing + the next one. */ + void (*advance) (CUMULATIVE_ARGS *, enum machine_mode, const_tree); +} aapcs_cp_arg_layout[ARM_NUM_COPROC_SLOTS] = + { + AAPCS_CP(vfp) + }; + +#undef AAPCS_CP + +static int +aapcs_select_call_coproc (CUMULATIVE_ARGS *pcum, enum machine_mode mode, + const_tree type) { - /* Add __attribute__ ((long_call)) to all functions, when - inside #pragma long_calls or __attribute__ ((short_call)), - when inside #pragma no_long_calls. */ - if (TREE_CODE (type) == FUNCTION_TYPE || TREE_CODE (type) == METHOD_TYPE) - { - tree type_attr_list, attr_name; - type_attr_list = TYPE_ATTRIBUTES (type); + int i; - if (arm_pragma_long_calls == LONG) - attr_name = get_identifier ("long_call"); - else if (arm_pragma_long_calls == SHORT) - attr_name = get_identifier ("short_call"); - else - return; + for (i = 0; i < ARM_NUM_COPROC_SLOTS; i++) + if (aapcs_cp_arg_layout[i].is_call_candidate (pcum, mode, type)) + return i; - type_attr_list = tree_cons (attr_name, NULL_TREE, type_attr_list); - TYPE_ATTRIBUTES (type) = type_attr_list; - } + return -1; } - -/* Return true if DECL is known to be linked into section SECTION. */ -static bool -arm_function_in_section_p (tree decl, section *section) +static int +aapcs_select_return_coproc (const_tree type, const_tree fntype) { - /* We can only be certain about functions defined in the same - compilation unit. */ - if (!TREE_STATIC (decl)) - return false; - - /* Make sure that SYMBOL always binds to the definition in this - compilation unit. */ - if (!targetm.binds_local_p (decl)) - return false; + /* We aren't passed a decl, so we can't check that a call is local. + However, it isn't clear that that would be a win anyway, since it + might limit some tail-calling opportunities. */ + enum arm_pcs pcs_variant; - /* If DECL_SECTION_NAME is set, assume it is trustworthy. */ - if (!DECL_SECTION_NAME (decl)) + if (fntype) { - /* Only cater for unit-at-a-time mode, where we know that the user - cannot later specify a section for DECL. */ - if (!flag_unit_at_a_time) - return false; + const_tree fndecl = NULL_TREE; - /* Make sure that we will not create a unique section for DECL. 
*/ - if (flag_function_sections || DECL_ONE_ONLY (decl)) - return false; + if (TREE_CODE (fntype) == FUNCTION_DECL) + { + fndecl = fntype; + fntype = TREE_TYPE (fntype); + } + + pcs_variant = arm_get_pcs_model (fntype, fndecl); } + else + pcs_variant = arm_pcs_default; - return function_section (decl) == section; + if (pcs_variant != ARM_PCS_AAPCS) + { + int i; + + for (i = 0; i < ARM_NUM_COPROC_SLOTS; i++) + if (aapcs_cp_arg_layout[i].is_return_candidate (pcs_variant, + TYPE_MODE (type), + type)) + return i; + } + return -1; } -/* Return nonzero if a 32-bit "long_call" should be generated for - a call from the current function to DECL. We generate a long_call - if the function: +static rtx +aapcs_allocate_return_reg (enum machine_mode mode, const_tree type, + const_tree fntype) +{ + /* We aren't passed a decl, so we can't check that a call is local. + However, it isn't clear that that would be a win anyway, since it + might limit some tail-calling opportunities. */ + enum arm_pcs pcs_variant; + int unsignedp ATTRIBUTE_UNUSED; - a. has an __attribute__((long call)) - or b. is within the scope of a #pragma long_calls - or c. the -mlong-calls command line switch has been specified + if (fntype) + { + const_tree fndecl = NULL_TREE; - However we do not generate a long call if the function: + if (TREE_CODE (fntype) == FUNCTION_DECL) + { + fndecl = fntype; + fntype = TREE_TYPE (fntype); + } - d. has an __attribute__ ((short_call)) - or e. is inside the scope of a #pragma no_long_calls - or f. is defined in the same section as the current function. */ + pcs_variant = arm_get_pcs_model (fntype, fndecl); + } + else + pcs_variant = arm_pcs_default; -bool -arm_is_long_call_p (tree decl) -{ - tree attrs; + /* Promote integer types. */ + if (type && INTEGRAL_TYPE_P (type)) + mode = arm_promote_function_mode (type, mode, &unsignedp, fntype, 1); - if (!decl) - return TARGET_LONG_CALLS; + if (pcs_variant != ARM_PCS_AAPCS) + { + int i; - attrs = TYPE_ATTRIBUTES (TREE_TYPE (decl)); - if (lookup_attribute ("short_call", attrs)) - return false; + for (i = 0; i < ARM_NUM_COPROC_SLOTS; i++) + if (aapcs_cp_arg_layout[i].is_return_candidate (pcs_variant, mode, + type)) + return aapcs_cp_arg_layout[i].allocate_return_reg (pcs_variant, + mode, type); + } - /* For "f", be conservative, and only cater for cases in which the - whole of the current function is placed in the same section. */ - if (!flag_reorder_blocks_and_partition - && arm_function_in_section_p (decl, current_function_section ())) - return false; + /* Promotes small structs returned in a register to full-word size + for big-endian AAPCS. */ + if (type && arm_return_in_msb (type)) + { + HOST_WIDE_INT size = int_size_in_bytes (type); + if (size % UNITS_PER_WORD != 0) + { + size += UNITS_PER_WORD - size % UNITS_PER_WORD; + mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0); + } + } - if (lookup_attribute ("long_call", attrs)) - return true; + return gen_rtx_REG (mode, R0_REGNUM); +} - return TARGET_LONG_CALLS; +rtx +aapcs_libcall_value (enum machine_mode mode) +{ + return aapcs_allocate_return_reg (mode, NULL_TREE, NULL_TREE); } -/* Return nonzero if it is ok to make a tail-call to DECL. */ -static bool -arm_function_ok_for_sibcall (tree decl, tree exp ATTRIBUTE_UNUSED) +/* Lay out a function argument using the AAPCS rules. The rule + numbers referred to here are those in the AAPCS. 
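+   As a rough worked example under the base variant, for a call such as
+     f (int a, double d)      (hypothetical prototype)
+   rule C3 rounds the next core register number up from 1 to 2 because
+   the double needs 8-byte alignment, so A lands in r0 and D in r2/r3
+   by rule C4; a further word-sized argument would then fail C4 and C5
+   and go on the stack (C6-C8).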
*/ +static void +aapcs_layout_arg (CUMULATIVE_ARGS *pcum, enum machine_mode mode, + const_tree type, bool named) { - unsigned long func_type; + int nregs, nregs2; + int ncrn; - if (cfun->machine->sibcall_blocked) - return false; + /* We only need to do this once per argument. */ + if (pcum->aapcs_arg_processed) + return; - /* Never tailcall something for which we have no decl, or if we - are in Thumb mode. */ - if (decl == NULL || TARGET_THUMB) - return false; + pcum->aapcs_arg_processed = true; - /* The PIC register is live on entry to VxWorks PLT entries, so we - must make the call before restoring the PIC register. */ - if (TARGET_VXWORKS_RTP && flag_pic && !targetm.binds_local_p (decl)) - return false; + /* Special case: if named is false then we are handling an incoming + anonymous argument which is on the stack. */ + if (!named) + return; + + /* Is this a potential co-processor register candidate? */ + if (pcum->pcs_variant != ARM_PCS_AAPCS) + { + int slot = aapcs_select_call_coproc (pcum, mode, type); + pcum->aapcs_cprc_slot = slot; - /* Cannot tail-call to long calls, since these are out of range of - a branch instruction. */ - if (arm_is_long_call_p (decl)) - return false; + /* We don't have to apply any of the rules from part B of the + preparation phase, these are handled elsewhere in the + compiler. */ - /* If we are interworking and the function is not declared static - then we can't tail-call it unless we know that it exists in this - compilation unit (since it might be a Thumb routine). */ - if (TARGET_INTERWORK && TREE_PUBLIC (decl) && !TREE_ASM_WRITTEN (decl)) - return false; + if (slot >= 0) + { + /* A Co-processor register candidate goes either in its own + class of registers or on the stack. */ + if (!pcum->aapcs_cprc_failed[slot]) + { + /* C1.cp - Try to allocate the argument to co-processor + registers. */ + if (aapcs_cp_arg_layout[slot].allocate (pcum, mode, type)) + return; + + /* C2.cp - Put the argument on the stack and note that we + can't assign any more candidates in this slot. We also + need to note that we have allocated stack space, so that + we won't later try to split a non-cprc candidate between + core registers and the stack. */ + pcum->aapcs_cprc_failed[slot] = true; + pcum->can_split = false; + } - func_type = arm_current_func_type (); - /* Never tailcall from an ISR routine - it needs a special exit sequence. */ - if (IS_INTERRUPT (func_type)) - return false; + /* We didn't get a register, so this argument goes on the + stack. */ + gcc_assert (pcum->can_split == false); + return; + } + } - /* Never tailcall if function may be called with a misaligned SP. */ - if (IS_STACKALIGN (func_type)) - return false; + /* C3 - For double-word aligned arguments, round the NCRN up to the + next even number. */ + ncrn = pcum->aapcs_ncrn; + if ((ncrn & 1) && arm_needs_doubleword_align (mode, type)) + ncrn++; - /* Everything else is ok. */ - return true; -} + nregs = ARM_NUM_REGS2(mode, type); - -/* Addressing mode support functions. */ + /* Sigh, this test should really assert that nregs > 0, but a GCC + extension allows empty structs and then gives them empty size; it + then allows such a structure to be passed by value. For some of + the code below we have to pretend that such an argument has + non-zero size so that we 'locate' it correctly either in + registers or on the stack. */ + gcc_assert (nregs >= 0); -/* Return nonzero if X is a legitimate immediate operand when compiling - for PIC. We know that X satisfies CONSTANT_P and flag_pic is true. 
*/ -int -legitimate_pic_operand_p (rtx x) -{ - if (GET_CODE (x) == SYMBOL_REF - || (GET_CODE (x) == CONST - && GET_CODE (XEXP (x, 0)) == PLUS - && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)) - return 0; + nregs2 = nregs ? nregs : 1; - return 1; -} + /* C4 - Argument fits entirely in core registers. */ + if (ncrn + nregs2 <= NUM_ARG_REGS) + { + pcum->aapcs_reg = gen_rtx_REG (mode, ncrn); + pcum->aapcs_next_ncrn = ncrn + nregs; + return; + } -/* Record that the current function needs a PIC register. Initialize - cfun->machine->pic_reg if we have not already done so. */ + /* C5 - Some core registers left and there are no arguments already + on the stack: split this argument between the remaining core + registers and the stack. */ + if (ncrn < NUM_ARG_REGS && pcum->can_split) + { + pcum->aapcs_reg = gen_rtx_REG (mode, ncrn); + pcum->aapcs_next_ncrn = NUM_ARG_REGS; + pcum->aapcs_partial = (NUM_ARG_REGS - ncrn) * UNITS_PER_WORD; + return; + } -static void -require_pic_register (void) + /* C6 - NCRN is set to 4. */ + pcum->aapcs_next_ncrn = NUM_ARG_REGS; + + /* C7,C8 - arugment goes on the stack. We have nothing to do here. */ + return; +} + +/* Initialize a variable CUM of type CUMULATIVE_ARGS + for a call to a function whose data type is FNTYPE. + For a library call, FNTYPE is NULL. */ +void +arm_init_cumulative_args (CUMULATIVE_ARGS *pcum, tree fntype, + rtx libname, + tree fndecl ATTRIBUTE_UNUSED) { - /* A lot of the logic here is made obscure by the fact that this - routine gets called as part of the rtx cost estimation process. - We don't want those calls to affect any assumptions about the real - function; and further, we can't call entry_of_function() until we - start the real expansion process. */ - if (!crtl->uses_pic_offset_table) + /* Long call handling. */ + if (fntype) + pcum->pcs_variant = arm_get_pcs_model (fntype, fndecl); + else + pcum->pcs_variant = arm_pcs_default; + + if (pcum->pcs_variant <= ARM_PCS_AAPCS_LOCAL) { - gcc_assert (can_create_pseudo_p ()); - if (arm_pic_register != INVALID_REGNUM) + if (arm_libcall_uses_aapcs_base (libname)) + pcum->pcs_variant = ARM_PCS_AAPCS; + + pcum->aapcs_ncrn = pcum->aapcs_next_ncrn = 0; + pcum->aapcs_reg = NULL_RTX; + pcum->aapcs_partial = 0; + pcum->aapcs_arg_processed = false; + pcum->aapcs_cprc_slot = -1; + pcum->can_split = true; + + if (pcum->pcs_variant != ARM_PCS_AAPCS) { - cfun->machine->pic_reg = gen_rtx_REG (Pmode, arm_pic_register); + int i; - /* Play games to avoid marking the function as needing pic - if we are being called as part of the cost-estimation - process. */ - if (current_ir_type () != IR_GIMPLE) - crtl->uses_pic_offset_table = 1; + for (i = 0; i < ARM_NUM_COPROC_SLOTS; i++) + { + pcum->aapcs_cprc_failed[i] = false; + aapcs_cp_arg_layout[i].cum_init (pcum, fntype, libname, fndecl); + } } - else - { - rtx seq; + return; + } - cfun->machine->pic_reg = gen_reg_rtx (Pmode); + /* Legacy ABIs */ - /* Play games to avoid marking the function as needing pic - if we are being called as part of the cost-estimation - process. */ - if (current_ir_type () != IR_GIMPLE) - { - crtl->uses_pic_offset_table = 1; - start_sequence (); + /* On the ARM, the offset starts at 0. */ + pcum->nregs = 0; + pcum->iwmmxt_nregs = 0; + pcum->can_split = true; - arm_load_pic_register (0UL); + /* Varargs vectors are treated the same as long long. 
+ named_count avoids having to change the way arm handles 'named' */ + pcum->named_count = 0; + pcum->nargs = 0; - seq = get_insns (); - end_sequence (); - emit_insn_after (seq, entry_of_function ()); - } - } + if (TARGET_REALLY_IWMMXT && fntype) + { + tree fn_arg; + + for (fn_arg = TYPE_ARG_TYPES (fntype); + fn_arg; + fn_arg = TREE_CHAIN (fn_arg)) + pcum->named_count += 1; + + if (! pcum->named_count) + pcum->named_count = INT_MAX; } } -rtx -legitimize_pic_address (rtx orig, enum machine_mode mode, rtx reg) + +/* Return true if mode/type need doubleword alignment. */ +bool +arm_needs_doubleword_align (enum machine_mode mode, const_tree type) { - if (GET_CODE (orig) == SYMBOL_REF - || GET_CODE (orig) == LABEL_REF) - { - rtx pic_ref, address; - rtx insn; - int subregs = 0; + return (GET_MODE_ALIGNMENT (mode) > PARM_BOUNDARY + || (type && TYPE_ALIGN (type) > PARM_BOUNDARY)); +} - /* If this function doesn't have a pic register, create one now. */ - require_pic_register (); - if (reg == 0) - { - gcc_assert (can_create_pseudo_p ()); - reg = gen_reg_rtx (Pmode); +/* Determine where to put an argument to a function. + Value is zero to push the argument on the stack, + or a hard register in which to store the argument. - subregs = 1; - } + MODE is the argument's machine mode. + TYPE is the data type of the argument (as a tree). + This is null for libcalls where that information may + not be available. + CUM is a variable of type CUMULATIVE_ARGS which gives info about + the preceding args and about the function being called. + NAMED is nonzero if this argument is a named parameter + (otherwise it is an extra parameter matching an ellipsis). - if (subregs) - address = gen_reg_rtx (Pmode); - else - address = reg; + On the ARM, normally the first 16 bytes are passed in registers r0-r3; all + other arguments are passed on the stack. If (NAMED == 0) (which happens + only in assign_parms, since TARGET_SETUP_INCOMING_VARARGS is + defined), say it is passed in the stack (function_prologue will + indeed make it pass in the stack if necessary). */ - if (TARGET_ARM) - emit_insn (gen_pic_load_addr_arm (address, orig)); - else if (TARGET_THUMB2) - emit_insn (gen_pic_load_addr_thumb2 (address, orig)); - else /* TARGET_THUMB1 */ - emit_insn (gen_pic_load_addr_thumb1 (address, orig)); +static rtx +arm_function_arg (CUMULATIVE_ARGS *pcum, enum machine_mode mode, + const_tree type, bool named) +{ + int nregs; - /* VxWorks does not impose a fixed gap between segments; the run-time - gap can be different from the object-file gap. We therefore can't - use GOTOFF unless we are absolutely sure that the symbol is in the - same segment as the GOT. Unfortunately, the flexibility of linker - scripts means that we can't be sure of that in general, so assume - that GOTOFF is never valid on VxWorks. */ - if ((GET_CODE (orig) == LABEL_REF - || (GET_CODE (orig) == SYMBOL_REF && - SYMBOL_REF_LOCAL_P (orig))) - && NEED_GOT_RELOC - && !TARGET_VXWORKS_RTP) - pic_ref = gen_rtx_PLUS (Pmode, cfun->machine->pic_reg, address); + /* Handle the special case quickly. Pick an arbitrary value for op2 of + a call insn (op3 of a call_value insn). */ + if (mode == VOIDmode) + return const0_rtx; + + if (pcum->pcs_variant <= ARM_PCS_AAPCS_LOCAL) + { + aapcs_layout_arg (pcum, mode, type, named); + return pcum->aapcs_reg; + } + + /* Varargs vectors are treated the same as long long. 
+ named_count avoids having to change the way arm handles 'named' */ + if (TARGET_IWMMXT_ABI + && arm_vector_mode_supported_p (mode) + && pcum->named_count > pcum->nargs + 1) + { + if (pcum->iwmmxt_nregs <= 9) + return gen_rtx_REG (mode, pcum->iwmmxt_nregs + FIRST_IWMMXT_REGNUM); else { - pic_ref = gen_const_mem (Pmode, - gen_rtx_PLUS (Pmode, cfun->machine->pic_reg, - address)); + pcum->can_split = false; + return NULL_RTX; } + } - insn = emit_move_insn (reg, pic_ref); + /* Put doubleword aligned quantities in even register pairs. */ + if (pcum->nregs & 1 + && ARM_DOUBLEWORD_ALIGN + && arm_needs_doubleword_align (mode, type)) + pcum->nregs++; - /* Put a REG_EQUAL note on this insn, so that it can be optimized - by loop. */ - set_unique_reg_note (insn, REG_EQUAL, orig); + /* Only allow splitting an arg between regs and memory if all preceding + args were allocated to regs. For args passed by reference we only count + the reference pointer. */ + if (pcum->can_split) + nregs = 1; + else + nregs = ARM_NUM_REGS2 (mode, type); - return reg; - } - else if (GET_CODE (orig) == CONST) - { - rtx base, offset; + if (!named || pcum->nregs + nregs > NUM_ARG_REGS) + return NULL_RTX; - if (GET_CODE (XEXP (orig, 0)) == PLUS - && XEXP (XEXP (orig, 0), 0) == cfun->machine->pic_reg) - return orig; + return gen_rtx_REG (mode, pcum->nregs); +} - if (GET_CODE (XEXP (orig, 0)) == UNSPEC - && XINT (XEXP (orig, 0), 1) == UNSPEC_TLS) - return orig; +static int +arm_arg_partial_bytes (CUMULATIVE_ARGS *pcum, enum machine_mode mode, + tree type, bool named) +{ + int nregs = pcum->nregs; - if (reg == 0) - { - gcc_assert (can_create_pseudo_p ()); - reg = gen_reg_rtx (Pmode); - } + if (pcum->pcs_variant <= ARM_PCS_AAPCS_LOCAL) + { + aapcs_layout_arg (pcum, mode, type, named); + return pcum->aapcs_partial; + } - gcc_assert (GET_CODE (XEXP (orig, 0)) == PLUS); + if (TARGET_IWMMXT_ABI && arm_vector_mode_supported_p (mode)) + return 0; - base = legitimize_pic_address (XEXP (XEXP (orig, 0), 0), Pmode, reg); - offset = legitimize_pic_address (XEXP (XEXP (orig, 0), 1), Pmode, - base == reg ? 0 : reg); + if (NUM_ARG_REGS > nregs + && (NUM_ARG_REGS < nregs + ARM_NUM_REGS2 (mode, type)) + && pcum->can_split) + return (NUM_ARG_REGS - nregs) * UNITS_PER_WORD; - if (GET_CODE (offset) == CONST_INT) - { - /* The base register doesn't really matter, we only want to - test the index for the appropriate mode. */ - if (!arm_legitimate_index_p (mode, offset, SET, 0)) - { - gcc_assert (can_create_pseudo_p ()); - offset = force_reg (Pmode, offset); - } + return 0; +} - if (GET_CODE (offset) == CONST_INT) - return plus_constant (base, INTVAL (offset)); - } +/* Update the data in PCUM to advance over an argument + of mode MODE and data type TYPE. + (TYPE is null for libcalls where that information may not be available.) */ - if (GET_MODE_SIZE (mode) > 4 - && (GET_MODE_CLASS (mode) == MODE_INT - || TARGET_SOFT_FLOAT)) +static void +arm_function_arg_advance (CUMULATIVE_ARGS *pcum, enum machine_mode mode, + const_tree type, bool named) +{ + if (pcum->pcs_variant <= ARM_PCS_AAPCS_LOCAL) + { + aapcs_layout_arg (pcum, mode, type, named); + + if (pcum->aapcs_cprc_slot >= 0) { - emit_insn (gen_addsi3 (reg, base, offset)); - return reg; + aapcs_cp_arg_layout[pcum->aapcs_cprc_slot].advance (pcum, mode, + type); + pcum->aapcs_cprc_slot = -1; } - return gen_rtx_PLUS (Pmode, base, offset); + /* Generic stuff. 
*/ + pcum->aapcs_arg_processed = false; + pcum->aapcs_ncrn = pcum->aapcs_next_ncrn; + pcum->aapcs_reg = NULL_RTX; + pcum->aapcs_partial = 0; + } + else + { + pcum->nargs += 1; + if (arm_vector_mode_supported_p (mode) + && pcum->named_count > pcum->nargs + && TARGET_IWMMXT_ABI) + pcum->iwmmxt_nregs += 1; + else + pcum->nregs += ARM_NUM_REGS2 (mode, type); } - - return orig; } +/* Variable sized types are passed by reference. This is a GCC + extension to the ARM ABI. */ -/* Find a spare register to use during the prolog of a function. */ - -static int -thumb_find_work_register (unsigned long pushed_regs_mask) +static bool +arm_pass_by_reference (CUMULATIVE_ARGS *cum ATTRIBUTE_UNUSED, + enum machine_mode mode ATTRIBUTE_UNUSED, + const_tree type, bool named ATTRIBUTE_UNUSED) { - int reg; - - /* Check the argument registers first as these are call-used. The - register allocation order means that sometimes r3 might be used - but earlier argument registers might not, so check them all. */ - for (reg = LAST_ARG_REGNUM; reg >= 0; reg --) - if (!df_regs_ever_live_p (reg)) - return reg; - - /* Before going on to check the call-saved registers we can try a couple - more ways of deducing that r3 is available. The first is when we are - pushing anonymous arguments onto the stack and we have less than 4 - registers worth of fixed arguments(*). In this case r3 will be part of - the variable argument list and so we can be sure that it will be - pushed right at the start of the function. Hence it will be available - for the rest of the prologue. - (*): ie crtl->args.pretend_args_size is greater than 0. */ - if (cfun->machine->uses_anonymous_args - && crtl->args.pretend_args_size > 0) - return LAST_ARG_REGNUM; - - /* The other case is when we have fixed arguments but less than 4 registers - worth. In this case r3 might be used in the body of the function, but - it is not being used to convey an argument into the function. In theory - we could just check crtl->args.size to see how many bytes are - being passed in argument registers, but it seems that it is unreliable. - Sometimes it will have the value 0 when in fact arguments are being - passed. (See testcase execute/20021111-1.c for an example). So we also - check the args_info.nregs field as well. The problem with this field is - that it makes no allowances for arguments that are passed to the - function but which are not used. Hence we could miss an opportunity - when a function has an unused argument in r3. But it is better to be - safe than to be sorry. */ - if (! cfun->machine->uses_anonymous_args - && crtl->args.size >= 0 - && crtl->args.size <= (LAST_ARG_REGNUM * UNITS_PER_WORD) - && crtl->args.info.nregs < 4) - return LAST_ARG_REGNUM; - - /* Otherwise look for a call-saved register that is going to be pushed. */ - for (reg = LAST_LO_REGNUM; reg > LAST_ARG_REGNUM; reg --) - if (pushed_regs_mask & (1 << reg)) - return reg; - - if (TARGET_THUMB2) - { - /* Thumb-2 can use high regs. */ - for (reg = FIRST_HI_REGNUM; reg < 15; reg ++) - if (pushed_regs_mask & (1 << reg)) - return reg; - } - /* Something went wrong - thumb_compute_save_reg_mask() - should have arranged for a suitable register to be pushed. */ - gcc_unreachable (); + return type && TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST; } + +/* Encode the current state of the #pragma [no_]long_calls. */ +typedef enum +{ + OFF, /* No #pragma [no_]long_calls is in effect. */ + LONG, /* #pragma long_calls is in effect. */ + SHORT /* #pragma no_long_calls is in effect. 
*/ +} arm_pragma_enum; -static GTY(()) int pic_labelno; - -/* Generate code to load the PIC register. In thumb mode SCRATCH is a - low register. */ +static arm_pragma_enum arm_pragma_long_calls = OFF; void -arm_load_pic_register (unsigned long saved_regs ATTRIBUTE_UNUSED) +arm_pr_long_calls (struct cpp_reader * pfile ATTRIBUTE_UNUSED) { - rtx l1, labelno, pic_tmp, pic_tmp2, pic_rtx, pic_reg; - rtx global_offset_table; - - if (crtl->uses_pic_offset_table == 0 || TARGET_SINGLE_PIC_BASE) - return; + arm_pragma_long_calls = LONG; +} - gcc_assert (flag_pic); +void +arm_pr_no_long_calls (struct cpp_reader * pfile ATTRIBUTE_UNUSED) +{ + arm_pragma_long_calls = SHORT; +} - pic_reg = cfun->machine->pic_reg; - if (TARGET_VXWORKS_RTP) +void +arm_pr_long_calls_off (struct cpp_reader * pfile ATTRIBUTE_UNUSED) +{ + arm_pragma_long_calls = OFF; +} + +/* Handle an attribute requiring a FUNCTION_DECL; + arguments as in struct attribute_spec.handler. */ +static tree +arm_handle_fndecl_attribute (tree *node, tree name, tree args ATTRIBUTE_UNUSED, + int flags ATTRIBUTE_UNUSED, bool *no_add_attrs) +{ + if (TREE_CODE (*node) != FUNCTION_DECL) { - pic_rtx = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE); - pic_rtx = gen_rtx_CONST (Pmode, pic_rtx); - emit_insn (gen_pic_load_addr_arm (pic_reg, pic_rtx)); + warning (OPT_Wattributes, "%qE attribute only applies to functions", + name); + *no_add_attrs = true; + } - emit_insn (gen_rtx_SET (Pmode, pic_reg, gen_rtx_MEM (Pmode, pic_reg))); + return NULL_TREE; +} - pic_tmp = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX); - emit_insn (gen_pic_offset_arm (pic_reg, pic_reg, pic_tmp)); +/* Handle an "interrupt" or "isr" attribute; + arguments as in struct attribute_spec.handler. */ +static tree +arm_handle_isr_attribute (tree *node, tree name, tree args, int flags, + bool *no_add_attrs) +{ + if (DECL_P (*node)) + { + if (TREE_CODE (*node) != FUNCTION_DECL) + { + warning (OPT_Wattributes, "%qE attribute only applies to functions", + name); + *no_add_attrs = true; + } + /* FIXME: the argument if any is checked for type attributes; + should it be checked for decl ones? */ } else { - /* We use an UNSPEC rather than a LABEL_REF because this label - never appears in the code stream. */ - - labelno = GEN_INT (pic_labelno++); - l1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, labelno), UNSPEC_PIC_LABEL); - l1 = gen_rtx_CONST (VOIDmode, l1); - - global_offset_table - = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_"); - /* On the ARM the PC register contains 'dot + 8' at the time of the - addition, on the Thumb it is 'dot + 4'. */ - pic_tmp = plus_constant (l1, TARGET_ARM ? 
8 : 4); - if (GOT_PCREL) + if (TREE_CODE (*node) == FUNCTION_TYPE + || TREE_CODE (*node) == METHOD_TYPE) { - pic_tmp2 = gen_rtx_PLUS (Pmode, global_offset_table, pc_rtx); - pic_tmp2 = gen_rtx_CONST (VOIDmode, pic_tmp2); + if (arm_isr_value (args) == ARM_FT_UNKNOWN) + { + warning (OPT_Wattributes, "%qE attribute ignored", + name); + *no_add_attrs = true; + } } - else - pic_tmp2 = gen_rtx_CONST (VOIDmode, global_offset_table); - - pic_rtx = gen_rtx_MINUS (Pmode, pic_tmp2, pic_tmp); - pic_rtx = gen_rtx_CONST (Pmode, pic_rtx); - - if (TARGET_ARM) + else if (TREE_CODE (*node) == POINTER_TYPE + && (TREE_CODE (TREE_TYPE (*node)) == FUNCTION_TYPE + || TREE_CODE (TREE_TYPE (*node)) == METHOD_TYPE) + && arm_isr_value (args) != ARM_FT_UNKNOWN) { - emit_insn (gen_pic_load_addr_arm (pic_reg, pic_rtx)); - emit_insn (gen_pic_add_dot_plus_eight (pic_reg, pic_reg, labelno)); + *node = build_variant_type_copy (*node); + TREE_TYPE (*node) = build_type_attribute_variant + (TREE_TYPE (*node), + tree_cons (name, args, TYPE_ATTRIBUTES (TREE_TYPE (*node)))); + *no_add_attrs = true; } - else if (TARGET_THUMB2) + else { - /* Thumb-2 only allows very limited access to the PC. Calculate the - address in a temporary register. */ - if (arm_pic_register != INVALID_REGNUM) + /* Possibly pass this attribute on from the type to a decl. */ + if (flags & ((int) ATTR_FLAG_DECL_NEXT + | (int) ATTR_FLAG_FUNCTION_NEXT + | (int) ATTR_FLAG_ARRAY_NEXT)) { - pic_tmp = gen_rtx_REG (SImode, - thumb_find_work_register (saved_regs)); + *no_add_attrs = true; + return tree_cons (name, args, NULL_TREE); } else { - gcc_assert (can_create_pseudo_p ()); - pic_tmp = gen_reg_rtx (Pmode); - } - - emit_insn (gen_pic_load_addr_thumb2 (pic_reg, pic_rtx)); - emit_insn (gen_pic_load_dot_plus_four (pic_tmp, labelno)); - emit_insn (gen_addsi3 (pic_reg, pic_reg, pic_tmp)); - } - else /* TARGET_THUMB1 */ - { - if (arm_pic_register != INVALID_REGNUM - && REGNO (pic_reg) > LAST_LO_REGNUM) - { - /* We will have pushed the pic register, so we should always be - able to find a work register. */ - pic_tmp = gen_rtx_REG (SImode, - thumb_find_work_register (saved_regs)); - emit_insn (gen_pic_load_addr_thumb1 (pic_tmp, pic_rtx)); - emit_insn (gen_movsi (pic_offset_table_rtx, pic_tmp)); + warning (OPT_Wattributes, "%qE attribute ignored", + name); } - else - emit_insn (gen_pic_load_addr_thumb1 (pic_reg, pic_rtx)); - emit_insn (gen_pic_add_dot_plus_four (pic_reg, pic_reg, labelno)); } } - /* Need to emit this whether or not we obey regdecls, - since setjmp/longjmp can cause life info to screw up. */ - emit_use (pic_reg); + return NULL_TREE; } - -/* Return nonzero if X is valid as an ARM state addressing register. */ -static int -arm_address_register_rtx_p (rtx x, int strict_p) +/* Handle a "pcs" attribute; arguments as in struct + attribute_spec.handler. 
*/ +static tree +arm_handle_pcs_attribute (tree *node ATTRIBUTE_UNUSED, tree name, tree args, + int flags ATTRIBUTE_UNUSED, bool *no_add_attrs) { - int regno; - - if (GET_CODE (x) != REG) - return 0; - - regno = REGNO (x); - - if (strict_p) - return ARM_REGNO_OK_FOR_BASE_P (regno); - - return (regno <= LAST_ARM_REGNUM - || regno >= FIRST_PSEUDO_REGISTER - || regno == FRAME_POINTER_REGNUM - || regno == ARG_POINTER_REGNUM); + if (arm_pcs_from_attribute (args) == ARM_PCS_UNKNOWN) + { + warning (OPT_Wattributes, "%qE attribute ignored", name); + *no_add_attrs = true; + } + return NULL_TREE; } -/* Return TRUE if this rtx is the difference of a symbol and a label, - and will reduce to a PC-relative relocation in the object file. - Expressions like this can be left alone when generating PIC, rather - than forced through the GOT. */ -static int -pcrel_constant_p (rtx x) +#if TARGET_DLLIMPORT_DECL_ATTRIBUTES +/* Handle the "notshared" attribute. This attribute is another way of + requesting hidden visibility. ARM's compiler supports + "__declspec(notshared)"; we support the same thing via an + attribute. */ + +static tree +arm_handle_notshared_attribute (tree *node, + tree name ATTRIBUTE_UNUSED, + tree args ATTRIBUTE_UNUSED, + int flags ATTRIBUTE_UNUSED, + bool *no_add_attrs) { - if (GET_CODE (x) == MINUS) - return symbol_mentioned_p (XEXP (x, 0)) && label_mentioned_p (XEXP (x, 1)); + tree decl = TYPE_NAME (*node); - return FALSE; + if (decl) + { + DECL_VISIBILITY (decl) = VISIBILITY_HIDDEN; + DECL_VISIBILITY_SPECIFIED (decl) = 1; + *no_add_attrs = false; + } + return NULL_TREE; } +#endif -/* Return nonzero if X is a valid ARM state address operand. */ -int -arm_legitimate_address_p (enum machine_mode mode, rtx x, RTX_CODE outer, - int strict_p) +/* Return 0 if the attributes for two types are incompatible, 1 if they + are compatible, and 2 if they are nearly compatible (which causes a + warning to be generated). */ +static int +arm_comp_type_attributes (const_tree type1, const_tree type2) { - bool use_ldrd; - enum rtx_code code = GET_CODE (x); + int l1, l2, s1, s2; - if (arm_address_register_rtx_p (x, strict_p)) + /* Check for mismatch of non-default calling convention. */ + if (TREE_CODE (type1) != FUNCTION_TYPE) return 1; - use_ldrd = (TARGET_LDRD - && (mode == DImode - || (mode == DFmode && (TARGET_SOFT_FLOAT || TARGET_VFP)))); - - if (code == POST_INC || code == PRE_DEC - || ((code == PRE_INC || code == POST_DEC) - && (use_ldrd || GET_MODE_SIZE (mode) <= 4))) - return arm_address_register_rtx_p (XEXP (x, 0), strict_p); + /* Check for mismatched call attributes. */ + l1 = lookup_attribute ("long_call", TYPE_ATTRIBUTES (type1)) != NULL; + l2 = lookup_attribute ("long_call", TYPE_ATTRIBUTES (type2)) != NULL; + s1 = lookup_attribute ("short_call", TYPE_ATTRIBUTES (type1)) != NULL; + s2 = lookup_attribute ("short_call", TYPE_ATTRIBUTES (type2)) != NULL; - else if ((code == POST_MODIFY || code == PRE_MODIFY) - && arm_address_register_rtx_p (XEXP (x, 0), strict_p) - && GET_CODE (XEXP (x, 1)) == PLUS - && rtx_equal_p (XEXP (XEXP (x, 1), 0), XEXP (x, 0))) + /* Only bother to check if an attribute is defined. */ + if (l1 | l2 | s1 | s2) { - rtx addend = XEXP (XEXP (x, 1), 1); - - /* Don't allow ldrd post increment by register because it's hard - to fixup invalid register choices. */ - if (use_ldrd - && GET_CODE (x) == POST_MODIFY - && GET_CODE (addend) == REG) + /* If one type has an attribute, the other must have the same attribute. 
*/ + if ((l1 != l2) || (s1 != s2)) return 0; - return ((use_ldrd || GET_MODE_SIZE (mode) <= 4) - && arm_legitimate_index_p (mode, addend, outer, strict_p)); + /* Disallow mixed attributes. */ + if ((l1 & s2) || (l2 & s1)) + return 0; } - /* After reload constants split into minipools will have addresses - from a LABEL_REF. */ - else if (reload_completed - && (code == LABEL_REF - || (code == CONST - && GET_CODE (XEXP (x, 0)) == PLUS - && GET_CODE (XEXP (XEXP (x, 0), 0)) == LABEL_REF - && GET_CODE (XEXP (XEXP (x, 0), 1)) == CONST_INT))) - return 1; - - else if (mode == TImode || (TARGET_NEON && VALID_NEON_STRUCT_MODE (mode))) + /* Check for mismatched ISR attribute. */ + l1 = lookup_attribute ("isr", TYPE_ATTRIBUTES (type1)) != NULL; + if (! l1) + l1 = lookup_attribute ("interrupt", TYPE_ATTRIBUTES (type1)) != NULL; + l2 = lookup_attribute ("isr", TYPE_ATTRIBUTES (type2)) != NULL; + if (! l2) + l1 = lookup_attribute ("interrupt", TYPE_ATTRIBUTES (type2)) != NULL; + if (l1 != l2) return 0; - else if (code == PLUS) - { - rtx xop0 = XEXP (x, 0); - rtx xop1 = XEXP (x, 1); - - return ((arm_address_register_rtx_p (xop0, strict_p) - && arm_legitimate_index_p (mode, xop1, outer, strict_p)) - || (arm_address_register_rtx_p (xop1, strict_p) - && arm_legitimate_index_p (mode, xop0, outer, strict_p))); - } + return 1; +} -#if 0 - /* Reload currently can't handle MINUS, so disable this for now */ - else if (GET_CODE (x) == MINUS) +/* Assigns default attributes to newly defined type. This is used to + set short_call/long_call attributes for function types of + functions defined inside corresponding #pragma scopes. */ +static void +arm_set_default_type_attributes (tree type) +{ + /* Add __attribute__ ((long_call)) to all functions, when + inside #pragma long_calls or __attribute__ ((short_call)), + when inside #pragma no_long_calls. */ + if (TREE_CODE (type) == FUNCTION_TYPE || TREE_CODE (type) == METHOD_TYPE) { - rtx xop0 = XEXP (x, 0); - rtx xop1 = XEXP (x, 1); - - return (arm_address_register_rtx_p (xop0, strict_p) - && arm_legitimate_index_p (mode, xop1, outer, strict_p)); - } -#endif + tree type_attr_list, attr_name; + type_attr_list = TYPE_ATTRIBUTES (type); - else if (GET_MODE_CLASS (mode) != MODE_FLOAT - && code == SYMBOL_REF - && CONSTANT_POOL_ADDRESS_P (x) - && ! (flag_pic - && symbol_mentioned_p (get_pool_constant (x)) - && ! pcrel_constant_p (get_pool_constant (x)))) - return 1; + if (arm_pragma_long_calls == LONG) + attr_name = get_identifier ("long_call"); + else if (arm_pragma_long_calls == SHORT) + attr_name = get_identifier ("short_call"); + else + return; - return 0; + type_attr_list = tree_cons (attr_name, NULL_TREE, type_attr_list); + TYPE_ATTRIBUTES (type) = type_attr_list; + } } + +/* Return true if DECL is known to be linked into section SECTION. */ -/* Return nonzero if X is a valid Thumb-2 address operand. */ -int -thumb2_legitimate_address_p (enum machine_mode mode, rtx x, int strict_p) +static bool +arm_function_in_section_p (tree decl, section *section) { - bool use_ldrd; - enum rtx_code code = GET_CODE (x); - - if (arm_address_register_rtx_p (x, strict_p)) - return 1; - - use_ldrd = (TARGET_LDRD - && (mode == DImode - || (mode == DFmode && (TARGET_SOFT_FLOAT || TARGET_VFP)))); + /* We can only be certain about functions defined in the same + compilation unit. 
*/ + if (!TREE_STATIC (decl)) + return false; - if (code == POST_INC || code == PRE_DEC - || ((code == PRE_INC || code == POST_DEC) - && (use_ldrd || GET_MODE_SIZE (mode) <= 4))) - return arm_address_register_rtx_p (XEXP (x, 0), strict_p); + /* Make sure that SYMBOL always binds to the definition in this + compilation unit. */ + if (!targetm.binds_local_p (decl)) + return false; - else if ((code == POST_MODIFY || code == PRE_MODIFY) - && arm_address_register_rtx_p (XEXP (x, 0), strict_p) - && GET_CODE (XEXP (x, 1)) == PLUS - && rtx_equal_p (XEXP (XEXP (x, 1), 0), XEXP (x, 0))) + /* If DECL_SECTION_NAME is set, assume it is trustworthy. */ + if (!DECL_SECTION_NAME (decl)) { - /* Thumb-2 only has autoincrement by constant. */ - rtx addend = XEXP (XEXP (x, 1), 1); - HOST_WIDE_INT offset; - - if (GET_CODE (addend) != CONST_INT) - return 0; - - offset = INTVAL(addend); - if (GET_MODE_SIZE (mode) <= 4) - return (offset > -256 && offset < 256); - - return (use_ldrd && offset > -1024 && offset < 1024 - && (offset & 3) == 0); + /* Make sure that we will not create a unique section for DECL. */ + if (flag_function_sections || DECL_ONE_ONLY (decl)) + return false; } - /* After reload constants split into minipools will have addresses - from a LABEL_REF. */ - else if (reload_completed - && (code == LABEL_REF - || (code == CONST - && GET_CODE (XEXP (x, 0)) == PLUS - && GET_CODE (XEXP (XEXP (x, 0), 0)) == LABEL_REF - && GET_CODE (XEXP (XEXP (x, 0), 1)) == CONST_INT))) - return 1; - - else if (mode == TImode || (TARGET_NEON && VALID_NEON_STRUCT_MODE (mode))) - return 0; + return function_section (decl) == section; +} - else if (code == PLUS) - { - rtx xop0 = XEXP (x, 0); - rtx xop1 = XEXP (x, 1); +/* Return nonzero if a 32-bit "long_call" should be generated for + a call from the current function to DECL. We generate a long_call + if the function: - return ((arm_address_register_rtx_p (xop0, strict_p) - && thumb2_legitimate_index_p (mode, xop1, strict_p)) - || (arm_address_register_rtx_p (xop1, strict_p) - && thumb2_legitimate_index_p (mode, xop0, strict_p))); - } + a. has an __attribute__((long call)) + or b. is within the scope of a #pragma long_calls + or c. the -mlong-calls command line switch has been specified - else if (GET_MODE_CLASS (mode) != MODE_FLOAT - && code == SYMBOL_REF - && CONSTANT_POOL_ADDRESS_P (x) - && ! (flag_pic - && symbol_mentioned_p (get_pool_constant (x)) - && ! pcrel_constant_p (get_pool_constant (x)))) - return 1; + However we do not generate a long call if the function: - return 0; -} + d. has an __attribute__ ((short_call)) + or e. is inside the scope of a #pragma no_long_calls + or f. is defined in the same section as the current function. */ -/* Return nonzero if INDEX is valid for an address index operand in - ARM state. */ -static int -arm_legitimate_index_p (enum machine_mode mode, rtx index, RTX_CODE outer, - int strict_p) +bool +arm_is_long_call_p (tree decl) { - HOST_WIDE_INT range; - enum rtx_code code = GET_CODE (index); + tree attrs; - /* Standard coprocessor addressing modes. 
*/ - if (TARGET_HARD_FLOAT - && (TARGET_FPA || TARGET_MAVERICK) - && (GET_MODE_CLASS (mode) == MODE_FLOAT - || (TARGET_MAVERICK && mode == DImode))) - return (code == CONST_INT && INTVAL (index) < 1024 - && INTVAL (index) > -1024 - && (INTVAL (index) & 3) == 0); + if (!decl) + return TARGET_LONG_CALLS; - if (TARGET_NEON - && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode))) - return (code == CONST_INT - && INTVAL (index) < 1016 - && INTVAL (index) > -1024 - && (INTVAL (index) & 3) == 0); + attrs = TYPE_ATTRIBUTES (TREE_TYPE (decl)); + if (lookup_attribute ("short_call", attrs)) + return false; - if (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (mode)) - return (code == CONST_INT - && INTVAL (index) < 1024 - && INTVAL (index) > -1024 - && (INTVAL (index) & 3) == 0); + /* For "f", be conservative, and only cater for cases in which the + whole of the current function is placed in the same section. */ + if (!flag_reorder_blocks_and_partition + && TREE_CODE (decl) == FUNCTION_DECL + && arm_function_in_section_p (decl, current_function_section ())) + return false; - if (arm_address_register_rtx_p (index, strict_p) - && (GET_MODE_SIZE (mode) <= 4)) - return 1; + if (lookup_attribute ("long_call", attrs)) + return true; - if (mode == DImode || mode == DFmode) - { - if (code == CONST_INT) - { - HOST_WIDE_INT val = INTVAL (index); + return TARGET_LONG_CALLS; +} - if (TARGET_LDRD) - return val > -256 && val < 256; - else - return val > -4096 && val < 4092; - } +/* Return nonzero if it is ok to make a tail-call to DECL. */ +static bool +arm_function_ok_for_sibcall (tree decl, tree exp) +{ + unsigned long func_type; - return TARGET_LDRD && arm_address_register_rtx_p (index, strict_p); - } + if (cfun->machine->sibcall_blocked) + return false; - if (GET_MODE_SIZE (mode) <= 4 - && ! (arm_arch4 - && (mode == HImode - || (mode == QImode && outer == SIGN_EXTEND)))) - { - if (code == MULT) - { - rtx xiop0 = XEXP (index, 0); - rtx xiop1 = XEXP (index, 1); + /* Never tailcall something for which we have no decl, or if we + are generating code for Thumb-1. */ + if (decl == NULL || TARGET_THUMB1) + return false; - return ((arm_address_register_rtx_p (xiop0, strict_p) - && power_of_two_operand (xiop1, SImode)) - || (arm_address_register_rtx_p (xiop1, strict_p) - && power_of_two_operand (xiop0, SImode))); - } - else if (code == LSHIFTRT || code == ASHIFTRT - || code == ASHIFT || code == ROTATERT) - { - rtx op = XEXP (index, 1); + /* The PIC register is live on entry to VxWorks PLT entries, so we + must make the call before restoring the PIC register. */ + if (TARGET_VXWORKS_RTP && flag_pic && !targetm.binds_local_p (decl)) + return false; - return (arm_address_register_rtx_p (XEXP (index, 0), strict_p) - && GET_CODE (op) == CONST_INT - && INTVAL (op) > 0 - && INTVAL (op) <= 31); - } - } + /* Cannot tail-call to long calls, since these are out of range of + a branch instruction. */ + if (arm_is_long_call_p (decl)) + return false; - /* For ARM v4 we may be doing a sign-extend operation during the - load. */ - if (arm_arch4) + /* If we are interworking and the function is not declared static + then we can't tail-call it unless we know that it exists in this + compilation unit (since it might be a Thumb routine). */ + if (TARGET_INTERWORK && TREE_PUBLIC (decl) && !TREE_ASM_WRITTEN (decl)) + return false; + + func_type = arm_current_func_type (); + /* Never tailcall from an ISR routine - it needs a special exit sequence. 
*/ + if (IS_INTERRUPT (func_type)) + return false; + + if (!VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl)))) { - if (mode == HImode || (outer == SIGN_EXTEND && mode == QImode)) - range = 256; - else - range = 4096; - } - else - range = (mode == HImode) ? 4095 : 4096; + /* Check that the return value locations are the same. For + example that we aren't returning a value from the sibling in + a VFP register but then need to transfer it to a core + register. */ + rtx a, b; - return (code == CONST_INT - && INTVAL (index) < range - && INTVAL (index) > -range); -} + a = arm_function_value (TREE_TYPE (exp), decl, false); + b = arm_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)), + cfun->decl, false); + if (!rtx_equal_p (a, b)) + return false; + } -/* Return true if OP is a valid index scaling factor for Thumb-2 address - index operand. i.e. 1, 2, 4 or 8. */ -static bool -thumb2_index_mul_operand (rtx op) -{ - HOST_WIDE_INT val; - - if (GET_CODE(op) != CONST_INT) + /* Never tailcall if function may be called with a misaligned SP. */ + if (IS_STACKALIGN (func_type)) return false; - val = INTVAL(op); - return (val == 1 || val == 2 || val == 4 || val == 8); + /* Everything else is ok. */ + return true; } - -/* Return nonzero if INDEX is a valid Thumb-2 address index operand. */ -static int -thumb2_legitimate_index_p (enum machine_mode mode, rtx index, int strict_p) -{ - enum rtx_code code = GET_CODE (index); - /* ??? Combine arm and thumb2 coprocessor addressing modes. */ - /* Standard coprocessor addressing modes. */ - if (TARGET_HARD_FLOAT - && (TARGET_FPA || TARGET_MAVERICK) - && (GET_MODE_CLASS (mode) == MODE_FLOAT - || (TARGET_MAVERICK && mode == DImode))) - return (code == CONST_INT && INTVAL (index) < 1024 - && INTVAL (index) > -1024 - && (INTVAL (index) & 3) == 0); + +/* Addressing mode support functions. */ - if (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (mode)) - { - /* For DImode assume values will usually live in core regs - and only allow LDRD addressing modes. */ - if (!TARGET_LDRD || mode != DImode) - return (code == CONST_INT - && INTVAL (index) < 1024 - && INTVAL (index) > -1024 - && (INTVAL (index) & 3) == 0); - } +/* Return nonzero if X is a legitimate immediate operand when compiling + for PIC. We know that X satisfies CONSTANT_P and flag_pic is true. */ +int +legitimate_pic_operand_p (rtx x) +{ + if (GET_CODE (x) == SYMBOL_REF + || (GET_CODE (x) == CONST + && GET_CODE (XEXP (x, 0)) == PLUS + && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)) + return 0; - if (TARGET_NEON - && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode))) - return (code == CONST_INT - && INTVAL (index) < 1016 - && INTVAL (index) > -1024 - && (INTVAL (index) & 3) == 0); + return 1; +} - if (arm_address_register_rtx_p (index, strict_p) - && (GET_MODE_SIZE (mode) <= 4)) - return 1; +/* Record that the current function needs a PIC register. Initialize + cfun->machine->pic_reg if we have not already done so. */ - if (mode == DImode || mode == DFmode) +static void +require_pic_register (void) +{ + /* A lot of the logic here is made obscure by the fact that this + routine gets called as part of the rtx cost estimation process. + We don't want those calls to affect any assumptions about the real + function; and further, we can't call entry_of_function() until we + start the real expansion process. */ + if (!crtl->uses_pic_offset_table) { - HOST_WIDE_INT val = INTVAL (index); - /* ??? Can we assume ldrd for thumb2? */ - /* Thumb-2 ldrd only has reg+const addressing modes. 
*/ - if (code != CONST_INT) - return 0; + gcc_assert (can_create_pseudo_p ()); + if (arm_pic_register != INVALID_REGNUM) + { + if (!cfun->machine->pic_reg) + cfun->machine->pic_reg = gen_rtx_REG (Pmode, arm_pic_register); - /* ldrd supports offsets of +-1020. - However the ldr fallback does not. */ - return val > -256 && val < 256 && (val & 3) == 0; - } + /* Play games to avoid marking the function as needing pic + if we are being called as part of the cost-estimation + process. */ + if (current_ir_type () != IR_GIMPLE || currently_expanding_to_rtl) + crtl->uses_pic_offset_table = 1; + } + else + { + rtx seq; - if (code == MULT) - { - rtx xiop0 = XEXP (index, 0); - rtx xiop1 = XEXP (index, 1); + if (!cfun->machine->pic_reg) + cfun->machine->pic_reg = gen_reg_rtx (Pmode); - return ((arm_address_register_rtx_p (xiop0, strict_p) - && thumb2_index_mul_operand (xiop1)) - || (arm_address_register_rtx_p (xiop1, strict_p) - && thumb2_index_mul_operand (xiop0))); - } - else if (code == ASHIFT) - { - rtx op = XEXP (index, 1); + /* Play games to avoid marking the function as needing pic + if we are being called as part of the cost-estimation + process. */ + if (current_ir_type () != IR_GIMPLE || currently_expanding_to_rtl) + { + crtl->uses_pic_offset_table = 1; + start_sequence (); - return (arm_address_register_rtx_p (XEXP (index, 0), strict_p) - && GET_CODE (op) == CONST_INT - && INTVAL (op) > 0 - && INTVAL (op) <= 3); - } + arm_load_pic_register (0UL); - return (code == CONST_INT - && INTVAL (index) < 4096 - && INTVAL (index) > -256); + seq = get_insns (); + end_sequence (); + /* We can be called during expansion of PHI nodes, where + we can't yet emit instructions directly in the final + insn stream. Queue the insns on the entry edge, they will + be committed after everything else is expanded. */ + insert_insn_on_edge (seq, single_succ_edge (ENTRY_BLOCK_PTR)); + } + } + } } -/* Return nonzero if X is valid as a 16-bit Thumb state base register. */ -static int -thumb1_base_register_rtx_p (rtx x, enum machine_mode mode, int strict_p) +rtx +legitimize_pic_address (rtx orig, enum machine_mode mode, rtx reg) { - int regno; + if (GET_CODE (orig) == SYMBOL_REF + || GET_CODE (orig) == LABEL_REF) + { + rtx insn; - if (GET_CODE (x) != REG) - return 0; + if (reg == 0) + { + gcc_assert (can_create_pseudo_p ()); + reg = gen_reg_rtx (Pmode); + } - regno = REGNO (x); + /* VxWorks does not impose a fixed gap between segments; the run-time + gap can be different from the object-file gap. We therefore can't + use GOTOFF unless we are absolutely sure that the symbol is in the + same segment as the GOT. Unfortunately, the flexibility of linker + scripts means that we can't be sure of that in general, so assume + that GOTOFF is never valid on VxWorks. */ + if ((GET_CODE (orig) == LABEL_REF + || (GET_CODE (orig) == SYMBOL_REF && + SYMBOL_REF_LOCAL_P (orig))) + && NEED_GOT_RELOC + && !TARGET_VXWORKS_RTP) + insn = arm_pic_static_addr (orig, reg); + else + { + rtx pat; + rtx mem; - if (strict_p) - return THUMB1_REGNO_MODE_OK_FOR_BASE_P (regno, mode); + /* If this function doesn't have a pic register, create one now. 
*/ + require_pic_register (); - return (regno <= LAST_LO_REGNUM - || regno > LAST_VIRTUAL_REGISTER - || regno == FRAME_POINTER_REGNUM - || (GET_MODE_SIZE (mode) >= 4 - && (regno == STACK_POINTER_REGNUM - || regno >= FIRST_PSEUDO_REGISTER - || x == hard_frame_pointer_rtx - || x == arg_pointer_rtx))); -} + pat = gen_calculate_pic_address (reg, cfun->machine->pic_reg, orig); -/* Return nonzero if x is a legitimate index register. This is the case - for any base register that can access a QImode object. */ -inline static int -thumb1_index_register_rtx_p (rtx x, int strict_p) -{ - return thumb1_base_register_rtx_p (x, QImode, strict_p); -} + /* Make the MEM as close to a constant as possible. */ + mem = SET_SRC (pat); + gcc_assert (MEM_P (mem) && !MEM_VOLATILE_P (mem)); + MEM_READONLY_P (mem) = 1; + MEM_NOTRAP_P (mem) = 1; -/* Return nonzero if x is a legitimate 16-bit Thumb-state address. + insn = emit_insn (pat); + } - The AP may be eliminated to either the SP or the FP, so we use the - least common denominator, e.g. SImode, and offsets from 0 to 64. + /* Put a REG_EQUAL note on this insn, so that it can be optimized + by loop. */ + set_unique_reg_note (insn, REG_EQUAL, orig); - ??? Verify whether the above is the right approach. + return reg; + } + else if (GET_CODE (orig) == CONST) + { + rtx base, offset; - ??? Also, the FP may be eliminated to the SP, so perhaps that - needs special handling also. + if (GET_CODE (XEXP (orig, 0)) == PLUS + && XEXP (XEXP (orig, 0), 0) == cfun->machine->pic_reg) + return orig; - ??? Look at how the mips16 port solves this problem. It probably uses - better ways to solve some of these problems. + /* Handle the case where we have: const (UNSPEC_TLS). */ + if (GET_CODE (XEXP (orig, 0)) == UNSPEC + && XINT (XEXP (orig, 0), 1) == UNSPEC_TLS) + return orig; - Although it is not incorrect, we don't accept QImode and HImode - addresses based on the frame pointer or arg pointer until the - reload pass starts. This is so that eliminating such addresses - into stack based ones won't produce impossible code. */ -int -thumb1_legitimate_address_p (enum machine_mode mode, rtx x, int strict_p) + /* Handle the case where we have: + const (plus (UNSPEC_TLS) (ADDEND)). The ADDEND must be a + CONST_INT. */ + if (GET_CODE (XEXP (orig, 0)) == PLUS + && GET_CODE (XEXP (XEXP (orig, 0), 0)) == UNSPEC + && XINT (XEXP (XEXP (orig, 0), 0), 1) == UNSPEC_TLS) + { + gcc_assert (GET_CODE (XEXP (XEXP (orig, 0), 1)) == CONST_INT); + return orig; + } + + if (reg == 0) + { + gcc_assert (can_create_pseudo_p ()); + reg = gen_reg_rtx (Pmode); + } + + gcc_assert (GET_CODE (XEXP (orig, 0)) == PLUS); + + base = legitimize_pic_address (XEXP (XEXP (orig, 0), 0), Pmode, reg); + offset = legitimize_pic_address (XEXP (XEXP (orig, 0), 1), Pmode, + base == reg ? 0 : reg); + + if (GET_CODE (offset) == CONST_INT) + { + /* The base register doesn't really matter, we only want to + test the index for the appropriate mode. */ + if (!arm_legitimate_index_p (mode, offset, SET, 0)) + { + gcc_assert (can_create_pseudo_p ()); + offset = force_reg (Pmode, offset); + } + + if (GET_CODE (offset) == CONST_INT) + return plus_constant (base, INTVAL (offset)); + } + + if (GET_MODE_SIZE (mode) > 4 + && (GET_MODE_CLASS (mode) == MODE_INT + || TARGET_SOFT_FLOAT)) + { + emit_insn (gen_addsi3 (reg, base, offset)); + return reg; + } + + return gen_rtx_PLUS (Pmode, base, offset); + } + + return orig; +} + + +/* Find a spare register to use during the prolog of a function. 
*/ + +static int +thumb_find_work_register (unsigned long pushed_regs_mask) +{ + int reg; + + /* Check the argument registers first as these are call-used. The + register allocation order means that sometimes r3 might be used + but earlier argument registers might not, so check them all. */ + for (reg = LAST_ARG_REGNUM; reg >= 0; reg --) + if (!df_regs_ever_live_p (reg)) + return reg; + + /* Before going on to check the call-saved registers we can try a couple + more ways of deducing that r3 is available. The first is when we are + pushing anonymous arguments onto the stack and we have less than 4 + registers worth of fixed arguments(*). In this case r3 will be part of + the variable argument list and so we can be sure that it will be + pushed right at the start of the function. Hence it will be available + for the rest of the prologue. + (*): ie crtl->args.pretend_args_size is greater than 0. */ + if (cfun->machine->uses_anonymous_args + && crtl->args.pretend_args_size > 0) + return LAST_ARG_REGNUM; + + /* The other case is when we have fixed arguments but less than 4 registers + worth. In this case r3 might be used in the body of the function, but + it is not being used to convey an argument into the function. In theory + we could just check crtl->args.size to see how many bytes are + being passed in argument registers, but it seems that it is unreliable. + Sometimes it will have the value 0 when in fact arguments are being + passed. (See testcase execute/20021111-1.c for an example). So we also + check the args_info.nregs field as well. The problem with this field is + that it makes no allowances for arguments that are passed to the + function but which are not used. Hence we could miss an opportunity + when a function has an unused argument in r3. But it is better to be + safe than to be sorry. */ + if (! cfun->machine->uses_anonymous_args + && crtl->args.size >= 0 + && crtl->args.size <= (LAST_ARG_REGNUM * UNITS_PER_WORD) + && crtl->args.info.nregs < 4) + return LAST_ARG_REGNUM; + + /* Otherwise look for a call-saved register that is going to be pushed. */ + for (reg = LAST_LO_REGNUM; reg > LAST_ARG_REGNUM; reg --) + if (pushed_regs_mask & (1 << reg)) + return reg; + + if (TARGET_THUMB2) + { + /* Thumb-2 can use high regs. */ + for (reg = FIRST_HI_REGNUM; reg < 15; reg ++) + if (pushed_regs_mask & (1 << reg)) + return reg; + } + /* Something went wrong - thumb_compute_save_reg_mask() + should have arranged for a suitable register to be pushed. */ + gcc_unreachable (); +} + +static GTY(()) int pic_labelno; + +/* Generate code to load the PIC register. In thumb mode SCRATCH is a + low register. */ + +void +arm_load_pic_register (unsigned long saved_regs ATTRIBUTE_UNUSED) +{ + rtx l1, labelno, pic_tmp, pic_rtx, pic_reg; + + if (crtl->uses_pic_offset_table == 0 || TARGET_SINGLE_PIC_BASE) + return; + + gcc_assert (flag_pic); + + pic_reg = cfun->machine->pic_reg; + if (TARGET_VXWORKS_RTP) + { + pic_rtx = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE); + pic_rtx = gen_rtx_CONST (Pmode, pic_rtx); + emit_insn (gen_pic_load_addr_32bit (pic_reg, pic_rtx)); + + emit_insn (gen_rtx_SET (Pmode, pic_reg, gen_rtx_MEM (Pmode, pic_reg))); + + pic_tmp = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX); + emit_insn (gen_pic_offset_arm (pic_reg, pic_reg, pic_tmp)); + } + else + { + /* We use an UNSPEC rather than a LABEL_REF because this label + never appears in the code stream. 
*/ + + labelno = GEN_INT (pic_labelno++); + l1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, labelno), UNSPEC_PIC_LABEL); + l1 = gen_rtx_CONST (VOIDmode, l1); + + /* On the ARM the PC register contains 'dot + 8' at the time of the + addition, on the Thumb it is 'dot + 4'. */ + pic_rtx = plus_constant (l1, TARGET_ARM ? 8 : 4); + pic_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, pic_rtx), + UNSPEC_GOTSYM_OFF); + pic_rtx = gen_rtx_CONST (Pmode, pic_rtx); + + if (TARGET_32BIT) + { + emit_insn (gen_pic_load_addr_32bit (pic_reg, pic_rtx)); + if (TARGET_ARM) + emit_insn (gen_pic_add_dot_plus_eight (pic_reg, pic_reg, labelno)); + else + emit_insn (gen_pic_add_dot_plus_four (pic_reg, pic_reg, labelno)); + } + else /* TARGET_THUMB1 */ + { + if (arm_pic_register != INVALID_REGNUM + && REGNO (pic_reg) > LAST_LO_REGNUM) + { + /* We will have pushed the pic register, so we should always be + able to find a work register. */ + pic_tmp = gen_rtx_REG (SImode, + thumb_find_work_register (saved_regs)); + emit_insn (gen_pic_load_addr_thumb1 (pic_tmp, pic_rtx)); + emit_insn (gen_movsi (pic_offset_table_rtx, pic_tmp)); + } + else + emit_insn (gen_pic_load_addr_thumb1 (pic_reg, pic_rtx)); + emit_insn (gen_pic_add_dot_plus_four (pic_reg, pic_reg, labelno)); + } + } + + /* Need to emit this whether or not we obey regdecls, + since setjmp/longjmp can cause life info to screw up. */ + emit_use (pic_reg); +} + +/* Generate code to load the address of a static var when flag_pic is set. */ +static rtx +arm_pic_static_addr (rtx orig, rtx reg) +{ + rtx l1, labelno, offset_rtx, insn; + + gcc_assert (flag_pic); + + /* We use an UNSPEC rather than a LABEL_REF because this label + never appears in the code stream. */ + labelno = GEN_INT (pic_labelno++); + l1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, labelno), UNSPEC_PIC_LABEL); + l1 = gen_rtx_CONST (VOIDmode, l1); + + /* On the ARM the PC register contains 'dot + 8' at the time of the + addition, on the Thumb it is 'dot + 4'. */ + offset_rtx = plus_constant (l1, TARGET_ARM ? 8 : 4); + offset_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (2, orig, offset_rtx), + UNSPEC_SYMBOL_OFFSET); + offset_rtx = gen_rtx_CONST (Pmode, offset_rtx); + + if (TARGET_32BIT) + { + emit_insn (gen_pic_load_addr_32bit (reg, offset_rtx)); + if (TARGET_ARM) + insn = emit_insn (gen_pic_add_dot_plus_eight (reg, reg, labelno)); + else + insn = emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno)); + } + else /* TARGET_THUMB1 */ + { + emit_insn (gen_pic_load_addr_thumb1 (reg, offset_rtx)); + insn = emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno)); + } + + return insn; +} + +/* Return nonzero if X is valid as an ARM state addressing register. */ +static int +arm_address_register_rtx_p (rtx x, int strict_p) +{ + int regno; + + if (GET_CODE (x) != REG) + return 0; + + regno = REGNO (x); + + if (strict_p) + return ARM_REGNO_OK_FOR_BASE_P (regno); + + return (regno <= LAST_ARM_REGNUM + || regno >= FIRST_PSEUDO_REGISTER + || regno == FRAME_POINTER_REGNUM + || regno == ARG_POINTER_REGNUM); +} + +/* Return TRUE if this rtx is the difference of a symbol and a label, + and will reduce to a PC-relative relocation in the object file. + Expressions like this can be left alone when generating PIC, rather + than forced through the GOT. */ +static int +pcrel_constant_p (rtx x) +{ + if (GET_CODE (x) == MINUS) + return symbol_mentioned_p (XEXP (x, 0)) && label_mentioned_p (XEXP (x, 1)); + + return FALSE; +} + +/* Return true if X will surely end up in an index register after next + splitting pass. 
*/ +static bool +will_be_in_index_register (const_rtx x) +{ + /* arm.md: calculate_pic_address will split this into a register. */ + return GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_PIC_SYM; +} + +/* Return nonzero if X is a valid ARM state address operand. */ +int +arm_legitimate_address_outer_p (enum machine_mode mode, rtx x, RTX_CODE outer, + int strict_p) +{ + bool use_ldrd; + enum rtx_code code = GET_CODE (x); + + if (arm_address_register_rtx_p (x, strict_p)) + return 1; + + use_ldrd = (TARGET_LDRD + && (mode == DImode + || (mode == DFmode && (TARGET_SOFT_FLOAT || TARGET_VFP)))); + + if (code == POST_INC || code == PRE_DEC + || ((code == PRE_INC || code == POST_DEC) + && (use_ldrd || GET_MODE_SIZE (mode) <= 4))) + return arm_address_register_rtx_p (XEXP (x, 0), strict_p); + + else if ((code == POST_MODIFY || code == PRE_MODIFY) + && arm_address_register_rtx_p (XEXP (x, 0), strict_p) + && GET_CODE (XEXP (x, 1)) == PLUS + && rtx_equal_p (XEXP (XEXP (x, 1), 0), XEXP (x, 0))) + { + rtx addend = XEXP (XEXP (x, 1), 1); + + /* Don't allow ldrd post increment by register because it's hard + to fixup invalid register choices. */ + if (use_ldrd + && GET_CODE (x) == POST_MODIFY + && GET_CODE (addend) == REG) + return 0; + + return ((use_ldrd || GET_MODE_SIZE (mode) <= 4) + && arm_legitimate_index_p (mode, addend, outer, strict_p)); + } + + /* After reload constants split into minipools will have addresses + from a LABEL_REF. */ + else if (reload_completed + && (code == LABEL_REF + || (code == CONST + && GET_CODE (XEXP (x, 0)) == PLUS + && GET_CODE (XEXP (XEXP (x, 0), 0)) == LABEL_REF + && GET_CODE (XEXP (XEXP (x, 0), 1)) == CONST_INT))) + return 1; + + else if (mode == TImode || (TARGET_NEON && VALID_NEON_STRUCT_MODE (mode))) + return 0; + + else if (code == PLUS) + { + rtx xop0 = XEXP (x, 0); + rtx xop1 = XEXP (x, 1); + + return ((arm_address_register_rtx_p (xop0, strict_p) + && ((GET_CODE(xop1) == CONST_INT + && arm_legitimate_index_p (mode, xop1, outer, strict_p)) + || (!strict_p && will_be_in_index_register (xop1)))) + || (arm_address_register_rtx_p (xop1, strict_p) + && arm_legitimate_index_p (mode, xop0, outer, strict_p))); + } + +#if 0 + /* Reload currently can't handle MINUS, so disable this for now */ + else if (GET_CODE (x) == MINUS) + { + rtx xop0 = XEXP (x, 0); + rtx xop1 = XEXP (x, 1); + + return (arm_address_register_rtx_p (xop0, strict_p) + && arm_legitimate_index_p (mode, xop1, outer, strict_p)); + } +#endif + + else if (GET_MODE_CLASS (mode) != MODE_FLOAT + && code == SYMBOL_REF + && CONSTANT_POOL_ADDRESS_P (x) + && ! (flag_pic + && symbol_mentioned_p (get_pool_constant (x)) + && ! pcrel_constant_p (get_pool_constant (x)))) + return 1; + + return 0; +} + +/* Return nonzero if X is a valid Thumb-2 address operand. */ +static int +thumb2_legitimate_address_p (enum machine_mode mode, rtx x, int strict_p) +{ + bool use_ldrd; + enum rtx_code code = GET_CODE (x); + + if (arm_address_register_rtx_p (x, strict_p)) + return 1; + + use_ldrd = (TARGET_LDRD + && (mode == DImode + || (mode == DFmode && (TARGET_SOFT_FLOAT || TARGET_VFP)))); + + if (code == POST_INC || code == PRE_DEC + || ((code == PRE_INC || code == POST_DEC) + && (use_ldrd || GET_MODE_SIZE (mode) <= 4))) + return arm_address_register_rtx_p (XEXP (x, 0), strict_p); + + else if ((code == POST_MODIFY || code == PRE_MODIFY) + && arm_address_register_rtx_p (XEXP (x, 0), strict_p) + && GET_CODE (XEXP (x, 1)) == PLUS + && rtx_equal_p (XEXP (XEXP (x, 1), 0), XEXP (x, 0))) + { + /* Thumb-2 only has autoincrement by constant. 
*/ + rtx addend = XEXP (XEXP (x, 1), 1); + HOST_WIDE_INT offset; + + if (GET_CODE (addend) != CONST_INT) + return 0; + + offset = INTVAL(addend); + if (GET_MODE_SIZE (mode) <= 4) + return (offset > -256 && offset < 256); + + return (use_ldrd && offset > -1024 && offset < 1024 + && (offset & 3) == 0); + } + + /* After reload constants split into minipools will have addresses + from a LABEL_REF. */ + else if (reload_completed + && (code == LABEL_REF + || (code == CONST + && GET_CODE (XEXP (x, 0)) == PLUS + && GET_CODE (XEXP (XEXP (x, 0), 0)) == LABEL_REF + && GET_CODE (XEXP (XEXP (x, 0), 1)) == CONST_INT))) + return 1; + + else if (mode == TImode || (TARGET_NEON && VALID_NEON_STRUCT_MODE (mode))) + return 0; + + else if (code == PLUS) + { + rtx xop0 = XEXP (x, 0); + rtx xop1 = XEXP (x, 1); + + return ((arm_address_register_rtx_p (xop0, strict_p) + && (thumb2_legitimate_index_p (mode, xop1, strict_p) + || (!strict_p && will_be_in_index_register (xop1)))) + || (arm_address_register_rtx_p (xop1, strict_p) + && thumb2_legitimate_index_p (mode, xop0, strict_p))); + } + + else if (GET_MODE_CLASS (mode) != MODE_FLOAT + && code == SYMBOL_REF + && CONSTANT_POOL_ADDRESS_P (x) + && ! (flag_pic + && symbol_mentioned_p (get_pool_constant (x)) + && ! pcrel_constant_p (get_pool_constant (x)))) + return 1; + + return 0; +} + +/* Return nonzero if INDEX is valid for an address index operand in + ARM state. */ +static int +arm_legitimate_index_p (enum machine_mode mode, rtx index, RTX_CODE outer, + int strict_p) +{ + HOST_WIDE_INT range; + enum rtx_code code = GET_CODE (index); + + /* Standard coprocessor addressing modes. */ + if (TARGET_HARD_FLOAT + && (TARGET_FPA || TARGET_MAVERICK) + && (GET_MODE_CLASS (mode) == MODE_FLOAT + || (TARGET_MAVERICK && mode == DImode))) + return (code == CONST_INT && INTVAL (index) < 1024 + && INTVAL (index) > -1024 + && (INTVAL (index) & 3) == 0); + + if (TARGET_NEON + && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode))) + return (code == CONST_INT + && INTVAL (index) < 1016 + && INTVAL (index) > -1024 + && (INTVAL (index) & 3) == 0); + + if (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (mode)) + return (code == CONST_INT + && INTVAL (index) < 1024 + && INTVAL (index) > -1024 + && (INTVAL (index) & 3) == 0); + + if (arm_address_register_rtx_p (index, strict_p) + && (GET_MODE_SIZE (mode) <= 4)) + return 1; + + if (mode == DImode || mode == DFmode) + { + if (code == CONST_INT) + { + HOST_WIDE_INT val = INTVAL (index); + + if (TARGET_LDRD) + return val > -256 && val < 256; + else + return val > -4096 && val < 4092; + } + + return TARGET_LDRD && arm_address_register_rtx_p (index, strict_p); + } + + if (GET_MODE_SIZE (mode) <= 4 + && ! (arm_arch4 + && (mode == HImode + || mode == HFmode + || (mode == QImode && outer == SIGN_EXTEND)))) + { + if (code == MULT) + { + rtx xiop0 = XEXP (index, 0); + rtx xiop1 = XEXP (index, 1); + + return ((arm_address_register_rtx_p (xiop0, strict_p) + && power_of_two_operand (xiop1, SImode)) + || (arm_address_register_rtx_p (xiop1, strict_p) + && power_of_two_operand (xiop0, SImode))); + } + else if (code == LSHIFTRT || code == ASHIFTRT + || code == ASHIFT || code == ROTATERT) + { + rtx op = XEXP (index, 1); + + return (arm_address_register_rtx_p (XEXP (index, 0), strict_p) + && GET_CODE (op) == CONST_INT + && INTVAL (op) > 0 + && INTVAL (op) <= 31); + } + } + + /* For ARM v4 we may be doing a sign-extend operation during the + load. 
*/ + if (arm_arch4) + { + if (mode == HImode + || mode == HFmode + || (outer == SIGN_EXTEND && mode == QImode)) + range = 256; + else + range = 4096; + } + else + range = (mode == HImode || mode == HFmode) ? 4095 : 4096; + + return (code == CONST_INT + && INTVAL (index) < range + && INTVAL (index) > -range); +} + +/* Return true if OP is a valid index scaling factor for Thumb-2 address + index operand. i.e. 1, 2, 4 or 8. */ +static bool +thumb2_index_mul_operand (rtx op) +{ + HOST_WIDE_INT val; + + if (GET_CODE(op) != CONST_INT) + return false; + + val = INTVAL(op); + return (val == 1 || val == 2 || val == 4 || val == 8); +} + +/* Return nonzero if INDEX is a valid Thumb-2 address index operand. */ +static int +thumb2_legitimate_index_p (enum machine_mode mode, rtx index, int strict_p) +{ + enum rtx_code code = GET_CODE (index); + + /* ??? Combine arm and thumb2 coprocessor addressing modes. */ + /* Standard coprocessor addressing modes. */ + if (TARGET_HARD_FLOAT + && (TARGET_FPA || TARGET_MAVERICK) + && (GET_MODE_CLASS (mode) == MODE_FLOAT + || (TARGET_MAVERICK && mode == DImode))) + return (code == CONST_INT && INTVAL (index) < 1024 + && INTVAL (index) > -1024 + && (INTVAL (index) & 3) == 0); + + if (TARGET_REALLY_IWMMXT && VALID_IWMMXT_REG_MODE (mode)) + { + /* For DImode assume values will usually live in core regs + and only allow LDRD addressing modes. */ + if (!TARGET_LDRD || mode != DImode) + return (code == CONST_INT + && INTVAL (index) < 1024 + && INTVAL (index) > -1024 + && (INTVAL (index) & 3) == 0); + } + + if (TARGET_NEON + && (VALID_NEON_DREG_MODE (mode) || VALID_NEON_QREG_MODE (mode))) + return (code == CONST_INT + && INTVAL (index) < 1016 + && INTVAL (index) > -1024 + && (INTVAL (index) & 3) == 0); + + if (arm_address_register_rtx_p (index, strict_p) + && (GET_MODE_SIZE (mode) <= 4)) + return 1; + + if (mode == DImode || mode == DFmode) + { + if (code == CONST_INT) + { + HOST_WIDE_INT val = INTVAL (index); + /* ??? Can we assume ldrd for thumb2? */ + /* Thumb-2 ldrd only has reg+const addressing modes. */ + /* ldrd supports offsets of +-1020. + However the ldr fallback does not. */ + return val > -256 && val < 256 && (val & 3) == 0; + } + else + return 0; + } + + if (code == MULT) + { + rtx xiop0 = XEXP (index, 0); + rtx xiop1 = XEXP (index, 1); + + return ((arm_address_register_rtx_p (xiop0, strict_p) + && thumb2_index_mul_operand (xiop1)) + || (arm_address_register_rtx_p (xiop1, strict_p) + && thumb2_index_mul_operand (xiop0))); + } + else if (code == ASHIFT) + { + rtx op = XEXP (index, 1); + + return (arm_address_register_rtx_p (XEXP (index, 0), strict_p) + && GET_CODE (op) == CONST_INT + && INTVAL (op) > 0 + && INTVAL (op) <= 3); + } + + return (code == CONST_INT + && INTVAL (index) < 4096 + && INTVAL (index) > -256); +} + +/* Return nonzero if X is valid as a 16-bit Thumb state base register. */ +static int +thumb1_base_register_rtx_p (rtx x, enum machine_mode mode, int strict_p) +{ + int regno; + + if (GET_CODE (x) != REG) + return 0; + + regno = REGNO (x); + + if (strict_p) + return THUMB1_REGNO_MODE_OK_FOR_BASE_P (regno, mode); + + return (regno <= LAST_LO_REGNUM + || regno > LAST_VIRTUAL_REGISTER + || regno == FRAME_POINTER_REGNUM + || (GET_MODE_SIZE (mode) >= 4 + && (regno == STACK_POINTER_REGNUM + || regno >= FIRST_PSEUDO_REGISTER + || x == hard_frame_pointer_rtx + || x == arg_pointer_rtx))); +} + +/* Return nonzero if x is a legitimate index register. This is the case + for any base register that can access a QImode object. 
*/ +inline static int +thumb1_index_register_rtx_p (rtx x, int strict_p) +{ + return thumb1_base_register_rtx_p (x, QImode, strict_p); +} + +/* Return nonzero if x is a legitimate 16-bit Thumb-state address. + + The AP may be eliminated to either the SP or the FP, so we use the + least common denominator, e.g. SImode, and offsets from 0 to 64. + + ??? Verify whether the above is the right approach. + + ??? Also, the FP may be eliminated to the SP, so perhaps that + needs special handling also. + + ??? Look at how the mips16 port solves this problem. It probably uses + better ways to solve some of these problems. + + Although it is not incorrect, we don't accept QImode and HImode + addresses based on the frame pointer or arg pointer until the + reload pass starts. This is so that eliminating such addresses + into stack based ones won't produce impossible code. */ +static int +thumb1_legitimate_address_p (enum machine_mode mode, rtx x, int strict_p) +{ + /* ??? Not clear if this is right. Experiment. */ + if (GET_MODE_SIZE (mode) < 4 + && !(reload_in_progress || reload_completed) + && (reg_mentioned_p (frame_pointer_rtx, x) + || reg_mentioned_p (arg_pointer_rtx, x) + || reg_mentioned_p (virtual_incoming_args_rtx, x) + || reg_mentioned_p (virtual_outgoing_args_rtx, x) + || reg_mentioned_p (virtual_stack_dynamic_rtx, x) + || reg_mentioned_p (virtual_stack_vars_rtx, x))) + return 0; + + /* Accept any base register. SP only in SImode or larger. */ + else if (thumb1_base_register_rtx_p (x, mode, strict_p)) + return 1; + + /* This is PC relative data before arm_reorg runs. */ + else if (GET_MODE_SIZE (mode) >= 4 && CONSTANT_P (x) + && GET_CODE (x) == SYMBOL_REF + && CONSTANT_POOL_ADDRESS_P (x) && !flag_pic) + return 1; + + /* This is PC relative data after arm_reorg runs. */ + else if ((GET_MODE_SIZE (mode) >= 4 || mode == HFmode) + && reload_completed + && (GET_CODE (x) == LABEL_REF + || (GET_CODE (x) == CONST + && GET_CODE (XEXP (x, 0)) == PLUS + && GET_CODE (XEXP (XEXP (x, 0), 0)) == LABEL_REF + && GET_CODE (XEXP (XEXP (x, 0), 1)) == CONST_INT))) + return 1; + + /* Post-inc indexing only supported for SImode and larger. */ + else if (GET_CODE (x) == POST_INC && GET_MODE_SIZE (mode) >= 4 + && thumb1_index_register_rtx_p (XEXP (x, 0), strict_p)) + return 1; + + else if (GET_CODE (x) == PLUS) + { + /* REG+REG address can be any two index registers. */ + /* We disallow FRAME+REG addressing since we know that FRAME + will be replaced with STACK, and SP relative addressing only + permits SP+OFFSET. */ + if (GET_MODE_SIZE (mode) <= 4 + && XEXP (x, 0) != frame_pointer_rtx + && XEXP (x, 1) != frame_pointer_rtx + && thumb1_index_register_rtx_p (XEXP (x, 0), strict_p) + && (thumb1_index_register_rtx_p (XEXP (x, 1), strict_p) + || (!strict_p && will_be_in_index_register (XEXP (x, 1))))) + return 1; + + /* REG+const has 5-7 bit offset for non-SP registers. */ + else if ((thumb1_index_register_rtx_p (XEXP (x, 0), strict_p) + || XEXP (x, 0) == arg_pointer_rtx) + && GET_CODE (XEXP (x, 1)) == CONST_INT + && thumb_legitimate_offset_p (mode, INTVAL (XEXP (x, 1)))) + return 1; + + /* REG+const has 10-bit offset for SP, but only SImode and + larger is supported. */ + /* ??? Should probably check for DI/DFmode overflow here + just like GO_IF_LEGITIMATE_OFFSET does. 
*/ + else if (GET_CODE (XEXP (x, 0)) == REG + && REGNO (XEXP (x, 0)) == STACK_POINTER_REGNUM + && GET_MODE_SIZE (mode) >= 4 + && GET_CODE (XEXP (x, 1)) == CONST_INT + && INTVAL (XEXP (x, 1)) >= 0 + && INTVAL (XEXP (x, 1)) + GET_MODE_SIZE (mode) <= 1024 + && (INTVAL (XEXP (x, 1)) & 3) == 0) + return 1; + + else if (GET_CODE (XEXP (x, 0)) == REG + && (REGNO (XEXP (x, 0)) == FRAME_POINTER_REGNUM + || REGNO (XEXP (x, 0)) == ARG_POINTER_REGNUM + || (REGNO (XEXP (x, 0)) >= FIRST_VIRTUAL_REGISTER + && REGNO (XEXP (x, 0)) + <= LAST_VIRTUAL_POINTER_REGISTER)) + && GET_MODE_SIZE (mode) >= 4 + && GET_CODE (XEXP (x, 1)) == CONST_INT + && (INTVAL (XEXP (x, 1)) & 3) == 0) + return 1; + } + + else if (GET_MODE_CLASS (mode) != MODE_FLOAT + && GET_MODE_SIZE (mode) == 4 + && GET_CODE (x) == SYMBOL_REF + && CONSTANT_POOL_ADDRESS_P (x) + && ! (flag_pic + && symbol_mentioned_p (get_pool_constant (x)) + && ! pcrel_constant_p (get_pool_constant (x)))) + return 1; + + return 0; +} + +/* Return nonzero if VAL can be used as an offset in a Thumb-state address + instruction of mode MODE. */ +int +thumb_legitimate_offset_p (enum machine_mode mode, HOST_WIDE_INT val) +{ + switch (GET_MODE_SIZE (mode)) + { + case 1: + return val >= 0 && val < 32; + + case 2: + return val >= 0 && val < 64 && (val & 1) == 0; + + default: + return (val >= 0 + && (val + GET_MODE_SIZE (mode)) <= 128 + && (val & 3) == 0); + } +} + +bool +arm_legitimate_address_p (enum machine_mode mode, rtx x, bool strict_p) +{ + if (TARGET_ARM) + return arm_legitimate_address_outer_p (mode, x, SET, strict_p); + else if (TARGET_THUMB2) + return thumb2_legitimate_address_p (mode, x, strict_p); + else /* if (TARGET_THUMB1) */ + return thumb1_legitimate_address_p (mode, x, strict_p); +} + +/* Build the SYMBOL_REF for __tls_get_addr. */ + +static GTY(()) rtx tls_get_addr_libfunc; + +static rtx +get_tls_get_addr (void) +{ + if (!tls_get_addr_libfunc) + tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr"); + return tls_get_addr_libfunc; +} + +static rtx +arm_load_tp (rtx target) +{ + if (!target) + target = gen_reg_rtx (SImode); + + if (TARGET_HARD_TP) + { + /* Can return in any reg. */ + emit_insn (gen_load_tp_hard (target)); + } + else + { + /* Always returned in r0. Immediately copy the result into a pseudo, + otherwise other uses of r0 (e.g. setting up function arguments) may + clobber the value. */ + + rtx tmp; + + emit_insn (gen_load_tp_soft ()); + + tmp = gen_rtx_REG (SImode, 0); + emit_move_insn (target, tmp); + } + return target; +} + +static rtx +load_tls_operand (rtx x, rtx reg) +{ + rtx tmp; + + if (reg == NULL_RTX) + reg = gen_reg_rtx (SImode); + + tmp = gen_rtx_CONST (SImode, x); + + emit_move_insn (reg, tmp); + + return reg; +} + +static rtx +arm_call_tls_get_addr (rtx x, rtx reg, rtx *valuep, int reloc) +{ + rtx insns, label, labelno, sum; + + start_sequence (); + + labelno = GEN_INT (pic_labelno++); + label = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, labelno), UNSPEC_PIC_LABEL); + label = gen_rtx_CONST (VOIDmode, label); + + sum = gen_rtx_UNSPEC (Pmode, + gen_rtvec (4, x, GEN_INT (reloc), label, + GEN_INT (TARGET_ARM ? 8 : 4)), + UNSPEC_TLS); + reg = load_tls_operand (sum, reg); + + if (TARGET_ARM) + emit_insn (gen_pic_add_dot_plus_eight (reg, reg, labelno)); + else if (TARGET_THUMB2) + emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno)); + else /* TARGET_THUMB1 */ + emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno)); + + *valuep = emit_library_call_value (get_tls_get_addr (), NULL_RTX, LCT_PURE, /* LCT_CONST? 
*/ + Pmode, 1, reg, Pmode); + + insns = get_insns (); + end_sequence (); + + return insns; +} + +rtx +legitimize_tls_address (rtx x, rtx reg) +{ + rtx dest, tp, label, labelno, sum, insns, ret, eqv, addend; + unsigned int model = SYMBOL_REF_TLS_MODEL (x); + + switch (model) + { + case TLS_MODEL_GLOBAL_DYNAMIC: + insns = arm_call_tls_get_addr (x, reg, &ret, TLS_GD32); + dest = gen_reg_rtx (Pmode); + emit_libcall_block (insns, dest, ret, x); + return dest; + + case TLS_MODEL_LOCAL_DYNAMIC: + insns = arm_call_tls_get_addr (x, reg, &ret, TLS_LDM32); + + /* Attach a unique REG_EQUIV, to allow the RTL optimizers to + share the LDM result with other LD model accesses. */ + eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const1_rtx), + UNSPEC_TLS); + dest = gen_reg_rtx (Pmode); + emit_libcall_block (insns, dest, ret, eqv); + + /* Load the addend. */ + addend = gen_rtx_UNSPEC (Pmode, gen_rtvec (2, x, GEN_INT (TLS_LDO32)), + UNSPEC_TLS); + addend = force_reg (SImode, gen_rtx_CONST (SImode, addend)); + return gen_rtx_PLUS (Pmode, dest, addend); + + case TLS_MODEL_INITIAL_EXEC: + labelno = GEN_INT (pic_labelno++); + label = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, labelno), UNSPEC_PIC_LABEL); + label = gen_rtx_CONST (VOIDmode, label); + sum = gen_rtx_UNSPEC (Pmode, + gen_rtvec (4, x, GEN_INT (TLS_IE32), label, + GEN_INT (TARGET_ARM ? 8 : 4)), + UNSPEC_TLS); + reg = load_tls_operand (sum, reg); + + if (TARGET_ARM) + emit_insn (gen_tls_load_dot_plus_eight (reg, reg, labelno)); + else if (TARGET_THUMB2) + emit_insn (gen_tls_load_dot_plus_four (reg, NULL, reg, labelno)); + else + { + emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno)); + emit_move_insn (reg, gen_const_mem (SImode, reg)); + } + + tp = arm_load_tp (NULL_RTX); + + return gen_rtx_PLUS (Pmode, tp, reg); + + case TLS_MODEL_LOCAL_EXEC: + tp = arm_load_tp (NULL_RTX); + + reg = gen_rtx_UNSPEC (Pmode, + gen_rtvec (2, x, GEN_INT (TLS_LE32)), + UNSPEC_TLS); + reg = force_reg (SImode, gen_rtx_CONST (SImode, reg)); + + return gen_rtx_PLUS (Pmode, tp, reg); + + default: + abort (); + } +} + +/* Try machine-dependent ways of modifying an illegitimate address + to be legitimate. If we find one, return the new, valid address. */ +rtx +arm_legitimize_address (rtx x, rtx orig_x, enum machine_mode mode) +{ + if (!TARGET_ARM) + { + /* TODO: legitimize_address for Thumb2. */ + if (TARGET_THUMB2) + return x; + return thumb_legitimize_address (x, orig_x, mode); + } + + if (arm_tls_symbol_p (x)) + return legitimize_tls_address (x, NULL_RTX); + + if (GET_CODE (x) == PLUS) + { + rtx xop0 = XEXP (x, 0); + rtx xop1 = XEXP (x, 1); + + if (CONSTANT_P (xop0) && !symbol_mentioned_p (xop0)) + xop0 = force_reg (SImode, xop0); + + if (CONSTANT_P (xop1) && !symbol_mentioned_p (xop1)) + xop1 = force_reg (SImode, xop1); + + if (ARM_BASE_REGISTER_RTX_P (xop0) + && GET_CODE (xop1) == CONST_INT) + { + HOST_WIDE_INT n, low_n; + rtx base_reg, val; + n = INTVAL (xop1); + + /* VFP addressing modes actually allow greater offsets, but for + now we just stick with the lowest common denominator. */ + if (mode == DImode + || ((TARGET_SOFT_FLOAT || TARGET_VFP) && mode == DFmode)) + { + low_n = n & 0x0f; + n &= ~0x0f; + if (low_n > 4) + { + n += 16; + low_n -= 16; + } + } + else + { + low_n = ((mode) == TImode ? 0 + : n >= 0 ? 
(n & 0xfff) : -((-n) & 0xfff)); + n -= low_n; + } + + base_reg = gen_reg_rtx (SImode); + val = force_operand (plus_constant (xop0, n), NULL_RTX); + emit_move_insn (base_reg, val); + x = plus_constant (base_reg, low_n); + } + else if (xop0 != XEXP (x, 0) || xop1 != XEXP (x, 1)) + x = gen_rtx_PLUS (SImode, xop0, xop1); + } + + /* XXX We don't allow MINUS any more -- see comment in + arm_legitimate_address_outer_p (). */ + else if (GET_CODE (x) == MINUS) + { + rtx xop0 = XEXP (x, 0); + rtx xop1 = XEXP (x, 1); + + if (CONSTANT_P (xop0)) + xop0 = force_reg (SImode, xop0); + + if (CONSTANT_P (xop1) && ! symbol_mentioned_p (xop1)) + xop1 = force_reg (SImode, xop1); + + if (xop0 != XEXP (x, 0) || xop1 != XEXP (x, 1)) + x = gen_rtx_MINUS (SImode, xop0, xop1); + } + + /* Make sure to take full advantage of the pre-indexed addressing mode + with absolute addresses which often allows for the base register to + be factorized for multiple adjacent memory references, and it might + even allows for the mini pool to be avoided entirely. */ + else if (GET_CODE (x) == CONST_INT && optimize > 0) + { + unsigned int bits; + HOST_WIDE_INT mask, base, index; + rtx base_reg; + + /* ldr and ldrb can use a 12-bit index, ldrsb and the rest can only + use a 8-bit index. So let's use a 12-bit index for SImode only and + hope that arm_gen_constant will enable ldrb to use more bits. */ + bits = (mode == SImode) ? 12 : 8; + mask = (1 << bits) - 1; + base = INTVAL (x) & ~mask; + index = INTVAL (x) & mask; + if (bit_count (base & 0xffffffff) > (32 - bits)/2) + { + /* It'll most probably be more efficient to generate the base + with more bits set and use a negative index instead. */ + base |= mask; + index -= mask; + } + base_reg = force_reg (SImode, GEN_INT (base)); + x = plus_constant (base_reg, index); + } + + if (flag_pic) + { + /* We need to find and carefully transform any SYMBOL and LABEL + references; so go back to the original address expression. */ + rtx new_x = legitimize_pic_address (orig_x, mode, NULL_RTX); + + if (new_x != orig_x) + x = new_x; + } + + return x; +} + + +/* Try machine-dependent ways of modifying an illegitimate Thumb address + to be legitimate. If we find one, return the new, valid address. */ +rtx +thumb_legitimize_address (rtx x, rtx orig_x, enum machine_mode mode) { - /* ??? Not clear if this is right. Experiment. */ - if (GET_MODE_SIZE (mode) < 4 - && !(reload_in_progress || reload_completed) - && (reg_mentioned_p (frame_pointer_rtx, x) - || reg_mentioned_p (arg_pointer_rtx, x) - || reg_mentioned_p (virtual_incoming_args_rtx, x) - || reg_mentioned_p (virtual_outgoing_args_rtx, x) - || reg_mentioned_p (virtual_stack_dynamic_rtx, x) - || reg_mentioned_p (virtual_stack_vars_rtx, x))) - return 0; + if (arm_tls_symbol_p (x)) + return legitimize_tls_address (x, NULL_RTX); - /* Accept any base register. SP only in SImode or larger. */ - else if (thumb1_base_register_rtx_p (x, mode, strict_p)) - return 1; + if (GET_CODE (x) == PLUS + && GET_CODE (XEXP (x, 1)) == CONST_INT + && (INTVAL (XEXP (x, 1)) >= 32 * GET_MODE_SIZE (mode) + || INTVAL (XEXP (x, 1)) < 0)) + { + rtx xop0 = XEXP (x, 0); + rtx xop1 = XEXP (x, 1); + HOST_WIDE_INT offset = INTVAL (xop1); - /* This is PC relative data before arm_reorg runs. */ - else if (GET_MODE_SIZE (mode) >= 4 && CONSTANT_P (x) - && GET_CODE (x) == SYMBOL_REF - && CONSTANT_POOL_ADDRESS_P (x) && !flag_pic) - return 1; + /* Try and fold the offset into a biasing of the base register and + then offsetting that. 
Don't do this when optimizing for space + since it can cause too many CSEs. */ + if (optimize_size && offset >= 0 + && offset < 256 + 31 * GET_MODE_SIZE (mode)) + { + HOST_WIDE_INT delta; - /* This is PC relative data after arm_reorg runs. */ - else if (GET_MODE_SIZE (mode) >= 4 && reload_completed - && (GET_CODE (x) == LABEL_REF - || (GET_CODE (x) == CONST - && GET_CODE (XEXP (x, 0)) == PLUS - && GET_CODE (XEXP (XEXP (x, 0), 0)) == LABEL_REF - && GET_CODE (XEXP (XEXP (x, 0), 1)) == CONST_INT))) - return 1; + if (offset >= 256) + delta = offset - (256 - GET_MODE_SIZE (mode)); + else if (offset < 32 * GET_MODE_SIZE (mode) + 8) + delta = 31 * GET_MODE_SIZE (mode); + else + delta = offset & (~31 * GET_MODE_SIZE (mode)); - /* Post-inc indexing only supported for SImode and larger. */ - else if (GET_CODE (x) == POST_INC && GET_MODE_SIZE (mode) >= 4 - && thumb1_index_register_rtx_p (XEXP (x, 0), strict_p)) - return 1; + xop0 = force_operand (plus_constant (xop0, offset - delta), + NULL_RTX); + x = plus_constant (xop0, delta); + } + else if (offset < 0 && offset > -256) + /* Small negative offsets are best done with a subtract before the + dereference, forcing these into a register normally takes two + instructions. */ + x = force_operand (x, NULL_RTX); + else + { + /* For the remaining cases, force the constant into a register. */ + xop1 = force_reg (SImode, xop1); + x = gen_rtx_PLUS (SImode, xop0, xop1); + } + } + else if (GET_CODE (x) == PLUS + && s_register_operand (XEXP (x, 1), SImode) + && !s_register_operand (XEXP (x, 0), SImode)) + { + rtx xop0 = force_operand (XEXP (x, 0), NULL_RTX); - else if (GET_CODE (x) == PLUS) + x = gen_rtx_PLUS (SImode, xop0, XEXP (x, 1)); + } + + if (flag_pic) { - /* REG+REG address can be any two index registers. */ - /* We disallow FRAME+REG addressing since we know that FRAME - will be replaced with STACK, and SP relative addressing only - permits SP+OFFSET. */ - if (GET_MODE_SIZE (mode) <= 4 - && XEXP (x, 0) != frame_pointer_rtx - && XEXP (x, 1) != frame_pointer_rtx - && thumb1_index_register_rtx_p (XEXP (x, 0), strict_p) - && thumb1_index_register_rtx_p (XEXP (x, 1), strict_p)) - return 1; + /* We need to find and carefully transform any SYMBOL and LABEL + references; so go back to the original address expression. */ + rtx new_x = legitimize_pic_address (orig_x, mode, NULL_RTX); - /* REG+const has 5-7 bit offset for non-SP registers. */ - else if ((thumb1_index_register_rtx_p (XEXP (x, 0), strict_p) - || XEXP (x, 0) == arg_pointer_rtx) - && GET_CODE (XEXP (x, 1)) == CONST_INT - && thumb_legitimate_offset_p (mode, INTVAL (XEXP (x, 1)))) - return 1; + if (new_x != orig_x) + x = new_x; + } - /* REG+const has 10-bit offset for SP, but only SImode and - larger is supported. */ - /* ??? Should probably check for DI/DFmode overflow here - just like GO_IF_LEGITIMATE_OFFSET does. 
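As a standalone illustration of the constant splitting that arm_legitimize_address performs a little above for absolute addresses: the low 12 bits (SImode, i.e. ldr/ldrb) or 8 bits (the other narrow accesses) are kept as the load/store offset and the rest goes into a base register, switching to a negative offset when that makes the base cheaper to materialise. popcount32 and the simplified signature are mine; the real code works on HOST_WIDE_INT and uses bit_count.

#include <stdint.h>

static int
popcount32 (uint32_t v)
{
  int n = 0;
  for (; v; v &= v - 1)
    n++;
  return n;
}

/* bits is 12 for SImode and 8 for the narrower accesses.  */
static void
split_absolute_address (int32_t addr, int bits, int32_t *base, int32_t *index)
{
  int32_t mask = (1 << bits) - 1;

  *base = addr & ~mask;
  *index = addr & mask;

  /* If materialising the base would need too many 1-bits, set the low
     bits in the base instead and use a negative index.  */
  if (popcount32 ((uint32_t) *base) > (32 - bits) / 2)
    {
      *base |= mask;
      *index -= mask;
    }
}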
*/ - else if (GET_CODE (XEXP (x, 0)) == REG - && REGNO (XEXP (x, 0)) == STACK_POINTER_REGNUM - && GET_MODE_SIZE (mode) >= 4 - && GET_CODE (XEXP (x, 1)) == CONST_INT - && INTVAL (XEXP (x, 1)) >= 0 - && INTVAL (XEXP (x, 1)) + GET_MODE_SIZE (mode) <= 1024 - && (INTVAL (XEXP (x, 1)) & 3) == 0) - return 1; + return x; +} - else if (GET_CODE (XEXP (x, 0)) == REG - && (REGNO (XEXP (x, 0)) == FRAME_POINTER_REGNUM - || REGNO (XEXP (x, 0)) == ARG_POINTER_REGNUM - || (REGNO (XEXP (x, 0)) >= FIRST_VIRTUAL_REGISTER - && REGNO (XEXP (x, 0)) <= LAST_VIRTUAL_REGISTER)) - && GET_MODE_SIZE (mode) >= 4 - && GET_CODE (XEXP (x, 1)) == CONST_INT - && (INTVAL (XEXP (x, 1)) & 3) == 0) - return 1; +rtx +thumb_legitimize_reload_address (rtx *x_p, + enum machine_mode mode, + int opnum, int type, + int ind_levels ATTRIBUTE_UNUSED) +{ + rtx x = *x_p; + + if (GET_CODE (x) == PLUS + && GET_MODE_SIZE (mode) < 4 + && REG_P (XEXP (x, 0)) + && XEXP (x, 0) == stack_pointer_rtx + && GET_CODE (XEXP (x, 1)) == CONST_INT + && !thumb_legitimate_offset_p (mode, INTVAL (XEXP (x, 1)))) + { + rtx orig_x = x; + + x = copy_rtx (x); + push_reload (orig_x, NULL_RTX, x_p, NULL, MODE_BASE_REG_CLASS (mode), + Pmode, VOIDmode, 0, 0, opnum, (enum reload_type) type); + return x; } - else if (GET_MODE_CLASS (mode) != MODE_FLOAT - && GET_MODE_SIZE (mode) == 4 - && GET_CODE (x) == SYMBOL_REF - && CONSTANT_POOL_ADDRESS_P (x) - && ! (flag_pic - && symbol_mentioned_p (get_pool_constant (x)) - && ! pcrel_constant_p (get_pool_constant (x)))) - return 1; + /* If both registers are hi-regs, then it's better to reload the + entire expression rather than each register individually. That + only requires one reload register rather than two. */ + if (GET_CODE (x) == PLUS + && REG_P (XEXP (x, 0)) + && REG_P (XEXP (x, 1)) + && !REG_MODE_OK_FOR_REG_BASE_P (XEXP (x, 0), mode) + && !REG_MODE_OK_FOR_REG_BASE_P (XEXP (x, 1), mode)) + { + rtx orig_x = x; + + x = copy_rtx (x); + push_reload (orig_x, NULL_RTX, x_p, NULL, MODE_BASE_REG_CLASS (mode), + Pmode, VOIDmode, 0, 0, opnum, (enum reload_type) type); + return x; + } + + return NULL; +} + +/* Test for various thread-local symbols. */ + +/* Return TRUE if X is a thread-local symbol. */ + +static bool +arm_tls_symbol_p (rtx x) +{ + if (! TARGET_HAVE_TLS) + return false; - return 0; + if (GET_CODE (x) != SYMBOL_REF) + return false; + + return SYMBOL_REF_TLS_MODEL (x) != 0; } -/* Return nonzero if VAL can be used as an offset in a Thumb-state address - instruction of mode MODE. */ -int -thumb_legitimate_offset_p (enum machine_mode mode, HOST_WIDE_INT val) +/* Helper for arm_tls_referenced_p. */ + +static int +arm_tls_operand_p_1 (rtx *x, void *data ATTRIBUTE_UNUSED) { - switch (GET_MODE_SIZE (mode)) - { - case 1: - return val >= 0 && val < 32; + if (GET_CODE (*x) == SYMBOL_REF) + return SYMBOL_REF_TLS_MODEL (*x) != 0; - case 2: - return val >= 0 && val < 64 && (val & 1) == 0; + /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are + TLS offsets, not real symbol references. */ + if (GET_CODE (*x) == UNSPEC + && XINT (*x, 1) == UNSPEC_TLS) + return -1; - default: - return (val >= 0 - && (val + GET_MODE_SIZE (mode)) <= 128 - && (val & 3) == 0); - } + return 0; } -/* Build the SYMBOL_REF for __tls_get_addr. */ - -static GTY(()) rtx tls_get_addr_libfunc; +/* Return TRUE if X contains any TLS symbol references. 
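arm_tls_operand_p_1 above is written as a for_each_rtx callback (the call site is in arm_tls_referenced_p just below): returning 0 keeps walking, a negative value skips the sub-expressions of the current node (used for UNSPEC_TLS, whose operands are TLS offsets rather than symbol references), and any other non-zero value stops the walk with that result. A toy stand-in for that shape, using made-up node types instead of real RTL:

#include <stddef.h>

enum toy_code { TOY_SYMBOL, TOY_TLS_UNSPEC, TOY_PLUS };

struct toy_node
{
  enum toy_code code;
  int is_tls_symbol;         /* meaningful for TOY_SYMBOL only */
  struct toy_node *op[2];    /* children, NULL when absent     */
};

/* Returns 1 if a TLS symbol occurs anywhere outside a TLS unspec.  */
static int
contains_tls_symbol (const struct toy_node *n)
{
  if (!n)
    return 0;
  if (n->code == TOY_SYMBOL)
    return n->is_tls_symbol;
  if (n->code == TOY_TLS_UNSPEC)
    return 0;                /* mirrors the "return -1": don't look inside */
  /* Any other node: keep walking its operands.  */
  return contains_tls_symbol (n->op[0]) || contains_tls_symbol (n->op[1]);
}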
*/ -static rtx -get_tls_get_addr (void) +bool +arm_tls_referenced_p (rtx x) { - if (!tls_get_addr_libfunc) - tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr"); - return tls_get_addr_libfunc; + if (! TARGET_HAVE_TLS) + return false; + + return for_each_rtx (&x, arm_tls_operand_p_1, NULL); } -static rtx -arm_load_tp (rtx target) +/* Implement TARGET_CANNOT_FORCE_CONST_MEM. */ + +bool +arm_cannot_force_const_mem (rtx x) { - if (!target) - target = gen_reg_rtx (SImode); + rtx base, offset; - if (TARGET_HARD_TP) + if (ARM_OFFSETS_MUST_BE_WITHIN_SECTIONS_P) { - /* Can return in any reg. */ - emit_insn (gen_load_tp_hard (target)); + split_const (x, &base, &offset); + if (GET_CODE (base) == SYMBOL_REF + && !offset_within_block_p (base, INTVAL (offset))) + return true; } - else - { - /* Always returned in r0. Immediately copy the result into a pseudo, - otherwise other uses of r0 (e.g. setting up function arguments) may - clobber the value. */ + return arm_tls_referenced_p (x); +} + +#define REG_OR_SUBREG_REG(X) \ + (GET_CODE (X) == REG \ + || (GET_CODE (X) == SUBREG && GET_CODE (SUBREG_REG (X)) == REG)) - rtx tmp; +#define REG_OR_SUBREG_RTX(X) \ + (GET_CODE (X) == REG ? (X) : SUBREG_REG (X)) - emit_insn (gen_load_tp_soft ()); +static inline int +thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer) +{ + enum machine_mode mode = GET_MODE (x); + int total; - tmp = gen_rtx_REG (SImode, 0); - emit_move_insn (target, tmp); - } - return target; -} + switch (code) + { + case ASHIFT: + case ASHIFTRT: + case LSHIFTRT: + case ROTATERT: + case PLUS: + case MINUS: + case COMPARE: + case NEG: + case NOT: + return COSTS_N_INSNS (1); -static rtx -load_tls_operand (rtx x, rtx reg) -{ - rtx tmp; + case MULT: + if (GET_CODE (XEXP (x, 1)) == CONST_INT) + { + int cycles = 0; + unsigned HOST_WIDE_INT i = INTVAL (XEXP (x, 1)); - if (reg == NULL_RTX) - reg = gen_reg_rtx (SImode); + while (i) + { + i >>= 2; + cycles++; + } + return COSTS_N_INSNS (2) + cycles; + } + return COSTS_N_INSNS (1) + 16; - tmp = gen_rtx_CONST (SImode, x); + case SET: + return (COSTS_N_INSNS (1) + + 4 * ((GET_CODE (SET_SRC (x)) == MEM) + + GET_CODE (SET_DEST (x)) == MEM)); - emit_move_insn (reg, tmp); + case CONST_INT: + if (outer == SET) + { + if ((unsigned HOST_WIDE_INT) INTVAL (x) < 256) + return 0; + if (thumb_shiftable_const (INTVAL (x))) + return COSTS_N_INSNS (2); + return COSTS_N_INSNS (3); + } + else if ((outer == PLUS || outer == COMPARE) + && INTVAL (x) < 256 && INTVAL (x) > -256) + return 0; + else if ((outer == IOR || outer == XOR || outer == AND) + && INTVAL (x) < 256 && INTVAL (x) >= -256) + return COSTS_N_INSNS (1); + else if (outer == AND) + { + int i; + /* This duplicates the tests in the andsi3 expander. 
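The MULT case of thumb1_rtx_costs above prices a multiply by a constant as a two-insn base cost plus one step per non-zero two-bit chunk of the multiplier. The same loop in isolation, with hypothetical names and a small driver:

#include <stdio.h>

static int
thumb1_mul_const_steps (unsigned long long multiplier)
{
  int cycles = 0;

  while (multiplier)
    {
      multiplier >>= 2;      /* one step per two bits of the multiplier */
      cycles++;
    }
  return cycles;
}

int
main (void)
{
  /* 0xFF needs 4 steps, 0xFFFF needs 8, mirroring the loop above.  */
  printf ("%d %d\n",
          thumb1_mul_const_steps (0xFF),
          thumb1_mul_const_steps (0xFFFF));
  return 0;
}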
*/ + for (i = 9; i <= 31; i++) + if ((((HOST_WIDE_INT) 1) << i) - 1 == INTVAL (x) + || (((HOST_WIDE_INT) 1) << i) - 1 == ~INTVAL (x)) + return COSTS_N_INSNS (2); + } + else if (outer == ASHIFT || outer == ASHIFTRT + || outer == LSHIFTRT) + return 0; + return COSTS_N_INSNS (2); - return reg; -} + case CONST: + case CONST_DOUBLE: + case LABEL_REF: + case SYMBOL_REF: + return COSTS_N_INSNS (3); -static rtx -arm_call_tls_get_addr (rtx x, rtx reg, rtx *valuep, int reloc) -{ - rtx insns, label, labelno, sum; + case UDIV: + case UMOD: + case DIV: + case MOD: + return 100; - start_sequence (); + case TRUNCATE: + return 99; - labelno = GEN_INT (pic_labelno++); - label = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, labelno), UNSPEC_PIC_LABEL); - label = gen_rtx_CONST (VOIDmode, label); + case AND: + case XOR: + case IOR: + /* XXX guess. */ + return 8; - sum = gen_rtx_UNSPEC (Pmode, - gen_rtvec (4, x, GEN_INT (reloc), label, - GEN_INT (TARGET_ARM ? 8 : 4)), - UNSPEC_TLS); - reg = load_tls_operand (sum, reg); + case MEM: + /* XXX another guess. */ + /* Memory costs quite a lot for the first word, but subsequent words + load at the equivalent of a single insn each. */ + return (10 + 4 * ((GET_MODE_SIZE (mode) - 1) / UNITS_PER_WORD) + + ((GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)) + ? 4 : 0)); - if (TARGET_ARM) - emit_insn (gen_pic_add_dot_plus_eight (reg, reg, labelno)); - else if (TARGET_THUMB2) - { - rtx tmp; - /* Thumb-2 only allows very limited access to the PC. Calculate - the address in a temporary register. */ - tmp = gen_reg_rtx (SImode); - emit_insn (gen_pic_load_dot_plus_four (tmp, labelno)); - emit_insn (gen_addsi3(reg, reg, tmp)); - } - else /* TARGET_THUMB1 */ - emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno)); + case IF_THEN_ELSE: + /* XXX a guess. */ + if (GET_CODE (XEXP (x, 1)) == PC || GET_CODE (XEXP (x, 2)) == PC) + return 14; + return 2; - *valuep = emit_library_call_value (get_tls_get_addr (), NULL_RTX, LCT_PURE, /* LCT_CONST? */ - Pmode, 1, reg, Pmode); + case SIGN_EXTEND: + case ZERO_EXTEND: + total = mode == DImode ? COSTS_N_INSNS (1) : 0; + total += thumb1_rtx_costs (XEXP (x, 0), GET_CODE (XEXP (x, 0)), code); - insns = get_insns (); - end_sequence (); + if (mode == SImode) + return total; - return insns; + if (arm_arch6) + return total + COSTS_N_INSNS (1); + + /* Assume a two-shift sequence. Increase the cost slightly so + we prefer actual shifts over an extend operation. */ + return total + 1 + COSTS_N_INSNS (2); + + default: + return 99; + } } -rtx -legitimize_tls_address (rtx x, rtx reg) +static inline bool +arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed) { - rtx dest, tp, label, labelno, sum, insns, ret, eqv, addend; - unsigned int model = SYMBOL_REF_TLS_MODEL (x); + enum machine_mode mode = GET_MODE (x); + enum rtx_code subcode; + rtx operand; + enum rtx_code code = GET_CODE (x); + *total = 0; - switch (model) + switch (code) { - case TLS_MODEL_GLOBAL_DYNAMIC: - insns = arm_call_tls_get_addr (x, reg, &ret, TLS_GD32); - dest = gen_reg_rtx (Pmode); - emit_libcall_block (insns, dest, ret, x); - return dest; + case MEM: + /* Memory costs quite a lot for the first word, but subsequent words + load at the equivalent of a single insn each. 
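For reference, the Thumb-1 MEM guess a few lines up (10 for the first word, 4 per additional word, plus 4 for literal-pool references) reduces to a one-line formula. The sketch below assumes UNITS_PER_WORD is 4 and uses invented names; it is an illustration, not the target hook.

/* Editorial sketch of the Thumb-1 memory-cost guess.  */
static int
thumb1_mem_cost (int mode_size_bytes, int is_literal_pool_ref)
{
  const int units_per_word = 4;   /* assumed word size in bytes */

  return 10
         + 4 * ((mode_size_bytes - 1) / units_per_word)
         + (is_literal_pool_ref ? 4 : 0);
}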
*/ + *total = COSTS_N_INSNS (2 + ARM_NUM_REGS (mode)); + return true; - case TLS_MODEL_LOCAL_DYNAMIC: - insns = arm_call_tls_get_addr (x, reg, &ret, TLS_LDM32); + case DIV: + case MOD: + case UDIV: + case UMOD: + if (TARGET_HARD_FLOAT && mode == SFmode) + *total = COSTS_N_INSNS (2); + else if (TARGET_HARD_FLOAT && mode == DFmode && !TARGET_VFP_SINGLE) + *total = COSTS_N_INSNS (4); + else + *total = COSTS_N_INSNS (20); + return false; - /* Attach a unique REG_EQUIV, to allow the RTL optimizers to - share the LDM result with other LD model accesses. */ - eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const1_rtx), - UNSPEC_TLS); - dest = gen_reg_rtx (Pmode); - emit_libcall_block (insns, dest, ret, eqv); + case ROTATE: + if (GET_CODE (XEXP (x, 1)) == REG) + *total = COSTS_N_INSNS (1); /* Need to subtract from 32 */ + else if (GET_CODE (XEXP (x, 1)) != CONST_INT) + *total = rtx_cost (XEXP (x, 1), code, speed); + + /* Fall through */ + case ROTATERT: + if (mode != SImode) + { + *total += COSTS_N_INSNS (4); + return true; + } + + /* Fall through */ + case ASHIFT: case LSHIFTRT: case ASHIFTRT: + *total += rtx_cost (XEXP (x, 0), code, speed); + if (mode == DImode) + { + *total += COSTS_N_INSNS (3); + return true; + } - /* Load the addend. */ - addend = gen_rtx_UNSPEC (Pmode, gen_rtvec (2, x, GEN_INT (TLS_LDO32)), - UNSPEC_TLS); - addend = force_reg (SImode, gen_rtx_CONST (SImode, addend)); - return gen_rtx_PLUS (Pmode, dest, addend); + *total += COSTS_N_INSNS (1); + /* Increase the cost of complex shifts because they aren't any faster, + and reduce dual issue opportunities. */ + if (arm_tune_cortex_a9 + && outer != SET && GET_CODE (XEXP (x, 1)) != CONST_INT) + ++*total; - case TLS_MODEL_INITIAL_EXEC: - labelno = GEN_INT (pic_labelno++); - label = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, labelno), UNSPEC_PIC_LABEL); - label = gen_rtx_CONST (VOIDmode, label); - sum = gen_rtx_UNSPEC (Pmode, - gen_rtvec (4, x, GEN_INT (TLS_IE32), label, - GEN_INT (TARGET_ARM ? 8 : 4)), - UNSPEC_TLS); - reg = load_tls_operand (sum, reg); + return true; - if (TARGET_ARM) - emit_insn (gen_tls_load_dot_plus_eight (reg, reg, labelno)); - else if (TARGET_THUMB2) + case MINUS: + if (mode == DImode) { - rtx tmp; - /* Thumb-2 only allows very limited access to the PC. Calculate - the address in a temporary register. 
*/ - tmp = gen_reg_rtx (SImode); - emit_insn (gen_pic_load_dot_plus_four (tmp, labelno)); - emit_insn (gen_addsi3(reg, reg, tmp)); - emit_move_insn (reg, gen_const_mem (SImode, reg)); + *total = COSTS_N_INSNS (ARM_NUM_REGS (mode)); + if (GET_CODE (XEXP (x, 0)) == CONST_INT + && const_ok_for_arm (INTVAL (XEXP (x, 0)))) + { + *total += rtx_cost (XEXP (x, 1), code, speed); + return true; + } + + if (GET_CODE (XEXP (x, 1)) == CONST_INT + && const_ok_for_arm (INTVAL (XEXP (x, 1)))) + { + *total += rtx_cost (XEXP (x, 0), code, speed); + return true; + } + + return false; } - else + + if (GET_MODE_CLASS (mode) == MODE_FLOAT) { - emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno)); - emit_move_insn (reg, gen_const_mem (SImode, reg)); - } + if (TARGET_HARD_FLOAT + && (mode == SFmode + || (mode == DFmode && !TARGET_VFP_SINGLE))) + { + *total = COSTS_N_INSNS (1); + if (GET_CODE (XEXP (x, 0)) == CONST_DOUBLE + && arm_const_double_rtx (XEXP (x, 0))) + { + *total += rtx_cost (XEXP (x, 1), code, speed); + return true; + } - tp = arm_load_tp (NULL_RTX); + if (GET_CODE (XEXP (x, 1)) == CONST_DOUBLE + && arm_const_double_rtx (XEXP (x, 1))) + { + *total += rtx_cost (XEXP (x, 0), code, speed); + return true; + } - return gen_rtx_PLUS (Pmode, tp, reg); + return false; + } + *total = COSTS_N_INSNS (20); + return false; + } - case TLS_MODEL_LOCAL_EXEC: - tp = arm_load_tp (NULL_RTX); + *total = COSTS_N_INSNS (1); + if (GET_CODE (XEXP (x, 0)) == CONST_INT + && const_ok_for_arm (INTVAL (XEXP (x, 0)))) + { + *total += rtx_cost (XEXP (x, 1), code, speed); + return true; + } - reg = gen_rtx_UNSPEC (Pmode, - gen_rtvec (2, x, GEN_INT (TLS_LE32)), - UNSPEC_TLS); - reg = force_reg (SImode, gen_rtx_CONST (SImode, reg)); + subcode = GET_CODE (XEXP (x, 1)); + if (subcode == ASHIFT || subcode == ASHIFTRT + || subcode == LSHIFTRT + || subcode == ROTATE || subcode == ROTATERT) + { + *total += rtx_cost (XEXP (x, 0), code, speed); + *total += rtx_cost (XEXP (XEXP (x, 1), 0), subcode, speed); + return true; + } - return gen_rtx_PLUS (Pmode, tp, reg); + /* A shift as a part of RSB costs no more than RSB itself. */ + if (GET_CODE (XEXP (x, 0)) == MULT + && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode)) + { + *total += rtx_cost (XEXP (XEXP (x, 0), 0), code, speed); + *total += rtx_cost (XEXP (x, 1), code, speed); + return true; + } - default: - abort (); - } -} + if (subcode == MULT + && power_of_two_operand (XEXP (XEXP (x, 1), 1), SImode)) + { + *total += rtx_cost (XEXP (x, 0), code, speed); + *total += rtx_cost (XEXP (XEXP (x, 1), 0), subcode, speed); + return true; + } -/* Try machine-dependent ways of modifying an illegitimate address - to be legitimate. If we find one, return the new, valid address. 
*/ -rtx -arm_legitimize_address (rtx x, rtx orig_x, enum machine_mode mode) -{ - if (arm_tls_symbol_p (x)) - return legitimize_tls_address (x, NULL_RTX); + if (GET_RTX_CLASS (GET_CODE (XEXP (x, 1))) == RTX_COMPARE + || GET_RTX_CLASS (GET_CODE (XEXP (x, 1))) == RTX_COMM_COMPARE) + { + *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, speed); + if (GET_CODE (XEXP (XEXP (x, 1), 0)) == REG + && REGNO (XEXP (XEXP (x, 1), 0)) != CC_REGNUM) + *total += COSTS_N_INSNS (1); - if (GET_CODE (x) == PLUS) - { - rtx xop0 = XEXP (x, 0); - rtx xop1 = XEXP (x, 1); + return true; + } - if (CONSTANT_P (xop0) && !symbol_mentioned_p (xop0)) - xop0 = force_reg (SImode, xop0); + /* Fall through */ - if (CONSTANT_P (xop1) && !symbol_mentioned_p (xop1)) - xop1 = force_reg (SImode, xop1); + case PLUS: + if (code == PLUS && arm_arch6 && mode == SImode + && (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND + || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)) + { + *total = COSTS_N_INSNS (1); + *total += rtx_cost (XEXP (XEXP (x, 0), 0), GET_CODE (XEXP (x, 0)), + speed); + *total += rtx_cost (XEXP (x, 1), code, speed); + return true; + } - if (ARM_BASE_REGISTER_RTX_P (xop0) - && GET_CODE (xop1) == CONST_INT) + /* MLA: All arguments must be registers. We filter out + multiplication by a power of two, so that we fall down into + the code below. */ + if (GET_CODE (XEXP (x, 0)) == MULT + && !power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode)) { - HOST_WIDE_INT n, low_n; - rtx base_reg, val; - n = INTVAL (xop1); + /* The cost comes from the cost of the multiply. */ + return false; + } - /* VFP addressing modes actually allow greater offsets, but for - now we just stick with the lowest common denominator. */ - if (mode == DImode - || ((TARGET_SOFT_FLOAT || TARGET_VFP) && mode == DFmode)) + if (GET_MODE_CLASS (mode) == MODE_FLOAT) + { + if (TARGET_HARD_FLOAT + && (mode == SFmode + || (mode == DFmode && !TARGET_VFP_SINGLE))) { - low_n = n & 0x0f; - n &= ~0x0f; - if (low_n > 4) + *total = COSTS_N_INSNS (1); + if (GET_CODE (XEXP (x, 1)) == CONST_DOUBLE + && arm_const_double_rtx (XEXP (x, 1))) { - n += 16; - low_n -= 16; + *total += rtx_cost (XEXP (x, 0), code, speed); + return true; } - } - else - { - low_n = ((mode) == TImode ? 0 - : n >= 0 ? (n & 0xfff) : -((-n) & 0xfff)); - n -= low_n; + + return false; } - base_reg = gen_reg_rtx (SImode); - val = force_operand (plus_constant (xop0, n), NULL_RTX); - emit_move_insn (base_reg, val); - x = plus_constant (base_reg, low_n); + *total = COSTS_N_INSNS (20); + return false; } - else if (xop0 != XEXP (x, 0) || xop1 != XEXP (x, 1)) - x = gen_rtx_PLUS (SImode, xop0, xop1); - } - /* XXX We don't allow MINUS any more -- see comment in - arm_legitimate_address_p (). */ - else if (GET_CODE (x) == MINUS) - { - rtx xop0 = XEXP (x, 0); - rtx xop1 = XEXP (x, 1); + if (GET_RTX_CLASS (GET_CODE (XEXP (x, 0))) == RTX_COMPARE + || GET_RTX_CLASS (GET_CODE (XEXP (x, 0))) == RTX_COMM_COMPARE) + { + *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 1), code, speed); + if (GET_CODE (XEXP (XEXP (x, 0), 0)) == REG + && REGNO (XEXP (XEXP (x, 0), 0)) != CC_REGNUM) + *total += COSTS_N_INSNS (1); + return true; + } - if (CONSTANT_P (xop0)) - xop0 = force_reg (SImode, xop0); + /* Fall through */ - if (CONSTANT_P (xop1) && ! 
symbol_mentioned_p (xop1)) - xop1 = force_reg (SImode, xop1); + case AND: case XOR: case IOR: - if (xop0 != XEXP (x, 0) || xop1 != XEXP (x, 1)) - x = gen_rtx_MINUS (SImode, xop0, xop1); - } + /* Normally the frame registers will be spilt into reg+const during + reload, so it is a bad idea to combine them with other instructions, + since then they might not be moved outside of loops. As a compromise + we allow integration with ops that have a constant as their second + operand. */ + if (REG_OR_SUBREG_REG (XEXP (x, 0)) + && ARM_FRAME_RTX (REG_OR_SUBREG_RTX (XEXP (x, 0))) + && GET_CODE (XEXP (x, 1)) != CONST_INT) + *total = COSTS_N_INSNS (1); - /* Make sure to take full advantage of the pre-indexed addressing mode - with absolute addresses which often allows for the base register to - be factorized for multiple adjacent memory references, and it might - even allows for the mini pool to be avoided entirely. */ - else if (GET_CODE (x) == CONST_INT && optimize > 0) - { - unsigned int bits; - HOST_WIDE_INT mask, base, index; - rtx base_reg; + if (mode == DImode) + { + *total += COSTS_N_INSNS (2); + if (GET_CODE (XEXP (x, 1)) == CONST_INT + && const_ok_for_op (INTVAL (XEXP (x, 1)), code)) + { + *total += rtx_cost (XEXP (x, 0), code, speed); + return true; + } - /* ldr and ldrb can use a 12-bit index, ldrsb and the rest can only - use a 8-bit index. So let's use a 12-bit index for SImode only and - hope that arm_gen_constant will enable ldrb to use more bits. */ - bits = (mode == SImode) ? 12 : 8; - mask = (1 << bits) - 1; - base = INTVAL (x) & ~mask; - index = INTVAL (x) & mask; - if (bit_count (base & 0xffffffff) > (32 - bits)/2) - { - /* It'll most probably be more efficient to generate the base - with more bits set and use a negative index instead. */ - base |= mask; - index -= mask; + return false; } - base_reg = force_reg (SImode, GEN_INT (base)); - x = plus_constant (base_reg, index); - } - - if (flag_pic) - { - /* We need to find and carefully transform any SYMBOL and LABEL - references; so go back to the original address expression. */ - rtx new_x = legitimize_pic_address (orig_x, mode, NULL_RTX); - if (new_x != orig_x) - x = new_x; - } + *total += COSTS_N_INSNS (1); + if (GET_CODE (XEXP (x, 1)) == CONST_INT + && const_ok_for_op (INTVAL (XEXP (x, 1)), code)) + { + *total += rtx_cost (XEXP (x, 0), code, speed); + return true; + } + subcode = GET_CODE (XEXP (x, 0)); + if (subcode == ASHIFT || subcode == ASHIFTRT + || subcode == LSHIFTRT + || subcode == ROTATE || subcode == ROTATERT) + { + *total += rtx_cost (XEXP (x, 1), code, speed); + *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, speed); + return true; + } - return x; -} + if (subcode == MULT + && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode)) + { + *total += rtx_cost (XEXP (x, 1), code, speed); + *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, speed); + return true; + } + if (subcode == UMIN || subcode == UMAX + || subcode == SMIN || subcode == SMAX) + { + *total = COSTS_N_INSNS (3); + return true; + } -/* Try machine-dependent ways of modifying an illegitimate Thumb address - to be legitimate. If we find one, return the new, valid address. 
*/ -rtx -thumb_legitimize_address (rtx x, rtx orig_x, enum machine_mode mode) -{ - if (arm_tls_symbol_p (x)) - return legitimize_tls_address (x, NULL_RTX); + return false; - if (GET_CODE (x) == PLUS - && GET_CODE (XEXP (x, 1)) == CONST_INT - && (INTVAL (XEXP (x, 1)) >= 32 * GET_MODE_SIZE (mode) - || INTVAL (XEXP (x, 1)) < 0)) - { - rtx xop0 = XEXP (x, 0); - rtx xop1 = XEXP (x, 1); - HOST_WIDE_INT offset = INTVAL (xop1); + case MULT: + /* This should have been handled by the CPU specific routines. */ + gcc_unreachable (); - /* Try and fold the offset into a biasing of the base register and - then offsetting that. Don't do this when optimizing for space - since it can cause too many CSEs. */ - if (optimize_size && offset >= 0 - && offset < 256 + 31 * GET_MODE_SIZE (mode)) + case TRUNCATE: + if (arm_arch3m && mode == SImode + && GET_CODE (XEXP (x, 0)) == LSHIFTRT + && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT + && (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) + == GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1))) + && (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND + || GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND)) { - HOST_WIDE_INT delta; + *total = rtx_cost (XEXP (XEXP (x, 0), 0), LSHIFTRT, speed); + return true; + } + *total = COSTS_N_INSNS (2); /* Plus the cost of the MULT */ + return false; - if (offset >= 256) - delta = offset - (256 - GET_MODE_SIZE (mode)); - else if (offset < 32 * GET_MODE_SIZE (mode) + 8) - delta = 31 * GET_MODE_SIZE (mode); - else - delta = offset & (~31 * GET_MODE_SIZE (mode)); + case NEG: + if (GET_MODE_CLASS (mode) == MODE_FLOAT) + { + if (TARGET_HARD_FLOAT + && (mode == SFmode + || (mode == DFmode && !TARGET_VFP_SINGLE))) + { + *total = COSTS_N_INSNS (1); + return false; + } + *total = COSTS_N_INSNS (2); + return false; + } - xop0 = force_operand (plus_constant (xop0, offset - delta), - NULL_RTX); - x = plus_constant (xop0, delta); + /* Fall through */ + case NOT: + *total = COSTS_N_INSNS (ARM_NUM_REGS(mode)); + if (mode == SImode && code == NOT) + { + subcode = GET_CODE (XEXP (x, 0)); + if (subcode == ASHIFT || subcode == ASHIFTRT + || subcode == LSHIFTRT + || subcode == ROTATE || subcode == ROTATERT + || (subcode == MULT + && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode))) + { + *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, speed); + /* Register shifts cost an extra cycle. */ + if (GET_CODE (XEXP (XEXP (x, 0), 1)) != CONST_INT) + *total += COSTS_N_INSNS (1) + rtx_cost (XEXP (XEXP (x, 0), 1), + subcode, speed); + return true; + } } - else if (offset < 0 && offset > -256) - /* Small negative offsets are best done with a subtract before the - dereference, forcing these into a register normally takes two - instructions. */ - x = force_operand (x, NULL_RTX); - else + + return false; + + case IF_THEN_ELSE: + if (GET_CODE (XEXP (x, 1)) == PC || GET_CODE (XEXP (x, 2)) == PC) { - /* For the remaining cases, force the constant into a register. */ - xop1 = force_reg (SImode, xop1); - x = gen_rtx_PLUS (SImode, xop0, xop1); + *total = COSTS_N_INSNS (4); + return true; } - } - else if (GET_CODE (x) == PLUS - && s_register_operand (XEXP (x, 1), SImode) - && !s_register_operand (XEXP (x, 0), SImode)) - { - rtx xop0 = force_operand (XEXP (x, 0), NULL_RTX); - x = gen_rtx_PLUS (SImode, xop0, XEXP (x, 1)); - } + operand = XEXP (x, 0); - if (flag_pic) - { - /* We need to find and carefully transform any SYMBOL and LABEL - references; so go back to the original address expression. 
*/ - rtx new_x = legitimize_pic_address (orig_x, mode, NULL_RTX); + if (!((GET_RTX_CLASS (GET_CODE (operand)) == RTX_COMPARE + || GET_RTX_CLASS (GET_CODE (operand)) == RTX_COMM_COMPARE) + && GET_CODE (XEXP (operand, 0)) == REG + && REGNO (XEXP (operand, 0)) == CC_REGNUM)) + *total += COSTS_N_INSNS (1); + *total += (rtx_cost (XEXP (x, 1), code, speed) + + rtx_cost (XEXP (x, 2), code, speed)); + return true; - if (new_x != orig_x) - x = new_x; - } + case NE: + if (mode == SImode && XEXP (x, 1) == const0_rtx) + { + *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, speed); + return true; + } + goto scc_insn; - return x; -} + case GE: + if ((GET_CODE (XEXP (x, 0)) != REG || REGNO (XEXP (x, 0)) != CC_REGNUM) + && mode == SImode && XEXP (x, 1) == const0_rtx) + { + *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, speed); + return true; + } + goto scc_insn; -rtx -thumb_legitimize_reload_address (rtx *x_p, - enum machine_mode mode, - int opnum, int type, - int ind_levels ATTRIBUTE_UNUSED) -{ - rtx x = *x_p; + case LT: + if ((GET_CODE (XEXP (x, 0)) != REG || REGNO (XEXP (x, 0)) != CC_REGNUM) + && mode == SImode && XEXP (x, 1) == const0_rtx) + { + *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, speed); + return true; + } + goto scc_insn; - if (GET_CODE (x) == PLUS - && GET_MODE_SIZE (mode) < 4 - && REG_P (XEXP (x, 0)) - && XEXP (x, 0) == stack_pointer_rtx - && GET_CODE (XEXP (x, 1)) == CONST_INT - && !thumb_legitimate_offset_p (mode, INTVAL (XEXP (x, 1)))) - { - rtx orig_x = x; + case EQ: + case GT: + case LE: + case GEU: + case LTU: + case GTU: + case LEU: + case UNORDERED: + case ORDERED: + case UNEQ: + case UNGE: + case UNLT: + case UNGT: + case UNLE: + scc_insn: + /* SCC insns. In the case where the comparison has already been + performed, then they cost 2 instructions. Otherwise they need + an additional comparison before them. */ + *total = COSTS_N_INSNS (2); + if (GET_CODE (XEXP (x, 0)) == REG && REGNO (XEXP (x, 0)) == CC_REGNUM) + { + return true; + } - x = copy_rtx (x); - push_reload (orig_x, NULL_RTX, x_p, NULL, MODE_BASE_REG_CLASS (mode), - Pmode, VOIDmode, 0, 0, opnum, type); - return x; - } + /* Fall through */ + case COMPARE: + if (GET_CODE (XEXP (x, 0)) == REG && REGNO (XEXP (x, 0)) == CC_REGNUM) + { + *total = 0; + return true; + } - /* If both registers are hi-regs, then it's better to reload the - entire expression rather than each register individually. That - only requires one reload register rather than two. 
*/ - if (GET_CODE (x) == PLUS - && REG_P (XEXP (x, 0)) - && REG_P (XEXP (x, 1)) - && !REG_MODE_OK_FOR_REG_BASE_P (XEXP (x, 0), mode) - && !REG_MODE_OK_FOR_REG_BASE_P (XEXP (x, 1), mode)) - { - rtx orig_x = x; + *total += COSTS_N_INSNS (1); + if (GET_CODE (XEXP (x, 1)) == CONST_INT + && const_ok_for_op (INTVAL (XEXP (x, 1)), code)) + { + *total += rtx_cost (XEXP (x, 0), code, speed); + return true; + } - x = copy_rtx (x); - push_reload (orig_x, NULL_RTX, x_p, NULL, MODE_BASE_REG_CLASS (mode), - Pmode, VOIDmode, 0, 0, opnum, type); - return x; - } + subcode = GET_CODE (XEXP (x, 0)); + if (subcode == ASHIFT || subcode == ASHIFTRT + || subcode == LSHIFTRT + || subcode == ROTATE || subcode == ROTATERT) + { + *total += rtx_cost (XEXP (x, 1), code, speed); + *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, speed); + return true; + } - return NULL; -} + if (subcode == MULT + && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode)) + { + *total += rtx_cost (XEXP (x, 1), code, speed); + *total += rtx_cost (XEXP (XEXP (x, 0), 0), subcode, speed); + return true; + } + + return false; -/* Test for various thread-local symbols. */ + case UMIN: + case UMAX: + case SMIN: + case SMAX: + *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, speed); + if (GET_CODE (XEXP (x, 1)) != CONST_INT + || !const_ok_for_arm (INTVAL (XEXP (x, 1)))) + *total += rtx_cost (XEXP (x, 1), code, speed); + return true; -/* Return TRUE if X is a thread-local symbol. */ + case ABS: + if (GET_MODE_CLASS (mode) == MODE_FLOAT) + { + if (TARGET_HARD_FLOAT + && (mode == SFmode + || (mode == DFmode && !TARGET_VFP_SINGLE))) + { + *total = COSTS_N_INSNS (1); + return false; + } + *total = COSTS_N_INSNS (20); + return false; + } + *total = COSTS_N_INSNS (1); + if (mode == DImode) + *total += COSTS_N_INSNS (3); + return false; -static bool -arm_tls_symbol_p (rtx x) -{ - if (! TARGET_HAVE_TLS) - return false; + case SIGN_EXTEND: + case ZERO_EXTEND: + *total = 0; + if (GET_MODE_CLASS (mode) == MODE_INT) + { + rtx op = XEXP (x, 0); + enum machine_mode opmode = GET_MODE (op); - if (GET_CODE (x) != SYMBOL_REF) - return false; + if (mode == DImode) + *total += COSTS_N_INSNS (1); - return SYMBOL_REF_TLS_MODEL (x) != 0; -} + if (opmode != SImode) + { + if (MEM_P (op)) + { + /* If !arm_arch4, we use one of the extendhisi2_mem + or movhi_bytes patterns for HImode. For a QImode + sign extension, we first zero-extend from memory + and then perform a shift sequence. */ + if (!arm_arch4 && (opmode != QImode || code == SIGN_EXTEND)) + *total += COSTS_N_INSNS (2); + } + else if (arm_arch6) + *total += COSTS_N_INSNS (1); + + /* We don't have the necessary insn, so we need to perform some + other operation. */ + else if (TARGET_ARM && code == ZERO_EXTEND && mode == QImode) + /* An and with constant 255. */ + *total += COSTS_N_INSNS (1); + else + /* A shift sequence. Increase costs slightly to avoid + combining two shifts into an extend operation. */ + *total += COSTS_N_INSNS (2) + 1; + } -/* Helper for arm_tls_referenced_p. */ + return false; + } -static int -arm_tls_operand_p_1 (rtx *x, void *data ATTRIBUTE_UNUSED) -{ - if (GET_CODE (*x) == SYMBOL_REF) - return SYMBOL_REF_TLS_MODEL (*x) != 0; + switch (GET_MODE (XEXP (x, 0))) + { + case V8QImode: + case V4HImode: + case V2SImode: + case V4QImode: + case V2HImode: + *total = COSTS_N_INSNS (1); + return false; - /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are - TLS offsets, not real symbol references. 
*/ - if (GET_CODE (*x) == UNSPEC - && XINT (*x, 1) == UNSPEC_TLS) - return -1; + default: + gcc_unreachable (); + } + gcc_unreachable (); - return 0; -} + case ZERO_EXTRACT: + case SIGN_EXTRACT: + *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, speed); + return true; -/* Return TRUE if X contains any TLS symbol references. */ + case CONST_INT: + if (const_ok_for_arm (INTVAL (x)) + || const_ok_for_arm (~INTVAL (x))) + *total = COSTS_N_INSNS (1); + else + *total = COSTS_N_INSNS (arm_gen_constant (SET, mode, NULL_RTX, + INTVAL (x), NULL_RTX, + NULL_RTX, 0, 0)); + return true; -bool -arm_tls_referenced_p (rtx x) -{ - if (! TARGET_HAVE_TLS) - return false; + case CONST: + case LABEL_REF: + case SYMBOL_REF: + *total = COSTS_N_INSNS (3); + return true; - return for_each_rtx (&x, arm_tls_operand_p_1, NULL); -} + case HIGH: + *total = COSTS_N_INSNS (1); + return true; -/* Implement TARGET_CANNOT_FORCE_CONST_MEM. */ + case LO_SUM: + *total = COSTS_N_INSNS (1); + *total += rtx_cost (XEXP (x, 0), code, speed); + return true; -bool -arm_cannot_force_const_mem (rtx x) -{ - rtx base, offset; + case CONST_DOUBLE: + if (TARGET_HARD_FLOAT && vfp3_const_double_rtx (x) + && (mode == SFmode || !TARGET_VFP_SINGLE)) + *total = COSTS_N_INSNS (1); + else + *total = COSTS_N_INSNS (4); + return true; - if (ARM_OFFSETS_MUST_BE_WITHIN_SECTIONS_P) - { - split_const (x, &base, &offset); - if (GET_CODE (base) == SYMBOL_REF - && !offset_within_block_p (base, INTVAL (offset))) - return true; + default: + *total = COSTS_N_INSNS (4); + return false; } - return arm_tls_referenced_p (x); } - -#define REG_OR_SUBREG_REG(X) \ - (GET_CODE (X) == REG \ - || (GET_CODE (X) == SUBREG && GET_CODE (SUBREG_REG (X)) == REG)) -#define REG_OR_SUBREG_RTX(X) \ - (GET_CODE (X) == REG ? (X) : SUBREG_REG (X)) - -#ifndef COSTS_N_INSNS -#define COSTS_N_INSNS(N) ((N) * 4 - 2) -#endif +/* Estimates the size cost of thumb1 instructions. + For now most of the code is copied from thumb1_rtx_costs. We need more + fine grain tuning when we have more related test cases. */ static inline int -thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer) +thumb1_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer) { enum machine_mode mode = GET_MODE (x); @@ -4725,42 +7084,50 @@ thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer) case MULT: if (GET_CODE (XEXP (x, 1)) == CONST_INT) - { - int cycles = 0; - unsigned HOST_WIDE_INT i = INTVAL (XEXP (x, 1)); - - while (i) - { - i >>= 2; - cycles++; - } - return COSTS_N_INSNS (2) + cycles; - } - return COSTS_N_INSNS (1) + 16; + { + /* Thumb1 mul instruction can't operate on const. We must Load it + into a register first. */ + int const_size = thumb1_size_rtx_costs (XEXP (x, 1), CONST_INT, SET); + return COSTS_N_INSNS (1) + const_size; + } + return COSTS_N_INSNS (1); case SET: return (COSTS_N_INSNS (1) - + 4 * ((GET_CODE (SET_SRC (x)) == MEM) - + GET_CODE (SET_DEST (x)) == MEM)); - - case CONST_INT: - if (outer == SET) - { - if ((unsigned HOST_WIDE_INT) INTVAL (x) < 256) - return 0; - if (thumb_shiftable_const (INTVAL (x))) - return COSTS_N_INSNS (2); - return COSTS_N_INSNS (3); - } + + 4 * ((GET_CODE (SET_SRC (x)) == MEM) + + GET_CODE (SET_DEST (x)) == MEM)); + + case CONST_INT: + if (outer == SET) + { + if ((unsigned HOST_WIDE_INT) INTVAL (x) < 256) + return COSTS_N_INSNS (1); + /* See split "TARGET_THUMB1 && satisfies_constraint_J". */ + if (INTVAL (x) >= -255 && INTVAL (x) <= -1) + return COSTS_N_INSNS (2); + /* See split "TARGET_THUMB1 && satisfies_constraint_K". 
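The CONST_INT size costs here correspond roughly to concrete Thumb-1 sequences: a single movs for 0-255, movs plus a negate for -255..-1 (the "J" split), movs plus a shift for an 8-bit value moved left (the "K" split), and three instructions otherwise. A standalone sketch; shiftable_8bit_p is my approximation of thumb_shiftable_const, and the instruction names in the comments are indicative only.

#include <stdint.h>

static int
shiftable_8bit_p (uint32_t v)
{
  for (int shift = 0; shift < 32; shift++)
    if (v != 0 && (v & ~(0xffu << shift)) == 0)
      return 1;
  return 0;
}

/* Estimated number of Thumb-1 instructions needed to load VAL.  */
static int
thumb1_const_insns (int32_t val)
{
  if (val >= 0 && val < 256)
    return 1;                            /* movs rd, #imm8              */
  if (val >= -255 && val <= -1)
    return 2;                            /* movs + negate ("J" split)   */
  if (shiftable_8bit_p ((uint32_t) val))
    return 2;                            /* movs + shift ("K" split)    */
  return 3;                              /* longer sequence / pool load */
}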
*/ + if (thumb_shiftable_const (INTVAL (x))) + return COSTS_N_INSNS (2); + return COSTS_N_INSNS (3); + } else if ((outer == PLUS || outer == COMPARE) - && INTVAL (x) < 256 && INTVAL (x) > -256) - return 0; - else if (outer == AND - && INTVAL (x) < 256 && INTVAL (x) >= -256) - return COSTS_N_INSNS (1); + && INTVAL (x) < 256 && INTVAL (x) > -256) + return 0; + else if ((outer == IOR || outer == XOR || outer == AND) + && INTVAL (x) < 256 && INTVAL (x) >= -256) + return COSTS_N_INSNS (1); + else if (outer == AND) + { + int i; + /* This duplicates the tests in the andsi3 expander. */ + for (i = 9; i <= 31; i++) + if ((((HOST_WIDE_INT) 1) << i) - 1 == INTVAL (x) + || (((HOST_WIDE_INT) 1) << i) - 1 == ~INTVAL (x)) + return COSTS_N_INSNS (2); + } else if (outer == ASHIFT || outer == ASHIFTRT - || outer == LSHIFTRT) - return 0; + || outer == LSHIFTRT) + return 0; return COSTS_N_INSNS (2); case CONST: @@ -4787,300 +7154,35 @@ thumb1_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer) case MEM: /* XXX another guess. */ /* Memory costs quite a lot for the first word, but subsequent words - load at the equivalent of a single insn each. */ + load at the equivalent of a single insn each. */ return (10 + 4 * ((GET_MODE_SIZE (mode) - 1) / UNITS_PER_WORD) - + ((GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)) - ? 4 : 0)); + + ((GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)) + ? 4 : 0)); case IF_THEN_ELSE: /* XXX a guess. */ if (GET_CODE (XEXP (x, 1)) == PC || GET_CODE (XEXP (x, 2)) == PC) - return 14; + return 14; return 2; case ZERO_EXTEND: /* XXX still guessing. */ switch (GET_MODE (XEXP (x, 0))) - { - case QImode: - return (1 + (mode == DImode ? 4 : 0) - + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0)); - - case HImode: - return (4 + (mode == DImode ? 4 : 0) - + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0)); - - case SImode: - return (1 + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0)); - - default: - return 99; - } - - default: - return 99; - } -} - - -/* Worker routine for arm_rtx_costs. */ -/* ??? This needs updating for thumb2. */ -static inline int -arm_rtx_costs_1 (rtx x, enum rtx_code code, enum rtx_code outer) -{ - enum machine_mode mode = GET_MODE (x); - enum rtx_code subcode; - int extra_cost; - - switch (code) - { - case MEM: - /* Memory costs quite a lot for the first word, but subsequent words - load at the equivalent of a single insn each. */ - return (10 + 4 * ((GET_MODE_SIZE (mode) - 1) / UNITS_PER_WORD) - + (GET_CODE (x) == SYMBOL_REF - && CONSTANT_POOL_ADDRESS_P (x) ? 4 : 0)); - - case DIV: - case MOD: - case UDIV: - case UMOD: - return optimize_size ? COSTS_N_INSNS (2) : 100; - - case ROTATE: - if (mode == SImode && GET_CODE (XEXP (x, 1)) == REG) - return 4; - /* Fall through */ - case ROTATERT: - if (mode != SImode) - return 8; - /* Fall through */ - case ASHIFT: case LSHIFTRT: case ASHIFTRT: - if (mode == DImode) - return (8 + (GET_CODE (XEXP (x, 1)) == CONST_INT ? 0 : 8) - + ((GET_CODE (XEXP (x, 0)) == REG - || (GET_CODE (XEXP (x, 0)) == SUBREG - && GET_CODE (SUBREG_REG (XEXP (x, 0))) == REG)) - ? 0 : 8)); - return (1 + ((GET_CODE (XEXP (x, 0)) == REG - || (GET_CODE (XEXP (x, 0)) == SUBREG - && GET_CODE (SUBREG_REG (XEXP (x, 0))) == REG)) - ? 0 : 4) - + ((GET_CODE (XEXP (x, 1)) == REG - || (GET_CODE (XEXP (x, 1)) == SUBREG - && GET_CODE (SUBREG_REG (XEXP (x, 1))) == REG) - || (GET_CODE (XEXP (x, 1)) == CONST_INT)) - ? 
0 : 4)); - - case MINUS: - if (GET_CODE (XEXP (x, 1)) == MULT && mode == SImode && arm_arch_thumb2) - { - extra_cost = rtx_cost (XEXP (x, 1), code); - if (!REG_OR_SUBREG_REG (XEXP (x, 0))) - extra_cost += 4 * ARM_NUM_REGS (mode); - return extra_cost; - } - - if (mode == DImode) - return (4 + (REG_OR_SUBREG_REG (XEXP (x, 1)) ? 0 : 8) - + ((REG_OR_SUBREG_REG (XEXP (x, 0)) - || (GET_CODE (XEXP (x, 0)) == CONST_INT - && const_ok_for_arm (INTVAL (XEXP (x, 0))))) - ? 0 : 8)); - - if (GET_MODE_CLASS (mode) == MODE_FLOAT) - return (2 + ((REG_OR_SUBREG_REG (XEXP (x, 1)) - || (GET_CODE (XEXP (x, 1)) == CONST_DOUBLE - && arm_const_double_rtx (XEXP (x, 1)))) - ? 0 : 8) - + ((REG_OR_SUBREG_REG (XEXP (x, 0)) - || (GET_CODE (XEXP (x, 0)) == CONST_DOUBLE - && arm_const_double_rtx (XEXP (x, 0)))) - ? 0 : 8)); - - if (((GET_CODE (XEXP (x, 0)) == CONST_INT - && const_ok_for_arm (INTVAL (XEXP (x, 0))) - && REG_OR_SUBREG_REG (XEXP (x, 1)))) - || (((subcode = GET_CODE (XEXP (x, 1))) == ASHIFT - || subcode == ASHIFTRT || subcode == LSHIFTRT - || subcode == ROTATE || subcode == ROTATERT - || (subcode == MULT - && GET_CODE (XEXP (XEXP (x, 1), 1)) == CONST_INT - && ((INTVAL (XEXP (XEXP (x, 1), 1)) & - (INTVAL (XEXP (XEXP (x, 1), 1)) - 1)) == 0))) - && REG_OR_SUBREG_REG (XEXP (XEXP (x, 1), 0)) - && (REG_OR_SUBREG_REG (XEXP (XEXP (x, 1), 1)) - || GET_CODE (XEXP (XEXP (x, 1), 1)) == CONST_INT) - && REG_OR_SUBREG_REG (XEXP (x, 0)))) - return 1; - /* Fall through */ - - case PLUS: - if (arm_arch6 && mode == SImode - && (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND - || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)) - return 1 + (GET_CODE (XEXP (XEXP (x, 0), 0)) == MEM ? 10 : 0) - + (GET_CODE (XEXP (x, 1)) == MEM ? 10 : 0); - - if (GET_CODE (XEXP (x, 0)) == MULT) - { - extra_cost = rtx_cost (XEXP (x, 0), code); - if (!REG_OR_SUBREG_REG (XEXP (x, 1))) - extra_cost += 4 * ARM_NUM_REGS (mode); - return extra_cost; - } - - if (GET_MODE_CLASS (mode) == MODE_FLOAT) - return (2 + (REG_OR_SUBREG_REG (XEXP (x, 0)) ? 0 : 8) - + ((REG_OR_SUBREG_REG (XEXP (x, 1)) - || (GET_CODE (XEXP (x, 1)) == CONST_DOUBLE - && arm_const_double_rtx (XEXP (x, 1)))) - ? 0 : 8)); - - /* Fall through */ - case AND: case XOR: case IOR: - extra_cost = 0; - - /* Normally the frame registers will be spilt into reg+const during - reload, so it is a bad idea to combine them with other instructions, - since then they might not be moved outside of loops. As a compromise - we allow integration with ops that have a constant as their second - operand. */ - if ((REG_OR_SUBREG_REG (XEXP (x, 0)) - && ARM_FRAME_RTX (REG_OR_SUBREG_RTX (XEXP (x, 0))) - && GET_CODE (XEXP (x, 1)) != CONST_INT) - || (REG_OR_SUBREG_REG (XEXP (x, 0)) - && ARM_FRAME_RTX (REG_OR_SUBREG_RTX (XEXP (x, 0))))) - extra_cost = 4; - - if (mode == DImode) - return (4 + extra_cost + (REG_OR_SUBREG_REG (XEXP (x, 0)) ? 0 : 8) - + ((REG_OR_SUBREG_REG (XEXP (x, 1)) - || (GET_CODE (XEXP (x, 1)) == CONST_INT - && const_ok_for_op (INTVAL (XEXP (x, 1)), code))) - ? 0 : 8)); - - if (REG_OR_SUBREG_REG (XEXP (x, 0))) - return (1 + (GET_CODE (XEXP (x, 1)) == CONST_INT ? 0 : extra_cost) - + ((REG_OR_SUBREG_REG (XEXP (x, 1)) - || (GET_CODE (XEXP (x, 1)) == CONST_INT - && const_ok_for_op (INTVAL (XEXP (x, 1)), code))) - ? 
0 : 4)); - - else if (REG_OR_SUBREG_REG (XEXP (x, 1))) - return (1 + extra_cost - + ((((subcode = GET_CODE (XEXP (x, 0))) == ASHIFT - || subcode == LSHIFTRT || subcode == ASHIFTRT - || subcode == ROTATE || subcode == ROTATERT - || (subcode == MULT - && GET_CODE (XEXP (XEXP (x, 0), 1)) == CONST_INT - && ((INTVAL (XEXP (XEXP (x, 0), 1)) & - (INTVAL (XEXP (XEXP (x, 0), 1)) - 1)) == 0))) - && (REG_OR_SUBREG_REG (XEXP (XEXP (x, 0), 0))) - && ((REG_OR_SUBREG_REG (XEXP (XEXP (x, 0), 1))) - || GET_CODE (XEXP (XEXP (x, 0), 1)) == CONST_INT)) - ? 0 : 4)); - - return 8; - - case MULT: - /* This should have been handled by the CPU specific routines. */ - gcc_unreachable (); - - case TRUNCATE: - if (arm_arch3m && mode == SImode - && GET_CODE (XEXP (x, 0)) == LSHIFTRT - && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT - && (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) - == GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1))) - && (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND - || GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND)) - return 8; - return 99; - - case NEG: - if (GET_MODE_CLASS (mode) == MODE_FLOAT) - return 4 + (REG_OR_SUBREG_REG (XEXP (x, 0)) ? 0 : 6); - /* Fall through */ - case NOT: - if (mode == DImode) - return 4 + (REG_OR_SUBREG_REG (XEXP (x, 0)) ? 0 : 4); - - return 1 + (REG_OR_SUBREG_REG (XEXP (x, 0)) ? 0 : 4); - - case IF_THEN_ELSE: - if (GET_CODE (XEXP (x, 1)) == PC || GET_CODE (XEXP (x, 2)) == PC) - return 14; - return 2; - - case COMPARE: - return 1; - - case ABS: - return 4 + (mode == DImode ? 4 : 0); - - case SIGN_EXTEND: - if (arm_arch_thumb2 && mode == SImode) - return 1 + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0); - - if (GET_MODE (XEXP (x, 0)) == QImode) - return (4 + (mode == DImode ? 4 : 0) - + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0)); - /* Fall through */ - case ZERO_EXTEND: - if (arm_arch6 && mode == SImode) - return 1 + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0); - - switch (GET_MODE (XEXP (x, 0))) - { - case QImode: - return (1 + (mode == DImode ? 4 : 0) - + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0)); - - case HImode: - return (4 + (mode == DImode ? 4 : 0) - + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0)); - - case SImode: - return (1 + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0)); - - case V8QImode: - case V4HImode: - case V2SImode: - case V4QImode: - case V2HImode: - return 1; - - default: - gcc_unreachable (); - } - gcc_unreachable (); + { + case QImode: + return (1 + (mode == DImode ? 4 : 0) + + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0)); - case CONST_INT: - if (const_ok_for_arm (INTVAL (x))) - return outer == SET ? 2 : -1; - else if (outer == AND - && const_ok_for_arm (~INTVAL (x))) - return -1; - else if ((outer == COMPARE - || outer == PLUS || outer == MINUS) - && const_ok_for_arm (-INTVAL (x))) - return -1; - else - return 5; + case HImode: + return (4 + (mode == DImode ? 4 : 0) + + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0)); - case CONST: - case LABEL_REF: - case SYMBOL_REF: - return 6; + case SImode: + return (1 + (GET_CODE (XEXP (x, 0)) == MEM ? 10 : 0)); - case CONST_DOUBLE: - if (arm_const_double_rtx (x) || vfp3_const_double_rtx (x)) - return outer == SET ? 2 : -1; - else if ((outer == COMPARE || outer == PLUS) - && neg_const_double_rtx_ok_for_fpa (x)) - return -1; - return 7; + default: + return 99; + } default: return 99; @@ -5089,14 +7191,13 @@ arm_rtx_costs_1 (rtx x, enum rtx_code code, enum rtx_code outer) /* RTX costs when optimizing for size. 
*/ static bool -arm_size_rtx_costs (rtx x, int code, int outer_code, int *total) +arm_size_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, + int *total) { enum machine_mode mode = GET_MODE (x); - if (TARGET_THUMB1) { - /* XXX TBD. For now, use the standard costs. */ - *total = thumb1_rtx_costs (x, code, outer_code); + *total = thumb1_size_rtx_costs (x, code, outer_code); return true; } @@ -5108,6 +7209,12 @@ arm_size_rtx_costs (rtx x, int code, int outer_code, int *total) a single register, otherwise it costs one insn per word. */ if (REG_P (XEXP (x, 0))) *total = COSTS_N_INSNS (1); + else if (flag_pic + && GET_CODE (XEXP (x, 0)) == PLUS + && will_be_in_index_register (XEXP (XEXP (x, 0), 1))) + /* This will be split into two instructions. + See arm.md:calculate_pic_address. */ + *total = COSTS_N_INSNS (2); else *total = COSTS_N_INSNS (ARM_NUM_REGS (mode)); return true; @@ -5123,7 +7230,7 @@ arm_size_rtx_costs (rtx x, int code, int outer_code, int *total) case ROTATE: if (mode == SImode && GET_CODE (XEXP (x, 1)) == REG) { - *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code); + *total = COSTS_N_INSNS (2) + rtx_cost (XEXP (x, 0), code, false); return true; } /* Fall through */ @@ -5133,15 +7240,15 @@ arm_size_rtx_costs (rtx x, int code, int outer_code, int *total) case ASHIFTRT: if (mode == DImode && GET_CODE (XEXP (x, 1)) == CONST_INT) { - *total = COSTS_N_INSNS (3) + rtx_cost (XEXP (x, 0), code); + *total = COSTS_N_INSNS (3) + rtx_cost (XEXP (x, 0), code, false); return true; } else if (mode == SImode) { - *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code); + *total = COSTS_N_INSNS (1) + rtx_cost (XEXP (x, 0), code, false); /* Slightly disparage register shifts, but not by much. */ if (GET_CODE (XEXP (x, 1)) != CONST_INT) - *total += 1 + rtx_cost (XEXP (x, 1), code); + *total += 1 + rtx_cost (XEXP (x, 1), code, false); return true; } @@ -5150,7 +7257,8 @@ arm_size_rtx_costs (rtx x, int code, int outer_code, int *total) return false; case MINUS: - if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT) + if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT + && (mode == SFmode || !TARGET_VFP_SINGLE)) { *total = COSTS_N_INSNS (1); return false; @@ -5180,12 +7288,23 @@ arm_size_rtx_costs (rtx x, int code, int outer_code, int *total) return false; case PLUS: - if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT) + if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT + && (mode == SFmode || !TARGET_VFP_SINGLE)) { *total = COSTS_N_INSNS (1); return false; } + /* A shift as a part of ADD costs nothing. */ + if (GET_CODE (XEXP (x, 0)) == MULT + && power_of_two_operand (XEXP (XEXP (x, 0), 1), SImode)) + { + *total = COSTS_N_INSNS (TARGET_THUMB2 ? 
2 : 1); + *total += rtx_cost (XEXP (XEXP (x, 0), 0), code, false); + *total += rtx_cost (XEXP (x, 1), code, false); + return true; + } + /* Fall through */ case AND: case XOR: case IOR: if (mode == SImode) @@ -5210,8 +7329,13 @@ arm_size_rtx_costs (rtx x, int code, int outer_code, int *total) return false; case NEG: - if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT) - *total = COSTS_N_INSNS (1); + if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT + && (mode == SFmode || !TARGET_VFP_SINGLE)) + { + *total = COSTS_N_INSNS (1); + return false; + } + /* Fall through */ case NOT: *total = COSTS_N_INSNS (ARM_NUM_REGS (mode)); @@ -5230,52 +7354,23 @@ arm_size_rtx_costs (rtx x, int code, int outer_code, int *total) return false; case ABS: - if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT) + if (TARGET_HARD_FLOAT && GET_MODE_CLASS (mode) == MODE_FLOAT + && (mode == SFmode || !TARGET_VFP_SINGLE)) *total = COSTS_N_INSNS (1); else *total = COSTS_N_INSNS (1 + ARM_NUM_REGS (mode)); return false; case SIGN_EXTEND: - *total = 0; - if (GET_MODE_SIZE (GET_MODE (XEXP (x, 0))) < 4) - { - if (!(arm_arch4 && MEM_P (XEXP (x, 0)))) - *total += COSTS_N_INSNS (arm_arch6 ? 1 : 2); - } - if (mode == DImode) - *total += COSTS_N_INSNS (1); - return false; - case ZERO_EXTEND: - *total = 0; - if (!(arm_arch4 && MEM_P (XEXP (x, 0)))) - { - switch (GET_MODE (XEXP (x, 0))) - { - case QImode: - *total += COSTS_N_INSNS (1); - break; - - case HImode: - *total += COSTS_N_INSNS (arm_arch6 ? 1 : 2); - - case SImode: - break; - - default: - *total += COSTS_N_INSNS (2); - } - } - - if (mode == DImode) - *total += COSTS_N_INSNS (1); - - return false; + return arm_rtx_costs_1 (x, outer_code, total, 0); case CONST_INT: if (const_ok_for_arm (INTVAL (x))) - *total = COSTS_N_INSNS (outer_code == SET ? 1 : 0); + /* A multiplication by a constant requires another instruction + to load the constant to a register. */ + *total = COSTS_N_INSNS ((outer_code == SET || outer_code == MULT) + ? 1 : 0); else if (const_ok_for_arm (~INTVAL (x))) *total = COSTS_N_INSNS (outer_code == AND ? 0 : 1); else if (const_ok_for_arm (-INTVAL (x))) @@ -5300,6 +7395,13 @@ arm_size_rtx_costs (rtx x, int code, int outer_code, int *total) *total = COSTS_N_INSNS (4); return true; + case HIGH: + case LO_SUM: + /* We prefer constant pool entries to MOVW/MOVT pairs, so bump the + cost of these slightly. */ + *total = COSTS_N_INSNS (1) + 1; + return true; + default: if (mode != VOIDmode) *total = COSTS_N_INSNS (ARM_NUM_REGS (mode)); @@ -5309,11 +7411,26 @@ arm_size_rtx_costs (rtx x, int code, int outer_code, int *total) } } +/* RTX costs when optimizing for size. */ +static bool +arm_rtx_costs (rtx x, int code, int outer_code, int *total, + bool speed) +{ + if (!speed) + return arm_size_rtx_costs (x, (enum rtx_code) code, + (enum rtx_code) outer_code, total); + else + return current_tune->rtx_costs (x, (enum rtx_code) code, + (enum rtx_code) outer_code, + total, speed); +} + /* RTX costs for cores with a slow MUL implementation. Thumb-2 is not supported on any "slowmul" cores, so it can be ignored. 
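The new arm_rtx_costs wrapper above is a plain dispatcher: !speed routes to the size model, anything else to the per-core hook stored in current_tune (its header comment still says "when optimizing for size", but the body handles both cases). A sketch of that pattern with stand-in types and placeholder cost values, not the real GCC interfaces:

#include <stdbool.h>

struct expr;                                   /* stand-in for rtx */

typedef bool (*cost_fn) (const struct expr *, int *, bool);

struct tune_descr
{
  const char *name;
  cost_fn rtx_costs;                           /* per-core speed model */
};

static bool
size_costs (const struct expr *x, int *total, bool speed)
{
  (void) x; (void) speed;
  *total = 4;                                  /* placeholder */
  return true;
}

static bool
generic_speed_costs (const struct expr *x, int *total, bool speed)
{
  (void) x; (void) speed;
  *total = 8;                                  /* placeholder */
  return true;
}

static const struct tune_descr generic_tune = { "generic", generic_speed_costs };

/* -Os takes the size model; otherwise defer to the tuning descriptor.  */
static bool
target_rtx_costs (const struct expr *x, int *total, bool speed,
                  const struct tune_descr *tune)
{
  return speed ? tune->rtx_costs (x, total, speed)
               : size_costs (x, total, speed);
}

A caller would pass &generic_tune (or a core-specific descriptor) as the last argument, which is roughly the role current_tune plays in the patch.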
*/ static bool -arm_slowmul_rtx_costs (rtx x, int code, int outer_code, int *total) +arm_slowmul_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, + int *total, bool speed) { enum machine_mode mode = GET_MODE (x); @@ -5329,8 +7446,8 @@ arm_slowmul_rtx_costs (rtx x, int code, int outer_code, int *total) if (GET_MODE_CLASS (mode) == MODE_FLOAT || mode == DImode) { - *total = 30; - return true; + *total = COSTS_N_INSNS (20); + return false; } if (GET_CODE (XEXP (x, 1)) == CONST_INT) @@ -5346,20 +7463,19 @@ arm_slowmul_rtx_costs (rtx x, int code, int outer_code, int *total) for (j = 0; i && j < 32; j += booth_unit_size) { i >>= booth_unit_size; - cost += 2; + cost++; } - *total = cost; + *total = COSTS_N_INSNS (cost); + *total += rtx_cost (XEXP (x, 0), code, speed); return true; } - *total = 30 + (REG_OR_SUBREG_REG (XEXP (x, 0)) ? 0 : 4) - + (REG_OR_SUBREG_REG (XEXP (x, 1)) ? 0 : 4); - return true; + *total = COSTS_N_INSNS (20); + return false; default: - *total = arm_rtx_costs_1 (x, code, outer_code); - return true; + return arm_rtx_costs_1 (x, outer_code, total, speed);; } } @@ -5367,7 +7483,8 @@ arm_slowmul_rtx_costs (rtx x, int code, int outer_code, int *total) /* RTX cost for cores with a fast multiply unit (M variants). */ static bool -arm_fastmul_rtx_costs (rtx x, int code, int outer_code, int *total) +arm_fastmul_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, + int *total, bool speed) { enum machine_mode mode = GET_MODE (x); @@ -5388,16 +7505,15 @@ arm_fastmul_rtx_costs (rtx x, int code, int outer_code, int *total) && (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)) { - *total = 8; - return true; + *total = COSTS_N_INSNS(2); + return false; } - if (GET_MODE_CLASS (mode) == MODE_FLOAT - || mode == DImode) + if (mode == DImode) { - *total = 30; - return true; + *total = COSTS_N_INSNS (5); + return false; } if (GET_CODE (XEXP (x, 1)) == CONST_INT) @@ -5413,20 +7529,36 @@ arm_fastmul_rtx_costs (rtx x, int code, int outer_code, int *total) for (j = 0; i && j < 32; j += booth_unit_size) { i >>= booth_unit_size; - cost += 2; + cost++; } - *total = cost; - return true; + *total = COSTS_N_INSNS(cost); + return false; } - *total = 8 + (REG_OR_SUBREG_REG (XEXP (x, 0)) ? 0 : 4) - + (REG_OR_SUBREG_REG (XEXP (x, 1)) ? 0 : 4); - return true; + if (mode == SImode) + { + *total = COSTS_N_INSNS (4); + return false; + } + + if (GET_MODE_CLASS (mode) == MODE_FLOAT) + { + if (TARGET_HARD_FLOAT + && (mode == SFmode + || (mode == DFmode && !TARGET_VFP_SINGLE))) + { + *total = COSTS_N_INSNS (1); + return false; + } + } + + /* Requires a lib call */ + *total = COSTS_N_INSNS (20); + return false; default: - *total = arm_rtx_costs_1 (x, code, outer_code); - return true; + return arm_rtx_costs_1 (x, outer_code, total, speed); } } @@ -5435,7 +7567,8 @@ arm_fastmul_rtx_costs (rtx x, int code, int outer_code, int *total) so it can be ignored. */ static bool -arm_xscale_rtx_costs (rtx x, int code, int outer_code, int *total) +arm_xscale_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, + int *total, bool speed) { enum machine_mode mode = GET_MODE (x); @@ -5447,6 +7580,15 @@ arm_xscale_rtx_costs (rtx x, int code, int outer_code, int *total) switch (code) { + case COMPARE: + if (GET_CODE (XEXP (x, 0)) != MULT) + return arm_rtx_costs_1 (x, outer_code, total, speed); + + /* A COMPARE of a MULT is slow on XScale; the muls instruction + will stall until the multiplication is complete. 
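
Both arm_slowmul_rtx_costs and arm_fastmul_rtx_costs above estimate a constant multiply by stepping over the multiplier in booth_unit_size-bit chunks, now counting one unit per step. The standalone loop below reproduces just that counting; the unit sizes (2 for the slow multiplier, 8 for the fast one) and the normalisation done before the loop are not visible in this hunk, so they are assumptions here.

#include <stdio.h>

/* Model of the Booth-unit loop in the cost hooks above: each iteration
   retires BOOTH_UNIT_SIZE bits of the constant multiplier, so the
   estimated step count tracks the position of the highest set bit.  */
static int
booth_steps (unsigned int multiplier, int booth_unit_size)
{
  int cost = 0;
  int j;

  for (j = 0; multiplier && j < 32; j += booth_unit_size)
    {
      multiplier >>= booth_unit_size;
      cost++;
    }
  return cost;
}

int
main (void)
{
  /* Assume 2 bits per step for a slow multiplier, 8 for a fast one.  */
  unsigned int samples[] = { 0x3, 0xff, 0x10000, 0xffffffffu };
  for (unsigned int k = 0; k < sizeof samples / sizeof samples[0]; k++)
    printf ("0x%08x: slowmul %2d steps, fastmul %d steps\n",
            samples[k], booth_steps (samples[k], 2),
            booth_steps (samples[k], 8));
  return 0;
}
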
*/ + *total = COSTS_N_INSNS (3); + return false; + case MULT: /* There is no point basing this on the tuning, since it is always the fast variant if it exists at all. */ @@ -5455,60 +7597,58 @@ arm_xscale_rtx_costs (rtx x, int code, int outer_code, int *total) && (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)) { - *total = 8; - return true; + *total = COSTS_N_INSNS (2); + return false; } - if (GET_MODE_CLASS (mode) == MODE_FLOAT - || mode == DImode) + if (mode == DImode) { - *total = 30; - return true; + *total = COSTS_N_INSNS (5); + return false; } if (GET_CODE (XEXP (x, 1)) == CONST_INT) { - unsigned HOST_WIDE_INT i = (INTVAL (XEXP (x, 1)) - & (unsigned HOST_WIDE_INT) 0xffffffff); - int cost, const_ok = const_ok_for_arm (i); + /* If operand 1 is a constant we can more accurately + calculate the cost of the multiply. The multiplier can + retire 15 bits on the first cycle and a further 12 on the + second. We do, of course, have to load the constant into + a register first. */ + unsigned HOST_WIDE_INT i = INTVAL (XEXP (x, 1)); + /* There's a general overhead of one cycle. */ + int cost = 1; unsigned HOST_WIDE_INT masked_const; - /* The cost will be related to two insns. - First a load of the constant (MOV or LDR), then a multiply. */ - cost = 2; - if (! const_ok) - cost += 1; /* LDR is probably more expensive because - of longer result latency. */ + if (i & 0x80000000) + i = ~i; + + i &= (unsigned HOST_WIDE_INT) 0xffffffff; + masked_const = i & 0xffff8000; - if (masked_const != 0 && masked_const != 0xffff8000) + if (masked_const != 0) { + cost++; masked_const = i & 0xf8000000; - if (masked_const == 0 || masked_const == 0xf8000000) - cost += 1; - else - cost += 2; + if (masked_const != 0) + cost++; } - *total = cost; - return true; + *total = COSTS_N_INSNS (cost); + return false; } - *total = 8 + (REG_OR_SUBREG_REG (XEXP (x, 0)) ? 0 : 4) - + (REG_OR_SUBREG_REG (XEXP (x, 1)) ? 0 : 4); - return true; - - case COMPARE: - /* A COMPARE of a MULT is slow on XScale; the muls instruction - will stall until the multiplication is complete. */ - if (GET_CODE (XEXP (x, 0)) == MULT) - *total = 4 + rtx_cost (XEXP (x, 0), code); - else - *total = arm_rtx_costs_1 (x, code, outer_code); - return true; + if (mode == SImode) + { + *total = COSTS_N_INSNS (3); + return false; + } + + /* Requires a lib call */ + *total = COSTS_N_INSNS (20); + return false; default: - *total = arm_rtx_costs_1 (x, code, outer_code); - return true; + return arm_rtx_costs_1 (x, outer_code, total, speed); } } @@ -5516,11 +7656,10 @@ arm_xscale_rtx_costs (rtx x, int code, int outer_code, int *total) /* RTX costs for 9e (and later) cores. 
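
The new XScale branch above prices a multiply by a constant from how much of the multiplier the hardware has to chew through: the comment says 15 bits retire in the first cycle and 12 more in the second, plus one cycle of general overhead, with "mostly ones" constants inverted first. A compilable restatement of just that arithmetic:

#include <stdio.h>

/* One base cycle; one more if any bit above bit 14 survives (the first
   15 bits were not enough); a third if any bit above bit 26 survives
   (15 + 12 bits were not enough).  Mirrors the masked_const logic in
   the hunk above.  */
static int
xscale_mul_cycles (unsigned int c)
{
  int cycles = 1;               /* General overhead.  */

  if (c & 0x80000000u)          /* Treat "mostly ones" like its inverse.  */
    c = ~c;

  if (c & 0xffff8000u)          /* Needs more than the first 15 bits.  */
    {
      cycles++;
      if (c & 0xf8000000u)      /* Needs more than 15 + 12 bits.  */
        cycles++;
    }
  return cycles;
}

int
main (void)
{
  unsigned int samples[] = { 10, 0x7fff, 0x8000, 0x12345678, 0xfffffff0u };
  for (int k = 0; k < 5; k++)
    printf ("x * 0x%08x: about %d cycle(s)\n",
            samples[k], xscale_mul_cycles (samples[k]));
  return 0;
}
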
*/ static bool -arm_9e_rtx_costs (rtx x, int code, int outer_code, int *total) +arm_9e_rtx_costs (rtx x, enum rtx_code code, enum rtx_code outer_code, + int *total, bool speed) { enum machine_mode mode = GET_MODE (x); - int nonreg_cost; - int cost; if (TARGET_THUMB1) { @@ -5546,35 +7685,39 @@ arm_9e_rtx_costs (rtx x, int code, int outer_code, int *total) && (GET_CODE (XEXP (x, 0)) == ZERO_EXTEND || GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)) { - *total = 3; - return true; + *total = COSTS_N_INSNS (2); + return false; } - if (GET_MODE_CLASS (mode) == MODE_FLOAT) - { - *total = 30; - return true; - } if (mode == DImode) { - cost = 7; - nonreg_cost = 8; + *total = COSTS_N_INSNS (5); + return false; } - else + + if (mode == SImode) { - cost = 2; - nonreg_cost = 4; + *total = COSTS_N_INSNS (2); + return false; } + if (GET_MODE_CLASS (mode) == MODE_FLOAT) + { + if (TARGET_HARD_FLOAT + && (mode == SFmode + || (mode == DFmode && !TARGET_VFP_SINGLE))) + { + *total = COSTS_N_INSNS (1); + return false; + } + } - *total = cost + (REG_OR_SUBREG_REG (XEXP (x, 0)) ? 0 : nonreg_cost) - + (REG_OR_SUBREG_REG (XEXP (x, 1)) ? 0 : nonreg_cost); - return true; + *total = COSTS_N_INSNS (20); + return false; default: - *total = arm_rtx_costs_1 (x, code, outer_code); - return true; + return arm_rtx_costs_1 (x, outer_code, total, speed); } } /* All address computations that can be done are free, but rtx cost returns @@ -5591,9 +7734,9 @@ arm_arm_address_cost (rtx x) if (c == MEM || c == LABEL_REF || c == SYMBOL_REF) return 10; - if (c == PLUS || c == MINUS) + if (c == PLUS) { - if (GET_CODE (XEXP (x, 0)) == CONST_INT) + if (GET_CODE (XEXP (x, 1)) == CONST_INT) return 2; if (ARITHMETIC_P (XEXP (x, 0)) || ARITHMETIC_P (XEXP (x, 1))) @@ -5621,20 +7764,18 @@ arm_thumb_address_cost (rtx x) } static int -arm_address_cost (rtx x) +arm_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED) { return TARGET_32BIT ? arm_arm_address_cost (x) : arm_thumb_address_cost (x); } -static int -arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost) +/* Adjust cost hook for XScale. */ +static bool +xscale_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost) { - rtx i_pat, d_pat; - /* Some true dependencies can have a higher cost depending on precisely how certain input operands are used. */ - if (arm_tune_xscale - && REG_NOTE_KIND (link) == 0 + if (REG_NOTE_KIND(link) == 0 && recog_memoized (insn) >= 0 && recog_memoized (dep) >= 0) { @@ -5668,10 +7809,116 @@ arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost) if (reg_overlap_mentioned_p (recog_data.operand[opno], shifted_operand)) - return 2; + { + *cost = 2; + return false; + } } } } + return true; +} + +/* Adjust cost hook for Cortex A9. */ +static bool +cortex_a9_sched_adjust_cost (rtx insn, rtx link, rtx dep, int * cost) +{ + switch (REG_NOTE_KIND (link)) + { + case REG_DEP_ANTI: + *cost = 0; + return false; + + case REG_DEP_TRUE: + case REG_DEP_OUTPUT: + if (recog_memoized (insn) >= 0 + && recog_memoized (dep) >= 0) + { + if (GET_CODE (PATTERN (insn)) == SET) + { + if (GET_MODE_CLASS + (GET_MODE (SET_DEST (PATTERN (insn)))) == MODE_FLOAT + || GET_MODE_CLASS + (GET_MODE (SET_SRC (PATTERN (insn)))) == MODE_FLOAT) + { + enum attr_type attr_type_insn = get_attr_type (insn); + enum attr_type attr_type_dep = get_attr_type (dep); + + /* By default all dependencies of the form + s0 = s0 s1 + s0 = s0 s2 + have an extra latency of 1 cycle because + of the input and output dependency in this + case. However this gets modeled as an true + dependency and hence all these checks. 
*/ + if (REG_P (SET_DEST (PATTERN (insn))) + && REG_P (SET_DEST (PATTERN (dep))) + && reg_overlap_mentioned_p (SET_DEST (PATTERN (insn)), + SET_DEST (PATTERN (dep)))) + { + /* FMACS is a special case where the dependant + instruction can be issued 3 cycles before + the normal latency in case of an output + dependency. */ + if ((attr_type_insn == TYPE_FMACS + || attr_type_insn == TYPE_FMACD) + && (attr_type_dep == TYPE_FMACS + || attr_type_dep == TYPE_FMACD)) + { + if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT) + *cost = insn_default_latency (dep) - 3; + else + *cost = insn_default_latency (dep); + return false; + } + else + { + if (REG_NOTE_KIND (link) == REG_DEP_OUTPUT) + *cost = insn_default_latency (dep) + 1; + else + *cost = insn_default_latency (dep); + } + return false; + } + } + } + } + break; + + default: + gcc_unreachable (); + } + + return true; +} + +/* This function implements the target macro TARGET_SCHED_ADJUST_COST. + It corrects the value of COST based on the relationship between + INSN and DEP through the dependence LINK. It returns the new + value. There is a per-core adjust_cost hook to adjust scheduler costs + and the per-core hook can choose to completely override the generic + adjust_cost function. Only put bits of code into arm_adjust_cost that + are common across all cores. */ +static int +arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost) +{ + rtx i_pat, d_pat; + + /* When generating Thumb-1 code, we want to place flag-setting operations + close to a conditional branch which depends on them, so that we can + omit the comparison. */ + if (TARGET_THUMB1 + && REG_NOTE_KIND (link) == 0 + && recog_memoized (insn) == CODE_FOR_cbranchsi4_insn + && recog_memoized (dep) >= 0 + && get_attr_conds (dep) == CONDS_SET) + return 0; + + if (current_tune->sched_adjust_cost != NULL) + { + if (!current_tune->sched_adjust_cost (insn, link, dep, &cost)) + return cost; + } /* XXX This is not strictly true for the FPA. */ if (REG_NOTE_KIND (link) == REG_DEP_ANTI @@ -5694,7 +7941,8 @@ arm_adjust_cost (rtx insn, rtx link, rtx dep, int cost) constant pool are cached, and that others will miss. This is a hack. */ - if ((GET_CODE (src_mem) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (src_mem)) + if ((GET_CODE (src_mem) == SYMBOL_REF + && CONSTANT_POOL_ADDRESS_P (src_mem)) || reg_mentioned_p (stack_pointer_rtx, src_mem) || reg_mentioned_p (frame_pointer_rtx, src_mem) || reg_mentioned_p (hard_frame_pointer_rtx, src_mem)) @@ -5765,7 +8013,7 @@ neg_const_double_rtx_ok_for_fpa (rtx x) init_fp_table (); REAL_VALUE_FROM_CONST_DOUBLE (r, x); - r = REAL_VALUE_NEGATE (r); + r = real_value_negate (&r); if (REAL_VALUE_MINUS_ZERO (r)) return 0; @@ -5816,7 +8064,7 @@ vfp3_const_double_index (rtx x) /* Extract sign, exponent and mantissa. */ sign = REAL_VALUE_NEGATIVE (r) ? 1 : 0; - r = REAL_VALUE_ABS (r); + r = real_value_abs (&r); exponent = REAL_EXP (&r); /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the highest (sign) bit, with a fixed binary point at bit point_pos. @@ -5944,7 +8192,7 @@ neon_valid_immediate (rtx op, enum machine_mode mode, int inverse, break; \ } - unsigned int i, elsize, idx = 0, n_elts = CONST_VECTOR_NUNITS (op); + unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op); unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode)); unsigned char bytes[16]; int immtype = -1, matches; @@ -6209,25 +8457,198 @@ neon_pairwise_reduce (rtx op0, rtx op1, enum machine_mode mode, } } -/* Initialize a vector with non-constant elements. 
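
cortex_a9_sched_adjust_cost above tweaks scheduler latencies for pairs of floating-point sets whose destinations overlap: anti-dependencies become free, an output dependency normally costs one extra cycle, and an FMACS/FMACD pair with an output dependency may issue three cycles early. The model below keeps only that arithmetic; in GCC the inputs come from REG_NOTE_KIND, insn attributes and insn_default_latency, which are reduced to plain parameters here.

#include <stdio.h>

enum dep_kind { DEP_TRUE, DEP_OUTPUT, DEP_ANTI };

/* Latency adjustment for two overlapping FP "set" instructions on
   Cortex-A9, following the switch in the hook above.  */
static int
a9_fp_dep_cost (enum dep_kind kind, int both_are_fmac, int default_latency)
{
  switch (kind)
    {
    case DEP_ANTI:
      return 0;
    case DEP_OUTPUT:
      return both_are_fmac ? default_latency - 3 : default_latency + 1;
    case DEP_TRUE:
    default:
      return default_latency;
    }
}

int
main (void)
{
  printf ("fmac/fmac output dep, latency 8 -> %d\n",
          a9_fp_dep_cost (DEP_OUTPUT, 1, 8));
  printf ("fadd/fadd output dep, latency 4 -> %d\n",
          a9_fp_dep_cost (DEP_OUTPUT, 0, 4));
  printf ("true dependency,      latency 4 -> %d\n",
          a9_fp_dep_cost (DEP_TRUE, 0, 4));
  return 0;
}
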
FIXME: We can do better - than the current implementation (building a vector on the stack and then - loading it) in many cases. See rs6000.c. */ +/* If VALS is a vector constant that can be loaded into a register + using VDUP, generate instructions to do so and return an RTX to + assign to the register. Otherwise return NULL_RTX. */ + +static rtx +neon_vdup_constant (rtx vals) +{ + enum machine_mode mode = GET_MODE (vals); + enum machine_mode inner_mode = GET_MODE_INNER (mode); + int n_elts = GET_MODE_NUNITS (mode); + bool all_same = true; + rtx x; + int i; + + if (GET_CODE (vals) != CONST_VECTOR || GET_MODE_SIZE (inner_mode) > 4) + return NULL_RTX; + + for (i = 0; i < n_elts; ++i) + { + x = XVECEXP (vals, 0, i); + if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0))) + all_same = false; + } + + if (!all_same) + /* The elements are not all the same. We could handle repeating + patterns of a mode larger than INNER_MODE here (e.g. int8x8_t + {0, C, 0, C, 0, C, 0, C} which can be loaded using + vdup.i16). */ + return NULL_RTX; + + /* We can load this constant by using VDUP and a constant in a + single ARM register. This will be cheaper than a vector + load. */ + + x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0)); + return gen_rtx_VEC_DUPLICATE (mode, x); +} + +/* Generate code to load VALS, which is a PARALLEL containing only + constants (for vec_init) or CONST_VECTOR, efficiently into a + register. Returns an RTX to copy into the register, or NULL_RTX + for a PARALLEL that can not be converted into a CONST_VECTOR. */ + +rtx +neon_make_constant (rtx vals) +{ + enum machine_mode mode = GET_MODE (vals); + rtx target; + rtx const_vec = NULL_RTX; + int n_elts = GET_MODE_NUNITS (mode); + int n_const = 0; + int i; + + if (GET_CODE (vals) == CONST_VECTOR) + const_vec = vals; + else if (GET_CODE (vals) == PARALLEL) + { + /* A CONST_VECTOR must contain only CONST_INTs and + CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF). + Only store valid constants in a CONST_VECTOR. */ + for (i = 0; i < n_elts; ++i) + { + rtx x = XVECEXP (vals, 0, i); + if (GET_CODE (x) == CONST_INT || GET_CODE (x) == CONST_DOUBLE) + n_const++; + } + if (n_const == n_elts) + const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)); + } + else + gcc_unreachable (); + + if (const_vec != NULL + && neon_immediate_valid_for_move (const_vec, mode, NULL, NULL)) + /* Load using VMOV. On Cortex-A8 this takes one cycle. */ + return const_vec; + else if ((target = neon_vdup_constant (vals)) != NULL_RTX) + /* Loaded using VDUP. On Cortex-A8 the VDUP takes one NEON + pipeline cycle; creating the constant takes one or two ARM + pipeline cycles. */ + return target; + else if (const_vec != NULL_RTX) + /* Load from constant pool. On Cortex-A8 this takes two cycles + (for either double or quad vectors). We can not take advantage + of single-cycle VLD1 because we need a PC-relative addressing + mode. */ + return const_vec; + else + /* A PARALLEL containing something not valid inside CONST_VECTOR. + We can not construct an initializer. */ + return NULL_RTX; +} + +/* Initialize vector TARGET to VALS. 
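
neon_vdup_constant and neon_make_constant above pick the cheapest way to materialise a vector constant: a VMOV with a vector immediate if the encoding allows it, otherwise a VDUP from one core register when every element is identical (and no wider than a word), otherwise the literal pool. The sketch below models only that ordering; the real encodability test (neon_immediate_valid_for_move) is replaced by a deliberately crude stand-in.

#include <stdio.h>

/* Stand-in predicate for a VMOV-encodable vector immediate: pretend
   only all-zero vectors qualify.  */
static int
fake_vmov_immediate_ok (const int *elt, int n)
{
  for (int i = 0; i < n; i++)
    if (elt[i] != 0)
      return 0;
  return 1;
}

/* Strategy order as in neon_make_constant: VMOV immediate, then VDUP of
   a single scalar, then the literal pool.  */
static const char *
neon_constant_strategy (const int *elt, int n, int elt_size_bytes)
{
  int all_same = 1;

  for (int i = 1; i < n; i++)
    if (elt[i] != elt[0])
      all_same = 0;

  if (fake_vmov_immediate_ok (elt, n))
    return "vmov (vector immediate)";
  if (all_same && elt_size_bytes <= 4)
    return "vdup from a core register";
  return "load from the literal pool";
}

int
main (void)
{
  int zeros[4] = { 0, 0, 0, 0 };
  int splat[4] = { 42, 42, 42, 42 };
  int mixed[4] = { 1, 2, 3, 4 };

  printf ("zeros: %s\n", neon_constant_strategy (zeros, 4, 4));
  printf ("splat: %s\n", neon_constant_strategy (splat, 4, 4));
  printf ("mixed: %s\n", neon_constant_strategy (mixed, 4, 4));
  return 0;
}
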
*/ void neon_expand_vector_init (rtx target, rtx vals) { enum machine_mode mode = GET_MODE (target); - enum machine_mode inner = GET_MODE_INNER (mode); - unsigned int i, n_elts = GET_MODE_NUNITS (mode); - rtx mem; + enum machine_mode inner_mode = GET_MODE_INNER (mode); + int n_elts = GET_MODE_NUNITS (mode); + int n_var = 0, one_var = -1; + bool all_same = true; + rtx x, mem; + int i; + + for (i = 0; i < n_elts; ++i) + { + x = XVECEXP (vals, 0, i); + if (!CONSTANT_P (x)) + ++n_var, one_var = i; + + if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0))) + all_same = false; + } + + if (n_var == 0) + { + rtx constant = neon_make_constant (vals); + if (constant != NULL_RTX) + { + emit_move_insn (target, constant); + return; + } + } + + /* Splat a single non-constant element if we can. */ + if (all_same && GET_MODE_SIZE (inner_mode) <= 4) + { + x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0)); + emit_insn (gen_rtx_SET (VOIDmode, target, + gen_rtx_VEC_DUPLICATE (mode, x))); + return; + } + + /* One field is non-constant. Load constant then overwrite varying + field. This is more efficient than using the stack. */ + if (n_var == 1) + { + rtx copy = copy_rtx (vals); + rtx index = GEN_INT (one_var); + + /* Load constant part of vector, substitute neighboring value for + varying element. */ + XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, (one_var + 1) % n_elts); + neon_expand_vector_init (target, copy); - gcc_assert (VECTOR_MODE_P (mode)); + /* Insert variable. */ + x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var)); + switch (mode) + { + case V8QImode: + emit_insn (gen_neon_vset_lanev8qi (target, x, target, index)); + break; + case V16QImode: + emit_insn (gen_neon_vset_lanev16qi (target, x, target, index)); + break; + case V4HImode: + emit_insn (gen_neon_vset_lanev4hi (target, x, target, index)); + break; + case V8HImode: + emit_insn (gen_neon_vset_lanev8hi (target, x, target, index)); + break; + case V2SImode: + emit_insn (gen_neon_vset_lanev2si (target, x, target, index)); + break; + case V4SImode: + emit_insn (gen_neon_vset_lanev4si (target, x, target, index)); + break; + case V2SFmode: + emit_insn (gen_neon_vset_lanev2sf (target, x, target, index)); + break; + case V4SFmode: + emit_insn (gen_neon_vset_lanev4sf (target, x, target, index)); + break; + case V2DImode: + emit_insn (gen_neon_vset_lanev2di (target, x, target, index)); + break; + default: + gcc_unreachable (); + } + return; + } + /* Construct the vector in memory one field at a time + and load the whole vector. */ mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), 0); for (i = 0; i < n_elts; i++) - emit_move_insn (adjust_address_nv (mem, inner, i * GET_MODE_SIZE (inner)), - XVECEXP (vals, 0, i)); - + emit_move_insn (adjust_address_nv (mem, inner_mode, + i * GET_MODE_SIZE (inner_mode)), + XVECEXP (vals, 0, i)); emit_move_insn (target, mem); } @@ -6386,10 +8807,13 @@ arm_coproc_mem_operand (rtx op, bool wb) } /* Return TRUE if OP is a memory operand which we can load or store a vector - to/from. If CORE is true, we're moving from ARM registers not Neon - registers. */ + to/from. TYPE is one of the following values: + 0 - Vector load/stor (vldr) + 1 - Core registers (ldm) + 2 - Element/structure loads (vld1) + */ int -neon_vector_mem_operand (rtx op, bool core) +neon_vector_mem_operand (rtx op, int type) { rtx ind; @@ -6422,23 +8846,16 @@ neon_vector_mem_operand (rtx op, bool core) return arm_address_register_rtx_p (ind, 0); /* Allow post-increment with Neon registers. 
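
neon_expand_vector_init above now distinguishes four cases instead of always spilling to the stack: all-constant initialisers go through neon_make_constant, an all-identical value is splatted with VDUP, a single variable element is patched in with vset_lane after loading the constant part, and only the general case still builds the vector in a stack slot. A small classifier that mirrors that decision order, with the rtx-level details reduced to flags and values:

#include <stdio.h>

struct elt { int is_const; int value; };

/* Which of the four strategies in the hunk above would be chosen.
   "Element fits in a core register" stands in for the mode-size test.  */
static const char *
vector_init_strategy (const struct elt *e, int n, int elt_fits_core_reg)
{
  int n_var = 0, all_same = 1;

  for (int i = 0; i < n; i++)
    {
      if (!e[i].is_const)
        n_var++;
      if (i > 0 && (e[i].is_const != e[0].is_const || e[i].value != e[0].value))
        all_same = 0;
    }

  if (n_var == 0)
    return "all constant: vmov/vdup/literal pool (neon_make_constant)";
  if (all_same && elt_fits_core_reg)
    return "splat one value with vdup";
  if (n_var == 1)
    return "load constant part, then vset_lane the variable element";
  return "build the vector in a stack slot and load it";
}

int
main (void)
{
  struct elt consts[4] = { {1,1}, {1,2}, {1,3}, {1,4} };
  struct elt same[4]   = { {0,7}, {0,7}, {0,7}, {0,7} };
  struct elt one[4]    = { {1,1}, {0,0}, {1,3}, {1,4} };
  struct elt many[4]   = { {0,0}, {0,1}, {1,3}, {0,2} };

  printf ("%s\n", vector_init_strategy (consts, 4, 1));
  printf ("%s\n", vector_init_strategy (same, 4, 1));
  printf ("%s\n", vector_init_strategy (one, 4, 1));
  printf ("%s\n", vector_init_strategy (many, 4, 1));
  return 0;
}
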
*/ - if (!core && GET_CODE (ind) == POST_INC) + if ((type != 1 && GET_CODE (ind) == POST_INC) + || (type == 0 && GET_CODE (ind) == PRE_DEC)) return arm_address_register_rtx_p (XEXP (ind, 0), 0); -#if 0 - /* FIXME: We can support this too if we use VLD1/VST1. */ - if (!core - && GET_CODE (ind) == POST_MODIFY - && arm_address_register_rtx_p (XEXP (ind, 0), 0) - && GET_CODE (XEXP (ind, 1)) == PLUS - && rtx_equal_p (XEXP (XEXP (ind, 1), 0), XEXP (ind, 0))) - ind = XEXP (ind, 1); -#endif + /* FIXME: vld1 allows register post-modify. */ /* Match: (plus (reg) (const)). */ - if (!core + if (type == 0 && GET_CODE (ind) == PLUS && GET_CODE (XEXP (ind, 0)) == REG && REG_MODE_OK_FOR_BASE_P (XEXP (ind, 0), VOIDmode) @@ -6505,10 +8922,19 @@ arm_eliminable_register (rtx x) enum reg_class coproc_secondary_reload_class (enum machine_mode mode, rtx x, bool wb) { + if (mode == HFmode) + { + if (!TARGET_NEON_FP16) + return GENERAL_REGS; + if (s_register_operand (x, mode) || neon_vector_mem_operand (x, 2)) + return NO_REGS; + return GENERAL_REGS; + } + if (TARGET_NEON && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT) - && neon_vector_mem_operand (x, FALSE)) + && neon_vector_mem_operand (x, 0)) return NO_REGS; if (arm_coproc_mem_operand (x, wb) || s_register_operand (x, mode)) @@ -6786,28 +9212,21 @@ tls_mentioned_p (rtx x) } } -/* Must not copy a SET whose source operand is PC-relative. */ +/* Must not copy any rtx that uses a pc-relative address. */ + +static int +arm_note_pic_base (rtx *x, void *date ATTRIBUTE_UNUSED) +{ + if (GET_CODE (*x) == UNSPEC + && XINT (*x, 1) == UNSPEC_PIC_BASE) + return 1; + return 0; +} static bool arm_cannot_copy_insn_p (rtx insn) { - rtx pat = PATTERN (insn); - - if (GET_CODE (pat) == SET) - { - rtx rhs = SET_SRC (pat); - - if (GET_CODE (rhs) == UNSPEC - && XINT (rhs, 1) == UNSPEC_PIC_BASE) - return TRUE; - - if (GET_CODE (rhs) == MEM - && GET_CODE (XEXP (rhs, 0)) == UNSPEC - && XINT (XEXP (rhs, 0), 1) == UNSPEC_PIC_BASE) - return TRUE; - } - - return FALSE; + return for_each_rtx (&PATTERN (insn), arm_note_pic_base, NULL); } enum rtx_code @@ -6868,7 +9287,7 @@ adjacent_mem_locations (rtx a, rtx b) /* Don't accept any offset that will require multiple instructions to handle, since this would cause the arith_adjacentmem pattern to output an overlong sequence. */ - if (!const_ok_for_op (PLUS, val0) || !const_ok_for_op (PLUS, val1)) + if (!const_ok_for_op (val0, PLUS) || !const_ok_for_op (val1, PLUS)) return 0; /* Don't allow an eliminable register: register elimination can make @@ -6895,19 +9314,151 @@ adjacent_mem_locations (rtx a, rtx b) return 0; } -int -load_multiple_sequence (rtx *operands, int nops, int *regs, int *base, - HOST_WIDE_INT *load_offset) +/* Return true iff it would be profitable to turn a sequence of NOPS loads + or stores (depending on IS_STORE) into a load-multiple or store-multiple + instruction. ADD_OFFSET is nonzero if the base address register needs + to be modified with an add instruction before we can use it. */ + +static bool +multiple_operation_profitable_p (bool is_store ATTRIBUTE_UNUSED, + int nops, HOST_WIDE_INT add_offset) + { + /* For ARM8,9 & StrongARM, 2 ldr instructions are faster than an ldm + if the offset isn't small enough. The reason 2 ldrs are faster + is because these ARMs are able to do more than one cache access + in a single cycle. The ARM9 and StrongARM have Harvard caches, + whilst the ARM8 has a double bandwidth cache. 
This means that + these cores can do both an instruction fetch and a data fetch in + a single cycle, so the trick of calculating the address into a + scratch register (one of the result regs) and then doing a load + multiple actually becomes slower (and no smaller in code size). + That is the transformation + + ldr rd1, [rbase + offset] + ldr rd2, [rbase + offset + 4] + + to + + add rd1, rbase, offset + ldmia rd1, {rd1, rd2} + + produces worse code -- '3 cycles + any stalls on rd2' instead of + '2 cycles + any stalls on rd2'. On ARMs with only one cache + access per cycle, the first sequence could never complete in less + than 6 cycles, whereas the ldm sequence would only take 5 and + would make better use of sequential accesses if not hitting the + cache. + + We cheat here and test 'arm_ld_sched' which we currently know to + only be true for the ARM8, ARM9 and StrongARM. If this ever + changes, then the test below needs to be reworked. */ + if (nops == 2 && arm_ld_sched && add_offset != 0) + return false; + + /* XScale has load-store double instructions, but they have stricter + alignment requirements than load-store multiple, so we cannot + use them. + + For XScale ldm requires 2 + NREGS cycles to complete and blocks + the pipeline until completion. + + NREGS CYCLES + 1 3 + 2 4 + 3 5 + 4 6 + + An ldr instruction takes 1-3 cycles, but does not block the + pipeline. + + NREGS CYCLES + 1 1-3 + 2 2-6 + 3 3-9 + 4 4-12 + + Best case ldr will always win. However, the more ldr instructions + we issue, the less likely we are to be able to schedule them well. + Using ldr instructions also increases code size. + + As a compromise, we use ldr for counts of 1 or 2 regs, and ldm + for counts of 3 or 4 regs. */ + if (nops <= 2 && arm_tune_xscale && !optimize_size) + return false; + return true; +} + +/* Subroutine of load_multiple_sequence and store_multiple_sequence. + Given an array of UNSORTED_OFFSETS, of which there are NOPS, compute + an array ORDER which describes the sequence to use when accessing the + offsets that produces an ascending order. In this sequence, each + offset must be larger by exactly 4 than the previous one. ORDER[0] + must have been filled in with the lowest offset by the caller. + If UNSORTED_REGS is nonnull, it is an array of register numbers that + we use to verify that ORDER produces an ascending order of registers. + Return true if it was possible to construct such an order, false if + not. */ + +static bool +compute_offset_order (int nops, HOST_WIDE_INT *unsorted_offsets, int *order, + int *unsorted_regs) +{ + int i; + for (i = 1; i < nops; i++) + { + int j; + + order[i] = order[i - 1]; + for (j = 0; j < nops; j++) + if (unsorted_offsets[j] == unsorted_offsets[order[i - 1]] + 4) + { + /* We must find exactly one offset that is higher than the + previous one by 4. */ + if (order[i] != order[i - 1]) + return false; + order[i] = j; + } + if (order[i] == order[i - 1]) + return false; + /* The register numbers must be ascending. */ + if (unsorted_regs != NULL + && unsorted_regs[order[i]] <= unsorted_regs[order[i - 1]]) + return false; + } + return true; +} + +/* Used to determine in a peephole whether a sequence of load + instructions can be changed into a load-multiple instruction. + NOPS is the number of separate load instructions we are examining. The + first NOPS entries in OPERANDS are the destination registers, the + next NOPS entries are memory operands. 
If this function is + successful, *BASE is set to the common base register of the memory + accesses; *LOAD_OFFSET is set to the first memory location's offset + from that base register. + REGS is an array filled in with the destination register numbers. + SAVED_ORDER (if nonnull), is an array filled in with an order that maps + insn numbers to to an ascending order of stores. If CHECK_REGS is true, + the sequence of registers in REGS matches the loads from ascending memory + locations, and the function verifies that the register numbers are + themselves ascending. If CHECK_REGS is false, the register numbers + are stored in the order they are found in the operands. */ +static int +load_multiple_sequence (rtx *operands, int nops, int *regs, int *saved_order, + int *base, HOST_WIDE_INT *load_offset, bool check_regs) { - int unsorted_regs[4]; - HOST_WIDE_INT unsorted_offsets[4]; - int order[4]; + int unsorted_regs[MAX_LDM_STM_OPS]; + HOST_WIDE_INT unsorted_offsets[MAX_LDM_STM_OPS]; + int order[MAX_LDM_STM_OPS]; + rtx base_reg_rtx = NULL; int base_reg = -1; - int i; + int i, ldm_case; + + /* Can only handle up to MAX_LDM_STM_OPS insns at present, though could be + easily extended if required. */ + gcc_assert (nops >= 2 && nops <= MAX_LDM_STM_OPS); - /* Can only handle 2, 3, or 4 insns at present, - though could be easily extended if required. */ - gcc_assert (nops >= 2 && nops <= 4); + memset (order, 0, MAX_LDM_STM_OPS * sizeof (int)); /* Loop over the operands and check that the memory references are suitable (i.e. immediate offsets from the same base register). At @@ -6945,32 +9496,30 @@ load_multiple_sequence (rtx *operands, int nops, int *regs, int *base, if (i == 0) { base_reg = REGNO (reg); - unsorted_regs[0] = (GET_CODE (operands[i]) == REG - ? REGNO (operands[i]) - : REGNO (SUBREG_REG (operands[i]))); - order[0] = 0; - } - else - { - if (base_reg != (int) REGNO (reg)) - /* Not addressed from the same base register. */ + base_reg_rtx = reg; + if (TARGET_THUMB1 && base_reg > LAST_LO_REGNUM) return 0; - - unsorted_regs[i] = (GET_CODE (operands[i]) == REG - ? REGNO (operands[i]) - : REGNO (SUBREG_REG (operands[i]))); - if (unsorted_regs[i] < unsorted_regs[order[0]]) - order[0] = i; } + else if (base_reg != (int) REGNO (reg)) + /* Not addressed from the same base register. */ + return 0; + + unsorted_regs[i] = (GET_CODE (operands[i]) == REG + ? REGNO (operands[i]) + : REGNO (SUBREG_REG (operands[i]))); /* If it isn't an integer register, or if it overwrites the base register but isn't the last insn in the list, then we can't do this. */ - if (unsorted_regs[i] < 0 || unsorted_regs[i] > 14 + if (unsorted_regs[i] < 0 + || (TARGET_THUMB1 && unsorted_regs[i] > LAST_LO_REGNUM) + || unsorted_regs[i] > 14 || (i != nops - 1 && unsorted_regs[i] == base_reg)) return 0; unsorted_offsets[i] = INTVAL (offset); + if (i == 0 || unsorted_offsets[i] < unsorted_offsets[order[0]]) + order[0] = i; } else /* Not a suitable memory address. */ @@ -6979,162 +9528,87 @@ load_multiple_sequence (rtx *operands, int nops, int *regs, int *base, /* All the useful information has now been extracted from the operands into unsorted_regs and unsorted_offsets; additionally, - order[0] has been set to the lowest numbered register in the - list. Sort the registers into order, and check that the memory - offsets are ascending and adjacent. 
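
compute_offset_order, added above, is the piece both peepholes share: starting from the operand with the lowest offset it demands that each further access sit exactly 4 bytes higher, and optionally that the register numbers ascend along the same order. It is almost self-contained, so the copy below (HOST_WIDE_INT replaced by long) can be compiled and run on a scrambled example:

#include <stdio.h>
#include <stdbool.h>

/* Copy of compute_offset_order from the hunk above.  ORDER[0] must
   already hold the index of the lowest offset.  */
static bool
compute_offset_order (int nops, long *unsorted_offsets, int *order,
                      int *unsorted_regs)
{
  int i;
  for (i = 1; i < nops; i++)
    {
      int j;

      order[i] = order[i - 1];
      for (j = 0; j < nops; j++)
        if (unsorted_offsets[j] == unsorted_offsets[order[i - 1]] + 4)
          {
            /* There must be exactly one offset 4 higher than the
               previous one.  */
            if (order[i] != order[i - 1])
              return false;
            order[i] = j;
          }
      if (order[i] == order[i - 1])
        return false;
      /* The register numbers must be ascending.  */
      if (unsorted_regs != NULL
          && unsorted_regs[order[i]] <= unsorted_regs[order[i - 1]])
        return false;
    }
  return true;
}

int
main (void)
{
  /* ldr r4,[rb,#8]; ldr r5,[rb,#0]; ldr r6,[rb,#4]; ldr r7,[rb,#12]  */
  long offsets[4] = { 8, 0, 4, 12 };
  int regs[4] = { 4, 5, 6, 7 };
  int order[4] = { 1, 0, 0, 0 };   /* Operand 1 holds the lowest offset.  */

  if (compute_offset_order (4, offsets, order, NULL))
    for (int i = 0; i < 4; i++)
      printf ("access %d: operand %d (offset %ld, r%d)\n",
              i, order[i], offsets[order[i]], regs[order[i]]);
  return 0;
}
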
*/ - - for (i = 1; i < nops; i++) - { - int j; - - order[i] = order[i - 1]; - for (j = 0; j < nops; j++) - if (unsorted_regs[j] > unsorted_regs[order[i - 1]] - && (order[i] == order[i - 1] - || unsorted_regs[j] < unsorted_regs[order[i]])) - order[i] = j; - - /* Have we found a suitable register? if not, one must be used more - than once. */ - if (order[i] == order[i - 1]) - return 0; + order[0] has been set to the lowest offset in the list. Sort + the offsets into order, verifying that they are adjacent, and + check that the register numbers are ascending. */ + if (!compute_offset_order (nops, unsorted_offsets, order, + check_regs ? unsorted_regs : NULL)) + return 0; - /* Is the memory address adjacent and ascending? */ - if (unsorted_offsets[order[i]] != unsorted_offsets[order[i - 1]] + 4) - return 0; - } + if (saved_order) + memcpy (saved_order, order, sizeof order); if (base) { *base = base_reg; for (i = 0; i < nops; i++) - regs[i] = unsorted_regs[order[i]]; + regs[i] = unsorted_regs[check_regs ? order[i] : i]; *load_offset = unsorted_offsets[order[0]]; } - if (unsorted_offsets[order[0]] == 0) - return 1; /* ldmia */ - - if (TARGET_ARM && unsorted_offsets[order[0]] == 4) - return 2; /* ldmib */ - - if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0) - return 3; /* ldmda */ - - if (unsorted_offsets[order[nops - 1]] == -4) - return 4; /* ldmdb */ - - /* For ARM8,9 & StrongARM, 2 ldr instructions are faster than an ldm - if the offset isn't small enough. The reason 2 ldrs are faster - is because these ARMs are able to do more than one cache access - in a single cycle. The ARM9 and StrongARM have Harvard caches, - whilst the ARM8 has a double bandwidth cache. This means that - these cores can do both an instruction fetch and a data fetch in - a single cycle, so the trick of calculating the address into a - scratch register (one of the result regs) and then doing a load - multiple actually becomes slower (and no smaller in code size). - That is the transformation - - ldr rd1, [rbase + offset] - ldr rd2, [rbase + offset + 4] - - to - - add rd1, rbase, offset - ldmia rd1, {rd1, rd2} - - produces worse code -- '3 cycles + any stalls on rd2' instead of - '2 cycles + any stalls on rd2'. On ARMs with only one cache - access per cycle, the first sequence could never complete in less - than 6 cycles, whereas the ldm sequence would only take 5 and - would make better use of sequential accesses if not hitting the - cache. - - We cheat here and test 'arm_ld_sched' which we currently know to - only be true for the ARM8, ARM9 and StrongARM. If this ever - changes, then the test below needs to be reworked. */ - if (nops == 2 && arm_ld_sched) + if (TARGET_THUMB1 + && !peep2_reg_dead_p (nops, base_reg_rtx)) return 0; - /* Can't do it without setting up the offset, only do this if it takes - no more than one insn. */ - return (const_ok_for_arm (unsorted_offsets[order[0]]) - || const_ok_for_arm (-unsorted_offsets[order[0]])) ? 
5 : 0; -} - -const char * -emit_ldm_seq (rtx *operands, int nops) -{ - int regs[4]; - int base_reg; - HOST_WIDE_INT offset; - char buf[100]; - int i; - - switch (load_multiple_sequence (operands, nops, regs, &base_reg, &offset)) - { - case 1: - strcpy (buf, "ldm%(ia%)\t"); - break; - - case 2: - strcpy (buf, "ldm%(ib%)\t"); - break; - - case 3: - strcpy (buf, "ldm%(da%)\t"); - break; - - case 4: - strcpy (buf, "ldm%(db%)\t"); - break; - - case 5: - if (offset >= 0) - sprintf (buf, "add%%?\t%s%s, %s%s, #%ld", REGISTER_PREFIX, - reg_names[regs[0]], REGISTER_PREFIX, reg_names[base_reg], - (long) offset); - else - sprintf (buf, "sub%%?\t%s%s, %s%s, #%ld", REGISTER_PREFIX, - reg_names[regs[0]], REGISTER_PREFIX, reg_names[base_reg], - (long) -offset); - output_asm_insn (buf, operands); - base_reg = regs[0]; - strcpy (buf, "ldm%(ia%)\t"); - break; - - default: - gcc_unreachable (); - } - - sprintf (buf + strlen (buf), "%s%s, {%s%s", REGISTER_PREFIX, - reg_names[base_reg], REGISTER_PREFIX, reg_names[regs[0]]); - - for (i = 1; i < nops; i++) - sprintf (buf + strlen (buf), ", %s%s", REGISTER_PREFIX, - reg_names[regs[i]]); - - strcat (buf, "}\t%@ phole ldm"); + if (unsorted_offsets[order[0]] == 0) + ldm_case = 1; /* ldmia */ + else if (TARGET_ARM && unsorted_offsets[order[0]] == 4) + ldm_case = 2; /* ldmib */ + else if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0) + ldm_case = 3; /* ldmda */ + else if (TARGET_32BIT && unsorted_offsets[order[nops - 1]] == -4) + ldm_case = 4; /* ldmdb */ + else if (const_ok_for_arm (unsorted_offsets[order[0]]) + || const_ok_for_arm (-unsorted_offsets[order[0]])) + ldm_case = 5; + else + return 0; - output_asm_insn (buf, operands); - return ""; -} + if (!multiple_operation_profitable_p (false, nops, + ldm_case == 5 + ? unsorted_offsets[order[0]] : 0)) + return 0; -int -store_multiple_sequence (rtx *operands, int nops, int *regs, int *base, - HOST_WIDE_INT * load_offset) -{ - int unsorted_regs[4]; - HOST_WIDE_INT unsorted_offsets[4]; - int order[4]; + return ldm_case; +} + +/* Used to determine in a peephole whether a sequence of store instructions can + be changed into a store-multiple instruction. + NOPS is the number of separate store instructions we are examining. + NOPS_TOTAL is the total number of instructions recognized by the peephole + pattern. + The first NOPS entries in OPERANDS are the source registers, the next + NOPS entries are memory operands. If this function is successful, *BASE is + set to the common base register of the memory accesses; *LOAD_OFFSET is set + to the first memory location's offset from that base register. REGS is an + array filled in with the source register numbers, REG_RTXS (if nonnull) is + likewise filled with the corresponding rtx's. + SAVED_ORDER (if nonnull), is an array filled in with an order that maps insn + numbers to to an ascending order of stores. + If CHECK_REGS is true, the sequence of registers in *REGS matches the stores + from ascending memory locations, and the function verifies that the register + numbers are themselves ascending. If CHECK_REGS is false, the register + numbers are stored in the order they are found in the operands. 
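
The ldm_case values assigned above map the first and last offsets onto the ARM load-multiple addressing variants, with case 5 meaning "materialise the base with an add/sub first, then use ldmia". The classifier below restates that mapping; const_ok_for_arm is approximated by a simple magnitude check, which is only a stand-in.

#include <stdio.h>
#include <stdlib.h>

enum target_kind { TGT_ARM, TGT_THUMB2, TGT_THUMB1 };

/* Addressing variant chosen from the lowest and highest offsets,
   following the case numbering in load_multiple_sequence above.  */
static const char *
classify_ldm (long first_offset, long last_offset, enum target_kind tgt)
{
  int is_arm = (tgt == TGT_ARM);
  int is_32bit = (tgt == TGT_ARM || tgt == TGT_THUMB2);

  if (first_offset == 0)
    return "ldmia";
  if (is_arm && first_offset == 4)
    return "ldmib";
  if (is_arm && last_offset == 0)
    return "ldmda";
  if (is_32bit && last_offset == -4)
    return "ldmdb";
  if (labs (first_offset) < 256)      /* Stand-in for const_ok_for_arm.  */
    return "add scratch base, then ldmia (case 5)";
  return "not combinable";
}

int
main (void)
{
  printf ("%s\n", classify_ldm (0, 12, TGT_ARM));      /* ldmia */
  printf ("%s\n", classify_ldm (4, 16, TGT_ARM));      /* ldmib */
  printf ("%s\n", classify_ldm (-12, 0, TGT_ARM));     /* ldmda */
  printf ("%s\n", classify_ldm (-16, -4, TGT_THUMB2)); /* ldmdb */
  printf ("%s\n", classify_ldm (32, 44, TGT_THUMB2));  /* add + ldmia */
  return 0;
}
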
*/ +static int +store_multiple_sequence (rtx *operands, int nops, int nops_total, + int *regs, rtx *reg_rtxs, int *saved_order, int *base, + HOST_WIDE_INT *load_offset, bool check_regs) +{ + int unsorted_regs[MAX_LDM_STM_OPS]; + rtx unsorted_reg_rtxs[MAX_LDM_STM_OPS]; + HOST_WIDE_INT unsorted_offsets[MAX_LDM_STM_OPS]; + int order[MAX_LDM_STM_OPS]; int base_reg = -1; - int i; + rtx base_reg_rtx = NULL; + int i, stm_case; - /* Can only handle 2, 3, or 4 insns at present, though could be easily - extended if required. */ - gcc_assert (nops >= 2 && nops <= 4); + /* Can only handle up to MAX_LDM_STM_OPS insns at present, though could be + easily extended if required. */ + gcc_assert (nops >= 2 && nops <= MAX_LDM_STM_OPS); + + memset (order, 0, MAX_LDM_STM_OPS * sizeof (int)); /* Loop over the operands and check that the memory references are suitable (i.e. immediate offsets from the same base register). At @@ -7169,32 +9643,32 @@ store_multiple_sequence (rtx *operands, int nops, int *regs, int *base, && (GET_CODE (offset = XEXP (XEXP (operands[nops + i], 0), 1)) == CONST_INT))) { + unsorted_reg_rtxs[i] = (GET_CODE (operands[i]) == REG + ? operands[i] : SUBREG_REG (operands[i])); + unsorted_regs[i] = REGNO (unsorted_reg_rtxs[i]); + if (i == 0) { base_reg = REGNO (reg); - unsorted_regs[0] = (GET_CODE (operands[i]) == REG - ? REGNO (operands[i]) - : REGNO (SUBREG_REG (operands[i]))); - order[0] = 0; - } - else - { - if (base_reg != (int) REGNO (reg)) - /* Not addressed from the same base register. */ + base_reg_rtx = reg; + if (TARGET_THUMB1 && base_reg > LAST_LO_REGNUM) return 0; - - unsorted_regs[i] = (GET_CODE (operands[i]) == REG - ? REGNO (operands[i]) - : REGNO (SUBREG_REG (operands[i]))); - if (unsorted_regs[i] < unsorted_regs[order[0]]) - order[0] = i; } + else if (base_reg != (int) REGNO (reg)) + /* Not addressed from the same base register. */ + return 0; /* If it isn't an integer register, then we can't do this. */ - if (unsorted_regs[i] < 0 || unsorted_regs[i] > 14) + if (unsorted_regs[i] < 0 + || (TARGET_THUMB1 && unsorted_regs[i] > LAST_LO_REGNUM) + || (TARGET_THUMB2 && unsorted_regs[i] == base_reg) + || (TARGET_THUMB2 && unsorted_regs[i] == SP_REGNUM) + || unsorted_regs[i] > 14) return 0; unsorted_offsets[i] = INTVAL (offset); + if (i == 0 || unsorted_offsets[i] < unsorted_offsets[order[0]]) + order[0] = i; } else /* Not a suitable memory address. */ @@ -7203,253 +9677,450 @@ store_multiple_sequence (rtx *operands, int nops, int *regs, int *base, /* All the useful information has now been extracted from the operands into unsorted_regs and unsorted_offsets; additionally, - order[0] has been set to the lowest numbered register in the - list. Sort the registers into order, and check that the memory - offsets are ascending and adjacent. */ - - for (i = 1; i < nops; i++) - { - int j; - - order[i] = order[i - 1]; - for (j = 0; j < nops; j++) - if (unsorted_regs[j] > unsorted_regs[order[i - 1]] - && (order[i] == order[i - 1] - || unsorted_regs[j] < unsorted_regs[order[i]])) - order[i] = j; - - /* Have we found a suitable register? if not, one must be used more - than once. */ - if (order[i] == order[i - 1]) - return 0; + order[0] has been set to the lowest offset in the list. Sort + the offsets into order, verifying that they are adjacent, and + check that the register numbers are ascending. */ + if (!compute_offset_order (nops, unsorted_offsets, order, + check_regs ? unsorted_regs : NULL)) + return 0; - /* Is the memory address adjacent and ascending? 
*/ - if (unsorted_offsets[order[i]] != unsorted_offsets[order[i - 1]] + 4) - return 0; - } + if (saved_order) + memcpy (saved_order, order, sizeof order); if (base) { *base = base_reg; for (i = 0; i < nops; i++) - regs[i] = unsorted_regs[order[i]]; + { + regs[i] = unsorted_regs[check_regs ? order[i] : i]; + if (reg_rtxs) + reg_rtxs[i] = unsorted_reg_rtxs[check_regs ? order[i] : i]; + } *load_offset = unsorted_offsets[order[0]]; } + if (TARGET_THUMB1 + && !peep2_reg_dead_p (nops_total, base_reg_rtx)) + return 0; + if (unsorted_offsets[order[0]] == 0) - return 1; /* stmia */ + stm_case = 1; /* stmia */ + else if (TARGET_ARM && unsorted_offsets[order[0]] == 4) + stm_case = 2; /* stmib */ + else if (TARGET_ARM && unsorted_offsets[order[nops - 1]] == 0) + stm_case = 3; /* stmda */ + else if (TARGET_32BIT && unsorted_offsets[order[nops - 1]] == -4) + stm_case = 4; /* stmdb */ + else + return 0; + + if (!multiple_operation_profitable_p (false, nops, 0)) + return 0; + + return stm_case; +} + +/* Routines for use in generating RTL. */ + +/* Generate a load-multiple instruction. COUNT is the number of loads in + the instruction; REGS and MEMS are arrays containing the operands. + BASEREG is the base register to be used in addressing the memory operands. + WBACK_OFFSET is nonzero if the instruction should update the base + register. */ + +static rtx +arm_gen_load_multiple_1 (int count, int *regs, rtx *mems, rtx basereg, + HOST_WIDE_INT wback_offset) +{ + int i = 0, j; + rtx result; - if (unsorted_offsets[order[0]] == 4) - return 2; /* stmib */ + if (!multiple_operation_profitable_p (false, count, 0)) + { + rtx seq; + + start_sequence (); - if (unsorted_offsets[order[nops - 1]] == 0) - return 3; /* stmda */ + for (i = 0; i < count; i++) + emit_move_insn (gen_rtx_REG (SImode, regs[i]), mems[i]); - if (unsorted_offsets[order[nops - 1]] == -4) - return 4; /* stmdb */ + if (wback_offset != 0) + emit_move_insn (basereg, plus_constant (basereg, wback_offset)); - return 0; + seq = get_insns (); + end_sequence (); + + return seq; + } + + result = gen_rtx_PARALLEL (VOIDmode, + rtvec_alloc (count + (wback_offset != 0 ? 1 : 0))); + if (wback_offset != 0) + { + XVECEXP (result, 0, 0) + = gen_rtx_SET (VOIDmode, basereg, + plus_constant (basereg, wback_offset)); + i = 1; + count++; + } + + for (j = 0; i < count; i++, j++) + XVECEXP (result, 0, i) + = gen_rtx_SET (VOIDmode, gen_rtx_REG (SImode, regs[j]), mems[j]); + + return result; } -const char * -emit_stm_seq (rtx *operands, int nops) +/* Generate a store-multiple instruction. COUNT is the number of stores in + the instruction; REGS and MEMS are arrays containing the operands. + BASEREG is the base register to be used in addressing the memory operands. + WBACK_OFFSET is nonzero if the instruction should update the base + register. 
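
multiple_operation_profitable_p (added further up) is the gate that both the peephole analysers and arm_gen_load_multiple_1 / arm_gen_store_multiple_1 consult before merging accesses: two loads with an extra base-setup add lose on ARM8/ARM9/StrongARM, and one or two loads lose on XScale unless size matters. A standalone restatement with the tuning globals turned into parameters:

#include <stdio.h>
#include <stdbool.h>

/* The heuristic from multiple_operation_profitable_p above; arm_ld_sched,
   arm_tune_xscale and optimize_size become plain arguments.  */
static bool
multiple_op_profitable (int nops, long add_offset,
                        bool ld_sched, bool tune_xscale, bool opt_size)
{
  /* On ARM8/ARM9/StrongARM two ldrs beat an ldm when an extra add would
     be needed to set up the base.  */
  if (nops == 2 && ld_sched && add_offset != 0)
    return false;

  /* On XScale, separate ldrs schedule better for 1-2 registers unless
     we are optimising for size.  */
  if (nops <= 2 && tune_xscale && !opt_size)
    return false;

  return true;
}

int
main (void)
{
  printf ("StrongARM, 2 regs, base needs add: %d\n",
          multiple_op_profitable (2, 8, true, false, false));
  printf ("XScale,    2 regs, -O2:            %d\n",
          multiple_op_profitable (2, 0, false, true, false));
  printf ("XScale,    2 regs, -Os:            %d\n",
          multiple_op_profitable (2, 0, false, true, true));
  printf ("Generic,   4 regs:                 %d\n",
          multiple_op_profitable (4, 0, false, false, false));
  return 0;
}
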
*/ + +static rtx +arm_gen_store_multiple_1 (int count, int *regs, rtx *mems, rtx basereg, + HOST_WIDE_INT wback_offset) { - int regs[4]; - int base_reg; - HOST_WIDE_INT offset; - char buf[100]; - int i; + int i = 0, j; + rtx result; + + if (GET_CODE (basereg) == PLUS) + basereg = XEXP (basereg, 0); - switch (store_multiple_sequence (operands, nops, regs, &base_reg, &offset)) + if (!multiple_operation_profitable_p (false, count, 0)) { - case 1: - strcpy (buf, "stm%(ia%)\t"); - break; + rtx seq; - case 2: - strcpy (buf, "stm%(ib%)\t"); - break; + start_sequence (); - case 3: - strcpy (buf, "stm%(da%)\t"); - break; + for (i = 0; i < count; i++) + emit_move_insn (mems[i], gen_rtx_REG (SImode, regs[i])); - case 4: - strcpy (buf, "stm%(db%)\t"); - break; + if (wback_offset != 0) + emit_move_insn (basereg, plus_constant (basereg, wback_offset)); - default: - gcc_unreachable (); + seq = get_insns (); + end_sequence (); + + return seq; } - sprintf (buf + strlen (buf), "%s%s, {%s%s", REGISTER_PREFIX, - reg_names[base_reg], REGISTER_PREFIX, reg_names[regs[0]]); + result = gen_rtx_PARALLEL (VOIDmode, + rtvec_alloc (count + (wback_offset != 0 ? 1 : 0))); + if (wback_offset != 0) + { + XVECEXP (result, 0, 0) + = gen_rtx_SET (VOIDmode, basereg, + plus_constant (basereg, wback_offset)); + i = 1; + count++; + } - for (i = 1; i < nops; i++) - sprintf (buf + strlen (buf), ", %s%s", REGISTER_PREFIX, - reg_names[regs[i]]); + for (j = 0; i < count; i++, j++) + XVECEXP (result, 0, i) + = gen_rtx_SET (VOIDmode, mems[j], gen_rtx_REG (SImode, regs[j])); - strcat (buf, "}\t%@ phole stm"); + return result; +} - output_asm_insn (buf, operands); - return ""; +/* Generate either a load-multiple or a store-multiple instruction. This + function can be used in situations where we can start with a single MEM + rtx and adjust its address upwards. + COUNT is the number of operations in the instruction, not counting a + possible update of the base register. REGS is an array containing the + register operands. + BASEREG is the base register to be used in addressing the memory operands, + which are constructed from BASEMEM. + WRITE_BACK specifies whether the generated instruction should include an + update of the base register. + OFFSETP is used to pass an offset to and from this function; this offset + is not used when constructing the address (instead BASEMEM should have an + appropriate offset in its address), it is used only for setting + MEM_OFFSET. It is updated only if WRITE_BACK is true.*/ + +static rtx +arm_gen_multiple_op (bool is_load, int *regs, int count, rtx basereg, + bool write_back, rtx basemem, HOST_WIDE_INT *offsetp) +{ + rtx mems[MAX_LDM_STM_OPS]; + HOST_WIDE_INT offset = *offsetp; + int i; + + gcc_assert (count <= MAX_LDM_STM_OPS); + + if (GET_CODE (basereg) == PLUS) + basereg = XEXP (basereg, 0); + + for (i = 0; i < count; i++) + { + rtx addr = plus_constant (basereg, i * 4); + mems[i] = adjust_automodify_address_nv (basemem, SImode, addr, offset); + offset += 4; + } + + if (write_back) + *offsetp = offset; + + if (is_load) + return arm_gen_load_multiple_1 (count, regs, mems, basereg, + write_back ? 4 * count : 0); + else + return arm_gen_store_multiple_1 (count, regs, mems, basereg, + write_back ? 4 * count : 0); } - -/* Routines for use in generating RTL. 
*/ rtx -arm_gen_load_multiple (int base_regno, int count, rtx from, int up, - int write_back, rtx basemem, HOST_WIDE_INT *offsetp) +arm_gen_load_multiple (int *regs, int count, rtx basereg, int write_back, + rtx basemem, HOST_WIDE_INT *offsetp) { - HOST_WIDE_INT offset = *offsetp; - int i = 0, j; - rtx result; - int sign = up ? 1 : -1; - rtx mem, addr; + return arm_gen_multiple_op (TRUE, regs, count, basereg, write_back, basemem, + offsetp); +} - /* XScale has load-store double instructions, but they have stricter - alignment requirements than load-store multiple, so we cannot - use them. +rtx +arm_gen_store_multiple (int *regs, int count, rtx basereg, int write_back, + rtx basemem, HOST_WIDE_INT *offsetp) +{ + return arm_gen_multiple_op (FALSE, regs, count, basereg, write_back, basemem, + offsetp); +} - For XScale ldm requires 2 + NREGS cycles to complete and blocks - the pipeline until completion. +/* Called from a peephole2 expander to turn a sequence of loads into an + LDM instruction. OPERANDS are the operands found by the peephole matcher; + NOPS indicates how many separate loads we are trying to combine. SORT_REGS + is true if we can reorder the registers because they are used commutatively + subsequently. + Returns true iff we could generate a new instruction. */ - NREGS CYCLES - 1 3 - 2 4 - 3 5 - 4 6 +bool +gen_ldm_seq (rtx *operands, int nops, bool sort_regs) +{ + int regs[MAX_LDM_STM_OPS], mem_order[MAX_LDM_STM_OPS]; + rtx mems[MAX_LDM_STM_OPS]; + int i, j, base_reg; + rtx base_reg_rtx; + HOST_WIDE_INT offset; + int write_back = FALSE; + int ldm_case; + rtx addr; - An ldr instruction takes 1-3 cycles, but does not block the - pipeline. + ldm_case = load_multiple_sequence (operands, nops, regs, mem_order, + &base_reg, &offset, !sort_regs); - NREGS CYCLES - 1 1-3 - 2 2-6 - 3 3-9 - 4 4-12 + if (ldm_case == 0) + return false; - Best case ldr will always win. However, the more ldr instructions - we issue, the less likely we are to be able to schedule them well. - Using ldr instructions also increases code size. + if (sort_regs) + for (i = 0; i < nops - 1; i++) + for (j = i + 1; j < nops; j++) + if (regs[i] > regs[j]) + { + int t = regs[i]; + regs[i] = regs[j]; + regs[j] = t; + } + base_reg_rtx = gen_rtx_REG (Pmode, base_reg); + + if (TARGET_THUMB1) + { + gcc_assert (peep2_reg_dead_p (nops, base_reg_rtx)); + gcc_assert (ldm_case == 1 || ldm_case == 5); + write_back = TRUE; + } + + if (ldm_case == 5) + { + rtx newbase = TARGET_THUMB1 ? base_reg_rtx : gen_rtx_REG (SImode, regs[0]); + emit_insn (gen_addsi3 (newbase, base_reg_rtx, GEN_INT (offset))); + offset = 0; + if (!TARGET_THUMB1) + { + base_reg = regs[0]; + base_reg_rtx = newbase; + } + } - As a compromise, we use ldr for counts of 1 or 2 regs, and ldm - for counts of 3 or 4 regs. */ - if (arm_tune_xscale && count <= 2 && ! optimize_size) + for (i = 0; i < nops; i++) { - rtx seq; + addr = plus_constant (base_reg_rtx, offset + i * 4); + mems[i] = adjust_automodify_address_nv (operands[nops + mem_order[i]], + SImode, addr, 0); + } + emit_insn (arm_gen_load_multiple_1 (nops, regs, mems, base_reg_rtx, + write_back ? offset + i * 4 : 0)); + return true; +} - start_sequence (); +/* Called from a peephole2 expander to turn a sequence of stores into an + STM instruction. OPERANDS are the operands found by the peephole matcher; + NOPS indicates how many separate stores we are trying to combine. + Returns true iff we could generate a new instruction. 
*/ - for (i = 0; i < count; i++) - { - addr = plus_constant (from, i * 4 * sign); - mem = adjust_automodify_address (basemem, SImode, addr, offset); - emit_move_insn (gen_rtx_REG (SImode, base_regno + i), mem); - offset += 4 * sign; - } +bool +gen_stm_seq (rtx *operands, int nops) +{ + int i; + int regs[MAX_LDM_STM_OPS], mem_order[MAX_LDM_STM_OPS]; + rtx mems[MAX_LDM_STM_OPS]; + int base_reg; + rtx base_reg_rtx; + HOST_WIDE_INT offset; + int write_back = FALSE; + int stm_case; + rtx addr; + bool base_reg_dies; - if (write_back) - { - emit_move_insn (from, plus_constant (from, count * 4 * sign)); - *offsetp = offset; - } + stm_case = store_multiple_sequence (operands, nops, nops, regs, NULL, + mem_order, &base_reg, &offset, true); - seq = get_insns (); - end_sequence (); + if (stm_case == 0) + return false; - return seq; - } + base_reg_rtx = gen_rtx_REG (Pmode, base_reg); - result = gen_rtx_PARALLEL (VOIDmode, - rtvec_alloc (count + (write_back ? 1 : 0))); - if (write_back) + base_reg_dies = peep2_reg_dead_p (nops, base_reg_rtx); + if (TARGET_THUMB1) { - XVECEXP (result, 0, 0) - = gen_rtx_SET (VOIDmode, from, plus_constant (from, count * 4 * sign)); - i = 1; - count++; + gcc_assert (base_reg_dies); + write_back = TRUE; } - for (j = 0; i < count; i++, j++) + if (stm_case == 5) { - addr = plus_constant (from, j * 4 * sign); - mem = adjust_automodify_address_nv (basemem, SImode, addr, offset); - XVECEXP (result, 0, i) - = gen_rtx_SET (VOIDmode, gen_rtx_REG (SImode, base_regno + j), mem); - offset += 4 * sign; + gcc_assert (base_reg_dies); + emit_insn (gen_addsi3 (base_reg_rtx, base_reg_rtx, GEN_INT (offset))); + offset = 0; } - if (write_back) - *offsetp = offset; + addr = plus_constant (base_reg_rtx, offset); - return result; + for (i = 0; i < nops; i++) + { + addr = plus_constant (base_reg_rtx, offset + i * 4); + mems[i] = adjust_automodify_address_nv (operands[nops + mem_order[i]], + SImode, addr, 0); + } + emit_insn (arm_gen_store_multiple_1 (nops, regs, mems, base_reg_rtx, + write_back ? offset + i * 4 : 0)); + return true; } -rtx -arm_gen_store_multiple (int base_regno, int count, rtx to, int up, - int write_back, rtx basemem, HOST_WIDE_INT *offsetp) +/* Called from a peephole2 expander to turn a sequence of stores that are + preceded by constant loads into an STM instruction. OPERANDS are the + operands found by the peephole matcher; NOPS indicates how many + separate stores we are trying to combine; there are 2 * NOPS + instructions in the peephole. + Returns true iff we could generate a new instruction. */ + +bool +gen_const_stm_seq (rtx *operands, int nops) { - HOST_WIDE_INT offset = *offsetp; - int i = 0, j; - rtx result; - int sign = up ? 1 : -1; - rtx mem, addr; + int regs[MAX_LDM_STM_OPS], sorted_regs[MAX_LDM_STM_OPS]; + int reg_order[MAX_LDM_STM_OPS], mem_order[MAX_LDM_STM_OPS]; + rtx reg_rtxs[MAX_LDM_STM_OPS], orig_reg_rtxs[MAX_LDM_STM_OPS]; + rtx mems[MAX_LDM_STM_OPS]; + int base_reg; + rtx base_reg_rtx; + HOST_WIDE_INT offset; + int write_back = FALSE; + int stm_case; + rtx addr; + bool base_reg_dies; + int i, j; + HARD_REG_SET allocated; - /* See arm_gen_load_multiple for discussion of - the pros/cons of ldm/stm usage for XScale. */ - if (arm_tune_xscale && count <= 2 && ! 
optimize_size) - { - rtx seq; + stm_case = store_multiple_sequence (operands, nops, 2 * nops, regs, reg_rtxs, + mem_order, &base_reg, &offset, false); - start_sequence (); + if (stm_case == 0) + return false; - for (i = 0; i < count; i++) - { - addr = plus_constant (to, i * 4 * sign); - mem = adjust_automodify_address (basemem, SImode, addr, offset); - emit_move_insn (mem, gen_rtx_REG (SImode, base_regno + i)); - offset += 4 * sign; - } + memcpy (orig_reg_rtxs, reg_rtxs, sizeof orig_reg_rtxs); - if (write_back) - { - emit_move_insn (to, plus_constant (to, count * 4 * sign)); - *offsetp = offset; - } + /* If the same register is used more than once, try to find a free + register. */ + CLEAR_HARD_REG_SET (allocated); + for (i = 0; i < nops; i++) + { + for (j = i + 1; j < nops; j++) + if (regs[i] == regs[j]) + { + rtx t = peep2_find_free_register (0, nops * 2, + TARGET_THUMB1 ? "l" : "r", + SImode, &allocated); + if (t == NULL_RTX) + return false; + reg_rtxs[i] = t; + regs[i] = REGNO (t); + } + } - seq = get_insns (); - end_sequence (); + /* Compute an ordering that maps the register numbers to an ascending + sequence. */ + reg_order[0] = 0; + for (i = 0; i < nops; i++) + if (regs[i] < regs[reg_order[0]]) + reg_order[0] = i; - return seq; + for (i = 1; i < nops; i++) + { + int this_order = reg_order[i - 1]; + for (j = 0; j < nops; j++) + if (regs[j] > regs[reg_order[i - 1]] + && (this_order == reg_order[i - 1] + || regs[j] < regs[this_order])) + this_order = j; + reg_order[i] = this_order; } - result = gen_rtx_PARALLEL (VOIDmode, - rtvec_alloc (count + (write_back ? 1 : 0))); - if (write_back) + /* Ensure that registers that must be live after the instruction end + up with the correct value. */ + for (i = 0; i < nops; i++) { - XVECEXP (result, 0, 0) - = gen_rtx_SET (VOIDmode, to, - plus_constant (to, count * 4 * sign)); - i = 1; - count++; + int this_order = reg_order[i]; + if ((this_order != mem_order[i] + || orig_reg_rtxs[this_order] != reg_rtxs[this_order]) + && !peep2_reg_dead_p (nops * 2, orig_reg_rtxs[this_order])) + return false; } - for (j = 0; i < count; i++, j++) + /* Load the constants. */ + for (i = 0; i < nops; i++) { - addr = plus_constant (to, j * 4 * sign); - mem = adjust_automodify_address_nv (basemem, SImode, addr, offset); - XVECEXP (result, 0, i) - = gen_rtx_SET (VOIDmode, mem, gen_rtx_REG (SImode, base_regno + j)); - offset += 4 * sign; + rtx op = operands[2 * nops + mem_order[i]]; + sorted_regs[i] = regs[reg_order[i]]; + emit_move_insn (reg_rtxs[reg_order[i]], op); } - if (write_back) - *offsetp = offset; + base_reg_rtx = gen_rtx_REG (Pmode, base_reg); - return result; + base_reg_dies = peep2_reg_dead_p (nops * 2, base_reg_rtx); + if (TARGET_THUMB1) + { + gcc_assert (base_reg_dies); + write_back = TRUE; + } + + if (stm_case == 5) + { + gcc_assert (base_reg_dies); + emit_insn (gen_addsi3 (base_reg_rtx, base_reg_rtx, GEN_INT (offset))); + offset = 0; + } + + addr = plus_constant (base_reg_rtx, offset); + + for (i = 0; i < nops; i++) + { + addr = plus_constant (base_reg_rtx, offset + i * 4); + mems[i] = adjust_automodify_address_nv (operands[nops + mem_order[i]], + SImode, addr, 0); + } + emit_insn (arm_gen_store_multiple_1 (nops, sorted_regs, mems, base_reg_rtx, + write_back ? 
offset + i * 4 : 0)); + return true; } int @@ -7485,20 +10156,21 @@ arm_gen_movmemqi (rtx *operands) for (i = 0; in_words_to_go >= 2; i+=4) { if (in_words_to_go > 4) - emit_insn (arm_gen_load_multiple (0, 4, src, TRUE, TRUE, - srcbase, &srcoffset)); + emit_insn (arm_gen_load_multiple (arm_regs_in_sequence, 4, src, + TRUE, srcbase, &srcoffset)); else - emit_insn (arm_gen_load_multiple (0, in_words_to_go, src, TRUE, - FALSE, srcbase, &srcoffset)); + emit_insn (arm_gen_load_multiple (arm_regs_in_sequence, in_words_to_go, + src, FALSE, srcbase, + &srcoffset)); if (out_words_to_go) { if (out_words_to_go > 4) - emit_insn (arm_gen_store_multiple (0, 4, dst, TRUE, TRUE, - dstbase, &dstoffset)); + emit_insn (arm_gen_store_multiple (arm_regs_in_sequence, 4, dst, + TRUE, dstbase, &dstoffset)); else if (out_words_to_go != 1) - emit_insn (arm_gen_store_multiple (0, out_words_to_go, - dst, TRUE, + emit_insn (arm_gen_store_multiple (arm_regs_in_sequence, + out_words_to_go, dst, (last_bytes == 0 ? FALSE : TRUE), dstbase, &dstoffset)); @@ -7782,7 +10454,8 @@ arm_select_cc_mode (enum rtx_code op, rtx x, rtx y) /* A compare with a shifted operand. Because of canonicalization, the comparison will have to be swapped when we emit the assembler. */ - if (GET_MODE (y) == SImode && GET_CODE (y) == REG + if (GET_MODE (y) == SImode + && (REG_P (y) || (GET_CODE (y) == SUBREG)) && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT || GET_CODE (x) == LSHIFTRT || GET_CODE (x) == ROTATE || GET_CODE (x) == ROTATERT)) @@ -7790,7 +10463,8 @@ arm_select_cc_mode (enum rtx_code op, rtx x, rtx y) /* This operation is performed swapped, but since we only rely on the Z flag we don't need an additional mode. */ - if (GET_MODE (y) == SImode && REG_P (y) + if (GET_MODE (y) == SImode + && (REG_P (y) || (GET_CODE (y) == SUBREG)) && GET_CODE (x) == NEG && (op == EQ || op == NE)) return CC_Zmode; @@ -7871,6 +10545,55 @@ arm_select_cc_mode (enum rtx_code op, rtx x, rtx y) && (rtx_equal_p (XEXP (x, 0), y) || rtx_equal_p (XEXP (x, 1), y))) return CC_Cmode; + if (GET_MODE (x) == DImode || GET_MODE (y) == DImode) + { + /* To keep things simple, always use the Cirrus cfcmp64 if it is + available. */ + if (TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK) + return CCmode; + + switch (op) + { + case EQ: + case NE: + /* A DImode comparison against zero can be implemented by + or'ing the two halves together. */ + if (y == const0_rtx) + return CC_Zmode; + + /* We can do an equality test in three Thumb instructions. */ + if (!TARGET_ARM) + return CC_Zmode; + + /* FALLTHROUGH */ + + case LTU: + case LEU: + case GTU: + case GEU: + /* DImode unsigned comparisons can be implemented by cmp + + cmpeq without a scratch register. Not worth doing in + Thumb-2. */ + if (TARGET_ARM) + return CC_CZmode; + + /* FALLTHROUGH */ + + case LT: + case LE: + case GT: + case GE: + /* DImode signed and unsigned comparisons can be implemented + by cmp + sbcs with a scratch register, but that does not + set the Z flag - we must reverse GT/LE/GTU/LEU. 
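
The DImode handling added to arm_select_cc_mode in this hunk chooses a condition-code mode per comparison: equality against zero (or any equality in Thumb) can OR the two halves and use CC_Zmode, equality and unsigned orderings on ARM use cmp + cmpeq (CC_CZmode), and everything else needs cmp + sbcs with a scratch register (CC_NCVmode, which forces GT/LE/GTU/LEU to be reversed). A compact model of that selection, leaving out the Maverick cfcmp64 special case:

#include <stdio.h>
#include <string.h>

/* OP is a textual comparison code; IS_ARM distinguishes ARM state from
   Thumb.  Follows the new switch in arm_select_cc_mode above.  */
static const char *
di_cc_mode (const char *op, int rhs_is_zero, int is_arm)
{
  int is_eq = !strcmp (op, "eq") || !strcmp (op, "ne");
  int is_unsigned = !strcmp (op, "ltu") || !strcmp (op, "leu")
                    || !strcmp (op, "gtu") || !strcmp (op, "geu");

  if (is_eq && (rhs_is_zero || !is_arm))
    return "CC_Zmode (orr the halves / short Thumb equality test)";
  if ((is_eq || is_unsigned) && is_arm)
    return "CC_CZmode (cmp + cmpeq, no scratch register)";
  return "CC_NCVmode (cmp + sbcs with a scratch register)";
}

int
main (void)
{
  printf ("eq  vs 0, ARM:   %s\n", di_cc_mode ("eq", 1, 1));
  printf ("eq  vs y, ARM:   %s\n", di_cc_mode ("eq", 0, 1));
  printf ("ltu vs y, ARM:   %s\n", di_cc_mode ("ltu", 0, 1));
  printf ("ltu vs y, Thumb: %s\n", di_cc_mode ("ltu", 0, 0));
  printf ("lt  vs y, ARM:   %s\n", di_cc_mode ("lt", 0, 1));
  return 0;
}
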
*/ + gcc_assert (op != EQ && op != NE); + return CC_NCVmode; + + default: + gcc_unreachable (); + } + } + return CCmode; } @@ -7880,10 +10603,39 @@ arm_select_cc_mode (enum rtx_code op, rtx x, rtx y) rtx arm_gen_compare_reg (enum rtx_code code, rtx x, rtx y) { - enum machine_mode mode = SELECT_CC_MODE (code, x, y); - rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM); + enum machine_mode mode; + rtx cc_reg; + int dimode_comparison = GET_MODE (x) == DImode || GET_MODE (y) == DImode; + + /* We might have X as a constant, Y as a register because of the predicates + used for cmpdi. If so, force X to a register here. */ + if (dimode_comparison && !REG_P (x)) + x = force_reg (DImode, x); + + mode = SELECT_CC_MODE (code, x, y); + cc_reg = gen_rtx_REG (mode, CC_REGNUM); + + if (dimode_comparison + && !(TARGET_HARD_FLOAT && TARGET_MAVERICK) + && mode != CC_CZmode) + { + rtx clobber, set; - emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y)); + /* To compare two non-zero values for equality, XOR them and + then compare against zero. Not used for ARM mode; there + CC_CZmode is cheaper. */ + if (mode == CC_Zmode && y != const0_rtx) + { + x = expand_binop (DImode, xor_optab, x, y, NULL_RTX, 0, OPTAB_WIDEN); + y = const0_rtx; + } + /* A scratch register is required. */ + clobber = gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (SImode)); + set = gen_rtx_SET (VOIDmode, cc_reg, gen_rtx_COMPARE (mode, x, y)); + emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clobber))); + } + else + emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y)); return cc_reg; } @@ -8769,17 +11521,20 @@ add_minipool_backward_ref (Mfix *fix) its maximum address (which can happen if we have re-located a forwards fix); force the new fix to come after it. */ - min_mp = mp; - min_address = mp->min_address + fix->fix_size; + if (ARM_DOUBLEWORD_ALIGN + && fix->fix_size >= 8 && mp->fix_size < 8) + return NULL; + else + { + min_mp = mp; + min_address = mp->min_address + fix->fix_size; + } } - /* If we are inserting an 8-bytes aligned quantity and - we have not already found an insertion point, then - make sure that all such 8-byte aligned quantities are - placed at the start of the pool. */ + /* Do not insert a non-8-byte aligned quantity before 8-byte + aligned quantities. */ else if (ARM_DOUBLEWORD_ALIGN - && min_mp == NULL - && fix->fix_size >= 8 - && mp->fix_size < 8) + && fix->fix_size < 8 + && mp->fix_size >= 8) { min_mp = mp; min_address = mp->min_address + fix->fix_size; @@ -9209,6 +11964,34 @@ arm_const_double_by_parts (rtx val) return false; } +/* Return true if it is possible to inline both the high and low parts + of a 64-bit constant into 32-bit data processing instructions. */ +bool +arm_const_double_by_immediates (rtx val) +{ + enum machine_mode mode = GET_MODE (val); + rtx part; + + if (mode == VOIDmode) + mode = DImode; + + part = gen_highpart_mode (SImode, mode, val); + + gcc_assert (GET_CODE (part) == CONST_INT); + + if (!const_ok_for_arm (INTVAL (part))) + return false; + + part = gen_lowpart (SImode, val); + + gcc_assert (GET_CODE (part) == CONST_INT); + + if (!const_ok_for_arm (INTVAL (part))) + return false; + + return true; +} + /* Scan INSN and note any of its operands that need fixing. If DO_PUSHES is false we do not actually push any of the fixups needed. The function returns TRUE if any fixups were needed/pushed. 
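/* Editorial note: a simplified, ARM-mode-only model of the immediate test
   that arm_const_double_by_immediates applies to each half of a 64-bit
   constant.  This is an assumption-laden sketch, not the real
   const_ok_for_arm (which also knows about Thumb-2 encodings): an ARM
   data-processing immediate is an 8-bit value rotated right by an even
   amount.  */

#include <stdbool.h>
#include <stdint.h>

static bool
arm_immediate_ok_sketch (uint32_t v)
{
  for (int rot = 0; rot < 32; rot += 2)
    {
      /* Undo a rotate-right by ROT, i.e. rotate left by ROT.  */
      uint32_t unrotated = rot ? ((v << rot) | (v >> (32 - rot))) : v;
      if (unrotated <= 0xff)
        return true;
    }
  return false;
}

/* Both halves must pass before the 64-bit constant can be built with two
   32-bit data-processing instructions.  */
static bool
const64_by_immediates_sketch (uint64_t v)
{
  return arm_immediate_ok_sketch ((uint32_t) v)
         && arm_immediate_ok_sketch ((uint32_t) (v >> 32));
}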
@@ -9282,6 +12065,60 @@ note_invalid_constants (rtx insn, HOST_WIDE_INT address, int do_pushes) return result; } +/* Convert instructions to their cc-clobbering variant if possible, since + that allows us to use smaller encodings. */ + +static void +thumb2_reorg (void) +{ + basic_block bb; + regset_head live; + + INIT_REG_SET (&live); + + /* We are freeing block_for_insn in the toplev to keep compatibility + with old MDEP_REORGS that are not CFG based. Recompute it now. */ + compute_bb_for_insn (); + df_analyze (); + + FOR_EACH_BB (bb) + { + rtx insn; + COPY_REG_SET (&live, DF_LR_OUT (bb)); + df_simulate_initialize_backwards (bb, &live); + FOR_BB_INSNS_REVERSE (bb, insn) + { + if (NONJUMP_INSN_P (insn) + && !REGNO_REG_SET_P (&live, CC_REGNUM)) + { + rtx pat = PATTERN (insn); + if (GET_CODE (pat) == SET + && low_register_operand (XEXP (pat, 0), SImode) + && thumb_16bit_operator (XEXP (pat, 1), SImode) + && low_register_operand (XEXP (XEXP (pat, 1), 0), SImode) + && low_register_operand (XEXP (XEXP (pat, 1), 1), SImode)) + { + rtx dst = XEXP (pat, 0); + rtx src = XEXP (pat, 1); + rtx op0 = XEXP (src, 0); + if (rtx_equal_p (dst, op0) + || GET_CODE (src) == PLUS || GET_CODE (src) == MINUS) + { + rtx ccreg = gen_rtx_REG (CCmode, CC_REGNUM); + rtx clobber = gen_rtx_CLOBBER (VOIDmode, ccreg); + rtvec vec = gen_rtvec (2, pat, clobber); + PATTERN (insn) = gen_rtx_PARALLEL (VOIDmode, vec); + INSN_CODE (insn) = -1; + } + } + } + if (NONDEBUG_INSN_P (insn)) + df_simulate_one_insn_backwards (bb, insn, &live); + } + } + CLEAR_REG_SET (&live); +} + /* Gcc puts the pool in the wrong place for ARM, since we can only load addresses a limited distance around the pc. We do some special munging to move the constant pool values to the correct @@ -9293,6 +12130,9 @@ arm_reorg (void) HOST_WIDE_INT address = 0; Mfix * fix; + if (TARGET_THUMB2) + thumb2_reorg (); + minipool_fix_head = minipool_fix_tail = NULL; /* The first insn must always be a note, or the code below won't @@ -9633,9 +12473,14 @@ vfp_emit_fstmd (int base_reg, int count) XVECEXP (par, 0, 0) = gen_rtx_SET (VOIDmode, - gen_frame_mem (BLKmode, - gen_rtx_PRE_DEC (BLKmode, - stack_pointer_rtx)), + gen_frame_mem + (BLKmode, + gen_rtx_PRE_MODIFY (Pmode, + stack_pointer_rtx, + plus_constant + (stack_pointer_rtx, + - (count * 8))) + ), gen_rtx_UNSPEC (BLKmode, gen_rtvec (1, reg), UNSPEC_PUSH_MULT)); @@ -9667,8 +12512,7 @@ vfp_emit_fstmd (int base_reg, int count) } par = emit_insn (par); - REG_NOTES (par) = gen_rtx_EXPR_LIST (REG_FRAME_RELATED_EXPR, dwarf, - REG_NOTES (par)); + add_reg_note (par, REG_FRAME_RELATED_EXPR, dwarf); RTX_FRAME_RELATED_P (par) = 1; return count * 8; @@ -9722,11 +12566,14 @@ output_call (rtx *operands) return ""; } -/* Output a 'call' insn that is a reference in memory. */ +/* Output a 'call' insn that is a reference in memory. This is + disabled for ARMv5 and we prefer a blx instead because otherwise + there's a significant performance overhead. */ const char * output_call_mem (rtx *operands) { - if (TARGET_INTERWORK && !arm_arch5) + gcc_assert (!arm_arch5); + if (TARGET_INTERWORK) { output_asm_insn ("ldr%?\t%|ip, %0", operands); output_asm_insn ("mov%?\t%|lr, %|pc", operands); @@ -9738,16 +12585,11 @@ output_call_mem (rtx *operands) first instruction. It's safe to use IP as the target of the load since the call will kill it anyway. 
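/* Editorial note: a standalone sketch of the test thumb2_reorg applies
   before rewriting a SET into its flag-clobbering (16-bit encodable) form.
   The scalar parameters and the operator list are stand-ins for
   low_register_operand, thumb_16bit_operator and the DF-based CC liveness
   query used above; treat the exact operator set as an assumption.  */

#include <stdbool.h>

enum op2_sketch { OP_PLUS, OP_MINUS, OP_AND, OP_IOR, OP_XOR };

static bool
is_low_reg_sketch (int regno)
{
  return regno >= 0 && regno <= 7;   /* r0-r7 */
}

static bool
can_use_16bit_flag_setting_form_sketch (enum op2_sketch code, int dst,
                                        int op0, int op1,
                                        bool cc_live_after)
{
  if (cc_live_after)
    return false;                    /* The 16-bit forms clobber the flags.  */
  if (!is_low_reg_sketch (dst) || !is_low_reg_sketch (op0)
      || !is_low_reg_sketch (op1))
    return false;
  /* Two-operand logical forms need the destination to equal the first
     source; 16-bit ADDS/SUBS also exist in a three-register form, which is
     why PLUS and MINUS are accepted unconditionally above.  */
  return dst == op0 || code == OP_PLUS || code == OP_MINUS;
}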
*/ output_asm_insn ("ldr%?\t%|ip, %0", operands); - if (arm_arch5) - output_asm_insn ("blx%?\t%|ip", operands); + output_asm_insn ("mov%?\t%|lr, %|pc", operands); + if (arm_arch4t) + output_asm_insn ("bx%?\t%|ip", operands); else - { - output_asm_insn ("mov%?\t%|lr, %|pc", operands); - if (arm_arch4t) - output_asm_insn ("bx%?\t%|ip", operands); - else - output_asm_insn ("mov%?\t%|pc, %|ip", operands); - } + output_asm_insn ("mov%?\t%|pc, %|ip", operands); } else { @@ -9834,6 +12676,23 @@ output_mov_long_double_arm_from_arm (rtx *operands) return ""; } +void +arm_emit_movpair (rtx dest, rtx src) + { + /* If the src is an immediate, simplify it. */ + if (CONST_INT_P (src)) + { + HOST_WIDE_INT val = INTVAL (src); + emit_set_insn (dest, GEN_INT (val & 0x0000ffff)); + if ((val >> 16) & 0x0000ffff) + emit_set_insn (gen_rtx_ZERO_EXTRACT (SImode, dest, GEN_INT (16), + GEN_INT (16)), + GEN_INT ((val >> 16) & 0x0000ffff)); + return; + } + emit_set_insn (dest, gen_rtx_HIGH (SImode, src)); + emit_set_insn (dest, gen_rtx_LO_SUM (SImode, dest, src)); + } /* Output a move from arm registers to an fpa registers. OPERANDS[0] is an fpa register. @@ -9883,7 +12742,7 @@ output_move_double (rtx *operands) if (code0 == REG) { - int reg0 = REGNO (operands[0]); + unsigned int reg0 = REGNO (operands[0]); otherops[0] = gen_rtx_REG (SImode, 1 + reg0); @@ -9892,7 +12751,8 @@ output_move_double (rtx *operands) switch (GET_CODE (XEXP (operands[1], 0))) { case REG: - if (TARGET_LDRD) + if (TARGET_LDRD + && !(fix_cm3_ldrd && reg0 == REGNO(XEXP (operands[1], 0)))) output_asm_insn ("ldr%(d%)\t%0, [%m1]", operands); else output_asm_insn ("ldm%(ia%)\t%m1, %M0", operands); @@ -9924,6 +12784,10 @@ output_move_double (rtx *operands) case PRE_MODIFY: case POST_MODIFY: + /* Autoicrement addressing modes should never have overlapping + base and destination registers, and overlapping index registers + are already prohibited, so this doesn't need to worry about + fix_cm3_ldrd. */ otherops[0] = operands[0]; otherops[1] = XEXP (XEXP (XEXP (operands[1], 0), 1), 0); otherops[2] = XEXP (XEXP (XEXP (operands[1], 0), 1), 1); @@ -9938,36 +12802,36 @@ output_move_double (rtx *operands) } else { - /* IWMMXT allows offsets larger than ldrd can handle, - fix these up with a pair of ldr. */ - if (GET_CODE (otherops[2]) == CONST_INT - && (INTVAL(otherops[2]) <= -256 - || INTVAL(otherops[2]) >= 256)) + /* Use a single insn if we can. + FIXME: IWMMXT allows offsets larger than ldrd can + handle, fix these up with a pair of ldr. */ + if (TARGET_THUMB2 + || GET_CODE (otherops[2]) != CONST_INT + || (INTVAL (otherops[2]) > -256 + && INTVAL (otherops[2]) < 256)) + output_asm_insn ("ldr%(d%)\t%0, [%1, %2]!", otherops); + else { output_asm_insn ("ldr%?\t%0, [%1, %2]!", otherops); - otherops[0] = gen_rtx_REG (SImode, 1 + reg0); - output_asm_insn ("ldr%?\t%0, [%1, #4]", otherops); + output_asm_insn ("ldr%?\t%H0, [%1, #4]", otherops); } - else - output_asm_insn ("ldr%(d%)\t%0, [%1, %2]!", otherops); } } else { - /* IWMMXT allows offsets larger than ldrd can handle, + /* Use a single insn if we can. + FIXME: IWMMXT allows offsets larger than ldrd can handle, fix these up with a pair of ldr. 
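/* Editorial note: a standalone sketch of the constant split performed by
   arm_emit_movpair above.  The register name r0 is just a placeholder; the
   point is that the low 16 bits are written first and the movt-style upper
   write is skipped entirely when the top half is zero.  */

#include <stdint.h>
#include <stdio.h>

static void
emit_movpair_sketch (uint32_t val)
{
  uint32_t lo = val & 0x0000ffff;
  uint32_t hi = (val >> 16) & 0x0000ffff;

  printf ("movw\tr0, #%lu\n", (unsigned long) lo);   /* writes low 16 bits, zeroes the rest */
  if (hi != 0)
    printf ("movt\tr0, #%lu\n", (unsigned long) hi); /* writes high 16 bits, keeps the low */
}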
*/ - if (GET_CODE (otherops[2]) == CONST_INT - && (INTVAL(otherops[2]) <= -256 - || INTVAL(otherops[2]) >= 256)) + if (TARGET_THUMB2 + || GET_CODE (otherops[2]) != CONST_INT + || (INTVAL (otherops[2]) > -256 + && INTVAL (otherops[2]) < 256)) + output_asm_insn ("ldr%(d%)\t%0, [%1], %2", otherops); + else { - otherops[0] = gen_rtx_REG (SImode, 1 + reg0); - output_asm_insn ("ldr%?\t%0, [%1, #4]", otherops); - otherops[0] = operands[0]; + output_asm_insn ("ldr%?\t%H0, [%1, #4]", otherops); output_asm_insn ("ldr%?\t%0, [%1], %2", otherops); } - else - /* We only allow constant increments, so this is safe. */ - output_asm_insn ("ldr%(d%)\t%0, [%1], %2", otherops); } break; @@ -9976,11 +12840,15 @@ output_move_double (rtx *operands) /* We might be able to use ldrd %0, %1 here. However the range is different to ldr/adr, and it is broken on some ARMv7-M implementations. */ - output_asm_insn ("adr%?\t%0, %1", operands); + /* Use the second register of the pair to avoid problematic + overlap. */ + otherops[1] = operands[1]; + output_asm_insn ("adr%?\t%0, %1", otherops); + operands[1] = otherops[0]; if (TARGET_LDRD) - output_asm_insn ("ldr%(d%)\t%0, [%0]", operands); + output_asm_insn ("ldr%(d%)\t%0, [%1]", operands); else - output_asm_insn ("ldm%(ia%)\t%0, %M0", operands); + output_asm_insn ("ldm%(ia%)\t%1, %M0", operands); break; /* ??? This needs checking for thumb2. */ @@ -10013,30 +12881,38 @@ output_move_double (rtx *operands) return ""; } } + otherops[0] = gen_rtx_REG(SImode, REGNO(operands[0]) + 1); + operands[1] = otherops[0]; if (TARGET_LDRD && (GET_CODE (otherops[2]) == REG + || TARGET_THUMB2 || (GET_CODE (otherops[2]) == CONST_INT && INTVAL (otherops[2]) > -256 && INTVAL (otherops[2]) < 256))) { - if (reg_overlap_mentioned_p (otherops[0], + if (reg_overlap_mentioned_p (operands[0], otherops[2])) { + rtx tmp; /* Swap base and index registers over to avoid a conflict. */ - otherops[1] = XEXP (XEXP (operands[1], 0), 1); - otherops[2] = XEXP (XEXP (operands[1], 0), 0); + tmp = otherops[1]; + otherops[1] = otherops[2]; + otherops[2] = tmp; } /* If both registers conflict, it will usually have been fixed by a splitter. */ - if (reg_overlap_mentioned_p (otherops[0], otherops[2])) + if (reg_overlap_mentioned_p (operands[0], otherops[2]) + || (fix_cm3_ldrd && reg0 == REGNO (otherops[1]))) { - output_asm_insn ("add%?\t%1, %1, %2", otherops); - output_asm_insn ("ldr%(d%)\t%0, [%1]", - otherops); + output_asm_insn ("add%?\t%0, %1, %2", otherops); + output_asm_insn ("ldr%(d%)\t%0, [%1]", operands); } else - output_asm_insn ("ldr%(d%)\t%0, [%1, %2]", otherops); + { + otherops[0] = operands[0]; + output_asm_insn ("ldr%(d%)\t%0, [%1, %2]", otherops); + } return ""; } @@ -10054,9 +12930,9 @@ output_move_double (rtx *operands) output_asm_insn ("sub%?\t%0, %1, %2", otherops); if (TARGET_LDRD) - return "ldr%(d%)\t%0, [%0]"; + return "ldr%(d%)\t%0, [%1]"; - return "ldm%(ia%)\t%0, %M0"; + return "ldm%(ia%)\t%1, %M0"; } else { @@ -10122,24 +12998,20 @@ output_move_double (rtx *operands) /* IWMMXT allows offsets larger than ldrd can handle, fix these up with a pair of ldr. 
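/* Editorial note: a standalone sketch of the offset test used above when
   choosing between a single ldrd and a pair of ldr instructions for
   ARM-mode pre/post-modify addresses (Thumb-2 takes the single-insn path
   unconditionally in the code above).  ARM-mode LDRD/STRD carry an 8-bit
   immediate plus a sign, hence the open interval below.  */

#include <stdbool.h>

static bool
arm_ldrd_offset_ok_sketch (long offset)
{
  return offset > -256 && offset < 256;
}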
*/ - if (GET_CODE (otherops[2]) == CONST_INT + if (!TARGET_THUMB2 + && GET_CODE (otherops[2]) == CONST_INT && (INTVAL(otherops[2]) <= -256 || INTVAL(otherops[2]) >= 256)) { - rtx reg1; - reg1 = gen_rtx_REG (SImode, 1 + REGNO (operands[1])); if (GET_CODE (XEXP (operands[0], 0)) == PRE_MODIFY) { - output_asm_insn ("ldr%?\t%0, [%1, %2]!", otherops); - otherops[0] = reg1; - output_asm_insn ("ldr%?\t%0, [%1, #4]", otherops); + output_asm_insn ("str%?\t%0, [%1, %2]!", otherops); + output_asm_insn ("str%?\t%H0, [%1, #4]", otherops); } else { - otherops[0] = reg1; - output_asm_insn ("ldr%?\t%0, [%1, #4]", otherops); - otherops[0] = operands[1]; - output_asm_insn ("ldr%?\t%0, [%1], %2", otherops); + output_asm_insn ("str%?\t%H0, [%1, #4]", otherops); + output_asm_insn ("str%?\t%0, [%1], %2", otherops); } } else if (GET_CODE (XEXP (operands[0], 0)) == PRE_MODIFY) @@ -10173,6 +13045,7 @@ output_move_double (rtx *operands) } if (TARGET_LDRD && (GET_CODE (otherops[2]) == REG + || TARGET_THUMB2 || (GET_CODE (otherops[2]) == CONST_INT && INTVAL (otherops[2]) > -256 && INTVAL (otherops[2]) < 256))) @@ -10186,9 +13059,9 @@ output_move_double (rtx *operands) default: otherops[0] = adjust_address (operands[0], SImode, 4); - otherops[1] = gen_rtx_REG (SImode, 1 + REGNO (operands[1])); + otherops[1] = operands[1]; output_asm_insn ("str%?\t%1, %0", operands); - output_asm_insn ("str%?\t%1, %0", otherops); + output_asm_insn ("str%?\t%H1, %0", otherops); } } @@ -10196,7 +13069,7 @@ output_move_double (rtx *operands) } /* Output a move, load or store for quad-word vectors in ARM registers. Only - handles MEMs accepted by neon_vector_mem_operand with CORE=true. */ + handles MEMs accepted by neon_vector_mem_operand with TYPE=1. */ const char * output_move_quad (rtx *operands) @@ -10280,7 +13153,7 @@ output_move_vfp (rtx *operands) int load = REG_P (operands[0]); int dp = GET_MODE_SIZE (GET_MODE (operands[0])) == 8; int integer_p = GET_MODE_CLASS (GET_MODE (operands[0])) == MODE_INT; - const char *template; + const char *templ; char buff[50]; enum machine_mode mode; @@ -10303,25 +13176,25 @@ output_move_vfp (rtx *operands) switch (GET_CODE (addr)) { case PRE_DEC: - template = "f%smdb%c%%?\t%%0!, {%%%s1}%s"; + templ = "f%smdb%c%%?\t%%0!, {%%%s1}%s"; ops[0] = XEXP (addr, 0); ops[1] = reg; break; case POST_INC: - template = "f%smia%c%%?\t%%0!, {%%%s1}%s"; + templ = "f%smia%c%%?\t%%0!, {%%%s1}%s"; ops[0] = XEXP (addr, 0); ops[1] = reg; break; default: - template = "f%s%c%%?\t%%%s0, %%1%s"; + templ = "f%s%c%%?\t%%%s0, %%1%s"; ops[0] = reg; ops[1] = mem; break; } - sprintf (buff, template, + sprintf (buff, templ, load ? "ld" : "st", dp ? 'd' : 's', dp ? "P" : "", @@ -10332,37 +13205,35 @@ output_move_vfp (rtx *operands) } /* Output a Neon quad-word load or store, or a load or store for - larger structure modes. We could also support post-modify forms using - VLD1/VST1 (for the vectorizer, and perhaps otherwise), but we don't do that - yet. - WARNING: The ordering of elements in memory is weird in big-endian mode, - because we use VSTM instead of VST1, to make it easy to make vector stores - via ARM registers write values in the same order as stores direct from Neon - registers. For example, the byte ordering of a quadword vector with 16-byte - elements like this: + larger structure modes. - [e7:e6:e5:e4:e3:e2:e1:e0] (highest-numbered element first) + WARNING: The ordering of elements is weird in big-endian mode, + because we use VSTM, as required by the EABI. GCC RTL defines + element ordering based on in-memory order. 
This can be differ + from the architectural ordering of elements within a NEON register. + The intrinsics defined in arm_neon.h use the NEON register element + ordering, not the GCC RTL element ordering. - will be (with lowest address first, h = most-significant byte, - l = least-significant byte of element): + For example, the in-memory ordering of a big-endian a quadword + vector with 16-bit elements when stored from register pair {d0,d1} + will be (lowest address first, d0[N] is NEON register element N): - [e3h, e3l, e2h, e2l, e1h, e1l, e0h, e0l, - e7h, e7l, e6h, e6l, e5h, e5l, e4h, e4l] + [d0[3], d0[2], d0[1], d0[0], d1[7], d1[6], d1[5], d1[4]] - When necessary, quadword registers (dN, dN+1) are moved to ARM registers from - rN in the order: + When necessary, quadword registers (dN, dN+1) are moved to ARM + registers from rN in the order: dN -> (rN+1, rN), dN+1 -> (rN+3, rN+2) - So that STM/LDM can be used on vectors in ARM registers, and the same memory - layout will result as if VSTM/VLDM were used. */ + So that STM/LDM can be used on vectors in ARM registers, and the + same memory layout will result as if VSTM/VLDM were used. */ const char * output_move_neon (rtx *operands) { rtx reg, mem, addr, ops[2]; int regno, load = REG_P (operands[0]); - const char *template; + const char *templ; char buff[50]; enum machine_mode mode; @@ -10389,11 +13260,18 @@ output_move_neon (rtx *operands) switch (GET_CODE (addr)) { case POST_INC: - template = "v%smia%%?\t%%0!, %%h1"; + templ = "v%smia%%?\t%%0!, %%h1"; ops[0] = XEXP (addr, 0); ops[1] = reg; break; + case PRE_DEC: + /* FIXME: We should be using vld1/vst1 here in BE mode? */ + templ = "v%smdb%%?\t%%0!, %%h1"; + ops[0] = XEXP (addr, 0); + ops[1] = reg; + break; + case POST_MODIFY: /* FIXME: Not currently enabled in neon_vector_mem_operand. */ gcc_unreachable (); @@ -10408,7 +13286,7 @@ output_move_neon (rtx *operands) { /* We're only using DImode here because it's a convenient size. */ ops[0] = gen_rtx_REG (DImode, REGNO (reg) + 2 * i); - ops[1] = adjust_address (mem, SImode, 8 * i); + ops[1] = adjust_address (mem, DImode, 8 * i); if (reg_overlap_mentioned_p (ops[0], mem)) { gcc_assert (overlap == -1); @@ -10432,17 +13310,95 @@ output_move_neon (rtx *operands) } default: - template = "v%smia%%?\t%%m0, %%h1"; + templ = "v%smia%%?\t%%m0, %%h1"; ops[0] = mem; ops[1] = reg; } - sprintf (buff, template, load ? "ld" : "st"); + sprintf (buff, templ, load ? "ld" : "st"); output_asm_insn (buff, ops); return ""; } +/* Compute and return the length of neon_mov, where is + one of VSTRUCT modes: EI, OI, CI or XI. */ +int +arm_attr_length_move_neon (rtx insn) +{ + rtx reg, mem, addr; + int load; + enum machine_mode mode; + + extract_insn_cached (insn); + + if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1])) + { + mode = GET_MODE (recog_data.operand[0]); + switch (mode) + { + case EImode: + case OImode: + return 8; + case CImode: + return 12; + case XImode: + return 16; + default: + gcc_unreachable (); + } + } + + load = REG_P (recog_data.operand[0]); + reg = recog_data.operand[!load]; + mem = recog_data.operand[load]; + + gcc_assert (MEM_P (mem)); + + mode = GET_MODE (reg); + addr = XEXP (mem, 0); + + /* Strip off const from addresses like (const (plus (...))). 
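/* Editorial note: a standalone model of the register-to-register lengths
   returned by arm_attr_length_move_neon above.  One 4-byte instruction
   moves a quadword (a D-register pair), so the length is 4 * ceil(n/2) for
   the EI/OI/CI/XI structure modes (3, 4, 6 and 8 D registers), giving the
   8/8/12/16 values in the switch.  The memory cases above instead count
   one vldm/vstm per register pair of the address form.  */

static int
neon_reg_move_length_sketch (int ndregs)
{
  return 4 * ((ndregs + 1) / 2);
}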
*/ + if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS) + addr = XEXP (addr, 0); + + if (GET_CODE (addr) == LABEL_REF || GET_CODE (addr) == PLUS) + { + int insns = HARD_REGNO_NREGS (REGNO (reg), mode) / 2; + return insns * 4; + } + else + return 4; +} + +/* Return nonzero if the offset in the address is an immediate. Otherwise, + return zero. */ + +int +arm_address_offset_is_imm (rtx insn) +{ + rtx mem, addr; + + extract_insn_cached (insn); + + if (REG_P (recog_data.operand[0])) + return 0; + + mem = recog_data.operand[0]; + + gcc_assert (MEM_P (mem)); + + addr = XEXP (mem, 0); + + if (GET_CODE (addr) == REG + || (GET_CODE (addr) == PLUS + && GET_CODE (XEXP (addr, 0)) == REG + && GET_CODE (XEXP (addr, 1)) == CONST_INT)) + return 1; + else + return 0; +} + /* Output an ADD r, s, #n where n may be too big for one instruction. If adding zero to one register, output nothing. */ const char * @@ -10749,6 +13705,20 @@ arm_compute_save_reg0_reg12_mask (void) && crtl->uses_pic_offset_table) save_reg_mask |= 1 << PIC_OFFSET_TABLE_REGNUM; } + else if (IS_VOLATILE(func_type)) + { + /* For noreturn functions we historically omitted register saves + altogether. However this really messes up debugging. As a + compromise save just the frame pointers. Combined with the link + register saved elsewhere this should be sufficient to get + a backtrace. */ + if (frame_pointer_needed) + save_reg_mask |= 1 << HARD_FRAME_POINTER_REGNUM; + if (df_regs_ever_live_p (ARM_HARD_FRAME_POINTER_REGNUM)) + save_reg_mask |= 1 << ARM_HARD_FRAME_POINTER_REGNUM; + if (df_regs_ever_live_p (THUMB_HARD_FRAME_POINTER_REGNUM)) + save_reg_mask |= 1 << THUMB_HARD_FRAME_POINTER_REGNUM; + } else { /* In the normal case we only need to save those registers @@ -10793,6 +13763,24 @@ arm_compute_save_reg0_reg12_mask (void) } +/* Compute the number of bytes used to store the static chain register on the + stack, above the stack frame. We need to know this accurately to get the + alignment of the rest of the stack frame correct. */ + +static int arm_compute_static_chain_stack_bytes (void) +{ + unsigned long func_type = arm_current_func_type (); + int static_chain_stack_bytes = 0; + + if (TARGET_APCS_FRAME && frame_pointer_needed && TARGET_ARM && + IS_NESTED (func_type) && + df_regs_ever_live_p (3) && crtl->args.pretend_args_size == 0) + static_chain_stack_bytes = 4; + + return static_chain_stack_bytes; +} + + /* Compute a bit mask of which registers need to be saved on the stack for the current function. This is used by arm_get_frame_offsets, which may add extra registers. */ @@ -10817,11 +13805,6 @@ arm_compute_save_reg_mask (void) | (1 << LR_REGNUM) | (1 << PC_REGNUM); - /* Volatile functions do not return, so there - is no need to save any other registers. */ - if (IS_VOLATILE (func_type)) - return save_reg_mask; - save_reg_mask |= arm_compute_save_reg0_reg12_mask (); /* Decide if we need to save the link register. @@ -10845,7 +13828,9 @@ arm_compute_save_reg_mask (void) if (TARGET_REALLY_IWMMXT && ((bit_count (save_reg_mask) - + ARM_NUM_INTS (crtl->args.pretend_args_size)) % 2) != 0) + + ARM_NUM_INTS (crtl->args.pretend_args_size + + arm_compute_static_chain_stack_bytes()) + ) % 2) != 0) { /* The total number of registers that are going to be pushed onto the stack is odd. We need to ensure that the stack @@ -10930,6 +13915,26 @@ thumb1_compute_save_reg_mask (void) mask |= 1 << reg; } + /* The 504 below is 8 bytes less than 512 because there are two possible + alignment words. 
We can't tell here if they will be present or not so we + have to play it safe and assume that they are. */ + if ((CALLER_INTERWORKING_SLOT_SIZE + + ROUND_UP_WORD (get_frame_size ()) + + crtl->outgoing_args_size) >= 504) + { + /* This is the same as the code in thumb1_expand_prologue() which + determines which register to use for stack decrement. */ + for (reg = LAST_ARG_REGNUM + 1; reg <= LAST_LO_REGNUM; reg++) + if (mask & (1 << reg)) + break; + + if (reg > LAST_LO_REGNUM) + { + /* Make sure we have a register available for stack decrement. */ + mask |= 1 << LAST_LO_REGNUM; + } + } + return mask; } @@ -11018,7 +14023,7 @@ output_return_instruction (rtx operand, int really_return, int reverse) sprintf (conditional, "%%?%%%c0", reverse ? 'D' : 'd'); - return_used_this_function = 1; + cfun->machine->return_used_this_function = 1; offsets = arm_get_frame_offsets (); live_regs_mask = offsets->saved_regs_mask; @@ -11087,18 +14092,28 @@ output_return_instruction (rtx operand, int really_return, int reverse) gcc_assert (stack_adjust == 0 || stack_adjust == 4); if (stack_adjust && arm_arch5 && TARGET_ARM) - sprintf (instr, "ldm%sib\t%%|sp, {", conditional); + if (TARGET_UNIFIED_ASM) + sprintf (instr, "ldmib%s\t%%|sp, {", conditional); + else + sprintf (instr, "ldm%sib\t%%|sp, {", conditional); else { /* If we can't use ldmib (SA110 bug), then try to pop r3 instead. */ if (stack_adjust) live_regs_mask |= 1 << 3; - sprintf (instr, "ldm%sfd\t%%|sp, {", conditional); + + if (TARGET_UNIFIED_ASM) + sprintf (instr, "ldmfd%s\t%%|sp, {", conditional); + else + sprintf (instr, "ldm%sfd\t%%|sp, {", conditional); } } else - sprintf (instr, "ldm%sfd\t%%|sp!, {", conditional); + if (TARGET_UNIFIED_ASM) + sprintf (instr, "pop%s\t{", conditional); + else + sprintf (instr, "ldm%sfd\t%%|sp!, {", conditional); p = instr + strlen (instr); @@ -11283,7 +14298,6 @@ arm_output_function_prologue (FILE *f, HOST_WIDE_INT frame_size) if (crtl->calls_eh_return) asm_fprintf (f, "\t@ Calls __builtin_eh_return.\n"); - return_used_this_function = 0; } const char * @@ -11304,7 +14318,8 @@ arm_output_epilogue (rtx sibling) /* If we have already generated the return instruction then it is futile to generate anything else. */ - if (use_return_insn (FALSE, sibling) && return_used_this_function) + if (use_return_insn (FALSE, sibling) && + (cfun->machine->return_used_this_function != 0)) return ""; func_type = arm_current_func_type (); @@ -11346,7 +14361,7 @@ arm_output_epilogue (rtx sibling) /* This variable is for the Virtual Frame Pointer, not VFP regs. */ int vfp_offset = offsets->frame; - if (arm_fpu_arch == FPUTYPE_FPA_EMU2) + if (TARGET_FPA_EMU2) { for (reg = LAST_FPA_REGNUM; reg >= FIRST_FPA_REGNUM; reg--) if (df_regs_ever_live_p (reg) && !call_used_regs[reg]) @@ -11504,7 +14519,7 @@ arm_output_epilogue (rtx sibling) (where frame pointer is required to point at first register) and ARM-non-apcs-frame. Therefore, such change is postponed until real need arise. */ - HOST_WIDE_INT amount; + unsigned HOST_WIDE_INT amount; int rfe; /* Restore stack pointer if necessary. */ if (TARGET_ARM && frame_pointer_needed) @@ -11541,7 +14556,8 @@ arm_output_epilogue (rtx sibling) && !crtl->tail_call_emit) { unsigned long mask; - mask = (1 << (arm_size_return_regs() / 4)) - 1; + /* Preserve return values, of any size. 
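/* Editorial note: a standalone sketch of the mask set up just below for the
   epilogue stack adjustment.  The return value occupies the first
   ceil(size/4) of r0-r3, so only the remaining argument registers (that are
   not already being restored) may be popped into.  */

static unsigned
free_arg_regs_sketch (int return_size_bytes, unsigned saved_regs_mask)
{
  unsigned live = (1u << ((return_size_bytes + 3) / 4)) - 1; /* regs holding the value */
  unsigned mask = live ^ 0xf;                                /* the rest of r0-r3 */
  return mask & ~saved_regs_mask;
}

/* Example: an 8-byte return value lives in r0/r1, leaving r2/r3 available
   unless they are already in saved_regs_mask.  */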
*/ + mask = (1 << ((arm_size_return_regs() + 3) / 4)) - 1; mask ^= 0xf; mask &= ~saved_regs_mask; reg = 0; @@ -11569,7 +14585,7 @@ arm_output_epilogue (rtx sibling) SP_REGNUM, HARD_FRAME_POINTER_REGNUM); } - if (arm_fpu_arch == FPUTYPE_FPA_EMU2) + if (TARGET_FPA_EMU2) { for (reg = FIRST_FPA_REGNUM; reg <= LAST_FPA_REGNUM; reg++) if (df_regs_ever_live_p (reg) && !call_used_regs[reg]) @@ -11610,24 +14626,29 @@ arm_output_epilogue (rtx sibling) if (TARGET_HARD_FLOAT && TARGET_VFP) { - start_reg = FIRST_VFP_REGNUM; - for (reg = FIRST_VFP_REGNUM; reg < LAST_VFP_REGNUM; reg += 2) + int end_reg = LAST_VFP_REGNUM + 1; + + /* Scan the registers in reverse order. We need to match + any groupings made in the prologue and generate matching + pop operations. */ + for (reg = LAST_VFP_REGNUM - 1; reg >= FIRST_VFP_REGNUM; reg -= 2) { if ((!df_regs_ever_live_p (reg) || call_used_regs[reg]) - && (!df_regs_ever_live_p (reg + 1) || call_used_regs[reg + 1])) + && (!df_regs_ever_live_p (reg + 1) + || call_used_regs[reg + 1])) { - if (start_reg != reg) + if (end_reg > reg + 2) vfp_output_fldmd (f, SP_REGNUM, - (start_reg - FIRST_VFP_REGNUM) / 2, - (reg - start_reg) / 2); - start_reg = reg + 2; + (reg + 2 - FIRST_VFP_REGNUM) / 2, + (end_reg - (reg + 2)) / 2); + end_reg = reg; } } - if (start_reg != reg) - vfp_output_fldmd (f, SP_REGNUM, - (start_reg - FIRST_VFP_REGNUM) / 2, - (reg - start_reg) / 2); + if (end_reg > reg + 2) + vfp_output_fldmd (f, SP_REGNUM, 0, + (end_reg - (reg + 2)) / 2); } + if (TARGET_IWMMXT) for (reg = FIRST_IWMMXT_REGNUM; reg <= LAST_IWMMXT_REGNUM; reg++) if (df_regs_ever_live_p (reg) && !call_used_regs[reg]) @@ -11751,7 +14772,7 @@ arm_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED, /* ??? Probably not safe to set this here, since it assumes that a function will be emitted as assembly immediately after we generate RTL for it. This does not happen for inline functions. */ - return_used_this_function = 0; + cfun->machine->return_used_this_function = 0; } else /* TARGET_32BIT */ { @@ -11759,7 +14780,7 @@ arm_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED, offsets = arm_get_frame_offsets (); gcc_assert (!use_return_insn (FALSE, NULL) - || !return_used_this_function + || (cfun->machine->return_used_this_function != 0) || offsets->saved_regs == offsets->outgoing_args || frame_pointer_needed); @@ -11796,16 +14817,17 @@ emit_multi_reg_push (unsigned long mask) /* For the body of the insn we are going to generate an UNSPEC in parallel with several USEs. This allows the insn to be recognized - by the push_multi pattern in the arm.md file. The insn looks - something like this: + by the push_multi pattern in the arm.md file. + + The body of the insn looks something like this: (parallel [ - (set (mem:BLK (pre_dec:BLK (reg:SI sp))) + (set (mem:BLK (pre_modify:SI (reg:SI sp) + (const_int:SI ))) (unspec:BLK [(reg:SI r4)] UNSPEC_PUSH_MULT)) - (use (reg:SI 11 fp)) - (use (reg:SI 12 ip)) - (use (reg:SI 14 lr)) - (use (reg:SI 15 pc)) + (use (reg:SI XX)) + (use (reg:SI YY)) + ... 
]) For the frame note however, we try to be more explicit and actually @@ -11818,13 +14840,20 @@ emit_multi_reg_push (unsigned long mask) (sequence [ (set (reg:SI sp) (plus:SI (reg:SI sp) (const_int -20))) (set (mem:SI (reg:SI sp)) (reg:SI r4)) - (set (mem:SI (plus:SI (reg:SI sp) (const_int 4))) (reg:SI fp)) - (set (mem:SI (plus:SI (reg:SI sp) (const_int 8))) (reg:SI ip)) - (set (mem:SI (plus:SI (reg:SI sp) (const_int 12))) (reg:SI lr)) + (set (mem:SI (plus:SI (reg:SI sp) (const_int 4))) (reg:SI XX)) + (set (mem:SI (plus:SI (reg:SI sp) (const_int 8))) (reg:SI YY)) + ... ]) - This sequence is used both by the code to support stack unwinding for - exceptions handlers and the code to generate dwarf2 frame debugging. */ + FIXME:: In an ideal world the PRE_MODIFY would not exist and + instead we'd have a parallel expression detailing all + the stores to the various memory addresses so that debug + information is more up-to-date. Remember however while writing + this to take care of the constraints with the push instruction. + + Note also that this has to be taken care of for the VFP registers. + + For more see PR43399. */ par = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (num_regs)); dwarf = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (num_dwarf_regs + 1)); @@ -11838,9 +14867,14 @@ emit_multi_reg_push (unsigned long mask) XVECEXP (par, 0, 0) = gen_rtx_SET (VOIDmode, - gen_frame_mem (BLKmode, - gen_rtx_PRE_DEC (BLKmode, - stack_pointer_rtx)), + gen_frame_mem + (BLKmode, + gen_rtx_PRE_MODIFY (Pmode, + stack_pointer_rtx, + plus_constant + (stack_pointer_rtx, + -4 * num_regs)) + ), gen_rtx_UNSPEC (BLKmode, gen_rtvec (1, reg), UNSPEC_PUSH_MULT)); @@ -11871,9 +14905,10 @@ emit_multi_reg_push (unsigned long mask) { tmp = gen_rtx_SET (VOIDmode, - gen_frame_mem (SImode, - plus_constant (stack_pointer_rtx, - 4 * j)), + gen_frame_mem + (SImode, + plus_constant (stack_pointer_rtx, + 4 * j)), reg); RTX_FRAME_RELATED_P (tmp) = 1; XVECEXP (dwarf, 0, dwarf_par_index++) = tmp; @@ -11891,8 +14926,8 @@ emit_multi_reg_push (unsigned long mask) RTX_FRAME_RELATED_P (tmp) = 1; XVECEXP (dwarf, 0, 0) = tmp; - REG_NOTES (par) = gen_rtx_EXPR_LIST (REG_FRAME_RELATED_EXPR, dwarf, - REG_NOTES (par)); + add_reg_note (par, REG_FRAME_RELATED_EXPR, dwarf); + return par; } @@ -11925,9 +14960,14 @@ emit_sfm (int base_reg, int count) XVECEXP (par, 0, 0) = gen_rtx_SET (VOIDmode, - gen_frame_mem (BLKmode, - gen_rtx_PRE_DEC (BLKmode, - stack_pointer_rtx)), + gen_frame_mem + (BLKmode, + gen_rtx_PRE_MODIFY (Pmode, + stack_pointer_rtx, + plus_constant + (stack_pointer_rtx, + -12 * count)) + ), gen_rtx_UNSPEC (BLKmode, gen_rtvec (1, reg), UNSPEC_PUSH_MULT)); @@ -11958,8 +14998,8 @@ emit_sfm (int base_reg, int count) XVECEXP (dwarf, 0, 0) = tmp; par = emit_insn (par); - REG_NOTES (par) = gen_rtx_EXPR_LIST (REG_FRAME_RELATED_EXPR, dwarf, - REG_NOTES (par)); + add_reg_note (par, REG_FRAME_RELATED_EXPR, dwarf); + return par; } @@ -12065,7 +15105,8 @@ arm_get_frame_offsets (void) offsets->saved_args = crtl->args.pretend_args_size; /* In Thumb mode this is incorrect, but never used. */ - offsets->frame = offsets->saved_args + (frame_pointer_needed ? 4 : 0); + offsets->frame = offsets->saved_args + (frame_pointer_needed ? 4 : 0) + + arm_compute_static_chain_stack_bytes(); if (TARGET_32BIT) { @@ -12112,7 +15153,8 @@ arm_get_frame_offsets (void) } /* Saved registers include the stack frame. 
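/* Editorial note: a standalone sketch of the stack layout recorded by the
   REG_FRAME_RELATED_EXPR note built in emit_multi_reg_push above: the stack
   pointer drops by 4 * num_regs in a single PRE_MODIFY, and the pushed
   registers land at ascending offsets in ascending register-number order.  */

#include <stdio.h>

static void
dump_push_layout_sketch (unsigned long mask)
{
  int num_regs = __builtin_popcountl (mask);
  int slot = 0;

  printf ("sp <- sp - %d\n", 4 * num_regs);
  for (int regno = 0; regno <= 15; regno++)
    if (mask & (1ul << regno))
      printf ("  [sp, #%d] <- r%d\n", 4 * slot++, regno);
}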
*/ - offsets->saved_regs = offsets->saved_args + saved; + offsets->saved_regs = offsets->saved_args + saved + + arm_compute_static_chain_stack_bytes(); offsets->soft_frame = offsets->saved_regs + CALLER_INTERWORKING_SLOT_SIZE; /* A leaf function does not need any stack alignment if it has nothing on the stack. */ @@ -12135,22 +15177,24 @@ arm_get_frame_offsets (void) { int reg = -1; - for (i = 4; i <= (TARGET_THUMB1 ? LAST_LO_REGNUM : 11); i++) - { - if ((offsets->saved_regs_mask & (1 << i)) == 0) - { - reg = i; - break; - } - } - - if (reg == -1 && arm_size_return_regs () <= 12 - && !crtl->tail_call_emit) + /* If it is safe to use r3, then do so. This sometimes + generates better code on Thumb-2 by avoiding the need to + use 32-bit push/pop instructions. */ + if (!crtl->tail_call_emit + && arm_size_return_regs () <= 12 + && (offsets->saved_regs_mask & (1 << 3)) == 0) { - /* Push/pop an argument register (r3) if all callee saved - registers are already being pushed. */ reg = 3; } + else + for (i = 4; i <= (TARGET_THUMB1 ? LAST_LO_REGNUM : 11); i++) + { + if ((offsets->saved_regs_mask & (1 << i)) == 0) + { + reg = i; + break; + } + } if (reg != -1) { @@ -12204,14 +15248,9 @@ arm_compute_initial_elimination_offset (unsigned int from, unsigned int to) return offsets->soft_frame - offsets->saved_args; case ARM_HARD_FRAME_POINTER_REGNUM: - /* If there is no stack frame then the hard - frame pointer and the arg pointer coincide. */ - if (offsets->frame == offsets->saved_regs) - return 0; - /* FIXME: Not sure about this. Maybe we should always return 0 ? */ - return (frame_pointer_needed - && cfun->static_chain_decl != NULL - && ! cfun->machine->uses_anonymous_args) ? 4 : 0; + /* This is only non-zero in the case where the static chain register + is stored above the frame. */ + return offsets->frame - offsets->saved_args - 4; case STACK_POINTER_REGNUM: /* If nothing has been pushed on the stack at all @@ -12255,6 +15294,24 @@ arm_compute_initial_elimination_offset (unsigned int from, unsigned int to) } } +/* Given FROM and TO register numbers, say whether this elimination is + allowed. Frame pointer elimination is automatically handled. + + All eliminations are permissible. Note that ARG_POINTER_REGNUM and + HARD_FRAME_POINTER_REGNUM are in fact the same thing. If we need a frame + pointer, we must eliminate FRAME_POINTER_REGNUM into + HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM or + ARG_POINTER_REGNUM. */ + +bool +arm_can_eliminate (const int from, const int to) +{ + return ((to == FRAME_POINTER_REGNUM && from == ARG_POINTER_REGNUM) ? false : + (to == STACK_POINTER_REGNUM && frame_pointer_needed) ? false : + (to == ARM_HARD_FRAME_POINTER_REGNUM && TARGET_THUMB) ? false : + (to == THUMB_HARD_FRAME_POINTER_REGNUM && TARGET_ARM) ? false : + true); +} /* Emit RTL to save coprocessor registers on function entry. Returns the number of bytes pushed. */ @@ -12270,7 +15327,7 @@ arm_save_coproc_regs(void) for (reg = LAST_IWMMXT_REGNUM; reg >= FIRST_IWMMXT_REGNUM; reg--) if (df_regs_ever_live_p (reg) && ! call_used_regs[reg]) { - insn = gen_rtx_PRE_DEC (V2SImode, stack_pointer_rtx); + insn = gen_rtx_PRE_DEC (Pmode, stack_pointer_rtx); insn = gen_rtx_MEM (V2SImode, insn); insn = emit_set_insn (insn, gen_rtx_REG (V2SImode, reg)); RTX_FRAME_RELATED_P (insn) = 1; @@ -12279,12 +15336,12 @@ arm_save_coproc_regs(void) /* Save any floating point call-saved registers used by this function. 
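/* Editorial note: a standalone restatement of
   arm_compute_static_chain_stack_bytes and of how its result feeds the
   frame offsets above.  The booleans stand in for the target macros and
   dataflow queries; the point is that when r3 is not free to hold the
   static chain register, one extra word is reserved just above the frame
   and must be counted for alignment.  */

#include <stdbool.h>

static int
static_chain_stack_bytes_sketch (bool apcs_frame, bool frame_pointer_needed,
                                 bool arm_mode, bool nested_function,
                                 bool r3_ever_live, int pretend_args_size)
{
  if (apcs_frame && frame_pointer_needed && arm_mode
      && nested_function && r3_ever_live && pretend_args_size == 0)
    return 4;   /* the static chain register is pushed above the frame */
  return 0;
}

/* offsets->frame and offsets->saved_regs both add this value, which keeps
   the 8-byte alignment computation of the rest of the frame correct.  */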
*/ - if (arm_fpu_arch == FPUTYPE_FPA_EMU2) + if (TARGET_FPA_EMU2) { for (reg = LAST_FPA_REGNUM; reg >= FIRST_FPA_REGNUM; reg--) if (df_regs_ever_live_p (reg) && !call_used_regs[reg]) { - insn = gen_rtx_PRE_DEC (XFmode, stack_pointer_rtx); + insn = gen_rtx_PRE_DEC (Pmode, stack_pointer_rtx); insn = gen_rtx_MEM (XFmode, insn); insn = emit_set_insn (insn, gen_rtx_REG (XFmode, reg)); RTX_FRAME_RELATED_P (insn) = 1; @@ -12381,8 +15438,7 @@ thumb_set_frame_pointer (arm_stack_offsets *offsets) dwarf = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx, plus_constant (stack_pointer_rtx, amount)); RTX_FRAME_RELATED_P (dwarf) = 1; - REG_NOTES (insn) = gen_rtx_EXPR_LIST (REG_FRAME_RELATED_EXPR, dwarf, - REG_NOTES (insn)); + add_reg_note (insn, REG_FRAME_RELATED_EXPR, dwarf); } RTX_FRAME_RELATED_P (insn) = 1; @@ -12439,12 +15495,13 @@ arm_expand_prologue (void) r0 = gen_rtx_REG (SImode, 0); r1 = gen_rtx_REG (SImode, 1); - dwarf = gen_rtx_UNSPEC (SImode, NULL_RTVEC, UNSPEC_STACK_ALIGN); + /* Use a real rtvec rather than NULL_RTVEC so the rest of the + compiler won't choke. */ + dwarf = gen_rtx_UNSPEC (SImode, rtvec_alloc (0), UNSPEC_STACK_ALIGN); dwarf = gen_rtx_SET (VOIDmode, r0, dwarf); insn = gen_movsi (r0, stack_pointer_rtx); RTX_FRAME_RELATED_P (insn) = 1; - REG_NOTES (insn) = gen_rtx_EXPR_LIST (REG_FRAME_RELATED_EXPR, - dwarf, REG_NOTES (insn)); + add_reg_note (insn, REG_FRAME_RELATED_EXPR, dwarf); emit_insn (insn); emit_insn (gen_andsi3 (r1, r0, GEN_INT (~(HOST_WIDE_INT)7))); emit_insn (gen_movsi (stack_pointer_rtx, r1)); @@ -12499,6 +15556,9 @@ arm_expand_prologue (void) { rtx dwarf; + gcc_assert(arm_compute_static_chain_stack_bytes() == 4); + saved_regs += 4; + insn = gen_rtx_PRE_DEC (SImode, stack_pointer_rtx); insn = emit_set_insn (gen_frame_mem (SImode, insn), ip_rtx); fp_offset = 4; @@ -12508,8 +15568,7 @@ arm_expand_prologue (void) plus_constant (stack_pointer_rtx, -fp_offset)); RTX_FRAME_RELATED_P (insn) = 1; - REG_NOTES (insn) = gen_rtx_EXPR_LIST (REG_FRAME_RELATED_EXPR, - dwarf, REG_NOTES (insn)); + add_reg_note (insn, REG_FRAME_RELATED_EXPR, dwarf); } else { @@ -12677,7 +15736,8 @@ arm_expand_prologue (void) using the EABI unwinder, to prevent faulting instructions from being swapped with a stack adjustment. */ if (crtl->profile || !TARGET_SCHED_PROLOG - || (ARM_EABI_UNWIND_TABLES && flag_non_call_exceptions)) + || (arm_except_unwind_info () == UI_TARGET + && cfun->can_throw_non_call_exceptions)) emit_insn (gen_blockage ()); /* If the link register is being kept alive, with the return address in it, @@ -12736,7 +15796,7 @@ arm_print_condition (FILE *stream) before output. If CODE is 'B' then output a bitwise inverted value of X (a const int). If X is a REG and CODE is `M', output a ldm/stm style multi-reg. */ -void +static void arm_print_operand (FILE *stream, rtx x, int code) { switch (code) @@ -12805,15 +15865,26 @@ arm_print_operand (FILE *stream, rtx x, int code) { REAL_VALUE_TYPE r; REAL_VALUE_FROM_CONST_DOUBLE (r, x); - r = REAL_VALUE_NEGATE (r); + r = real_value_negate (&r); fprintf (stream, "%s", fp_const_from_val (&r)); } return; - /* An integer without a preceding # sign. */ + /* An integer or symbol address without a preceding # sign. 
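/* Editorial note: a standalone sketch of the stack realignment emitted in
   arm_expand_prologue above (copy sp, clear the low three bits, install the
   result), i.e. rounding the incoming stack pointer down to an 8-byte
   boundary with a single AND.  */

#include <stdint.h>

static uintptr_t
realign_sp_sketch (uintptr_t sp)
{
  return sp & ~(uintptr_t) 7;   /* matches the andsi3 with ~7 above */
}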
*/ case 'c': - gcc_assert (GET_CODE (x) == CONST_INT); - fprintf (stream, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); + switch (GET_CODE (x)) + { + case CONST_INT: + fprintf (stream, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); + break; + + case SYMBOL_REF: + output_addr_const (stream, x); + break; + + default: + gcc_unreachable (); + } return; case 'B': @@ -12903,8 +15974,18 @@ arm_print_operand (FILE *stream, rtx x, int code) the value being loaded is big-wordian or little-wordian. The order of the two register loads can matter however, if the address of the memory location is actually held in one of the registers - being overwritten by the load. */ + being overwritten by the load. + + The 'Q' and 'R' constraints are also available for 64-bit + constants. */ case 'Q': + if (GET_CODE (x) == CONST_INT || GET_CODE (x) == CONST_DOUBLE) + { + rtx part = gen_lowpart (SImode, x); + fprintf (stream, "#" HOST_WIDE_INT_PRINT_DEC, INTVAL (part)); + return; + } + if (GET_CODE (x) != REG || REGNO (x) > LAST_ARM_REGNUM) { output_operand_lossage ("invalid operand for code '%c'", code); @@ -12915,6 +15996,18 @@ arm_print_operand (FILE *stream, rtx x, int code) return; case 'R': + if (GET_CODE (x) == CONST_INT || GET_CODE (x) == CONST_DOUBLE) + { + enum machine_mode mode = GET_MODE (x); + rtx part; + + if (mode == VOIDmode) + mode = DImode; + part = gen_highpart_mode (SImode, mode, x); + fprintf (stream, "#" HOST_WIDE_INT_PRINT_DEC, INTVAL (part)); + return; + } + if (GET_CODE (x) != REG || REGNO (x) > LAST_ARM_REGNUM) { output_operand_lossage ("invalid operand for code '%c'", code); @@ -13091,6 +16184,30 @@ arm_print_operand (FILE *stream, rtx x, int code) } return; + /* Print the high single-precision register of a VFP double-precision + register. */ + case 'p': + { + int mode = GET_MODE (x); + int regno; + + if (GET_MODE_SIZE (mode) != 8 || GET_CODE (x) != REG) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + regno = REGNO (x); + if (!VFP_REGNO_OK_FOR_DOUBLE (regno)) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + fprintf (stream, "s%d", regno - FIRST_VFP_REGNUM + 1); + } + return; + /* Print a VFP/Neon double precision or quad precision register name. */ case 'P': case 'q': @@ -13208,6 +16325,108 @@ arm_print_operand (FILE *stream, rtx x, int code) } return; + /* Memory operand for vld1/vst1 instruction. */ + case 'A': + { + rtx addr; + bool postinc = FALSE; + unsigned align, modesize, align_bits; + + gcc_assert (GET_CODE (x) == MEM); + addr = XEXP (x, 0); + if (GET_CODE (addr) == POST_INC) + { + postinc = 1; + addr = XEXP (addr, 0); + } + asm_fprintf (stream, "[%r", REGNO (addr)); + + /* We know the alignment of this access, so we can emit a hint in the + instruction (for some alignments) as an aid to the memory subsystem + of the target. */ + align = MEM_ALIGN (x) >> 3; + modesize = GET_MODE_SIZE (GET_MODE (x)); + + /* Only certain alignment specifiers are supported by the hardware. 
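/* Editorial note: a standalone sketch of what the 'Q' and 'R' operand codes
   print for a 64-bit constant, per the new CONST_INT/CONST_DOUBLE handling
   above: the low and high 32-bit words respectively (register operands
   instead print the matching half of the register pair).  */

#include <stdint.h>
#include <stdio.h>

static void
print_const64_halves_sketch (uint64_t v)
{
  printf ("Q -> #%lu\n", (unsigned long) (uint32_t) v);         /* low word, cf. gen_lowpart */
  printf ("R -> #%lu\n", (unsigned long) (uint32_t) (v >> 32)); /* high word, cf. gen_highpart_mode */
}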
*/ + if (modesize == 16 && (align % 32) == 0) + align_bits = 256; + else if ((modesize == 8 || modesize == 16) && (align % 16) == 0) + align_bits = 128; + else if ((align % 8) == 0) + align_bits = 64; + else + align_bits = 0; + + if (align_bits != 0) + asm_fprintf (stream, ":%d", align_bits); + + asm_fprintf (stream, "]"); + + if (postinc) + fputs("!", stream); + } + return; + + case 'C': + { + rtx addr; + + gcc_assert (GET_CODE (x) == MEM); + addr = XEXP (x, 0); + gcc_assert (GET_CODE (addr) == REG); + asm_fprintf (stream, "[%r]", REGNO (addr)); + } + return; + + /* Translate an S register number into a D register number and element index. */ + case 'y': + { + int mode = GET_MODE (x); + int regno; + + if (GET_MODE_SIZE (mode) != 4 || GET_CODE (x) != REG) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + regno = REGNO (x); + if (!VFP_REGNO_OK_FOR_SINGLE (regno)) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + regno = regno - FIRST_VFP_REGNUM; + fprintf (stream, "d%d[%d]", regno / 2, regno % 2); + } + return; + + /* Register specifier for vld1.16/vst1.16. Translate the S register + number into a D register number and element index. */ + case 'z': + { + int mode = GET_MODE (x); + int regno; + + if (GET_MODE_SIZE (mode) != 2 || GET_CODE (x) != REG) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + regno = REGNO (x); + if (!VFP_REGNO_OK_FOR_SINGLE (regno)) + { + output_operand_lossage ("invalid operand for code '%c'", code); + return; + } + + regno = regno - FIRST_VFP_REGNUM; + fprintf (stream, "d%d[%d]", regno/2, ((regno % 2) ? 2 : 0)); + } + return; + default: if (x == 0) { @@ -13241,12 +16460,152 @@ arm_print_operand (FILE *stream, rtx x, int code) default: gcc_assert (GET_CODE (x) != NEG); fputc ('#', stream); + if (GET_CODE (x) == HIGH) + { + fputs (":lower16:", stream); + x = XEXP (x, 0); + } + output_addr_const (stream, x); break; } } } +/* Target hook for printing a memory address. */ +static void +arm_print_operand_address (FILE *stream, rtx x) +{ + if (TARGET_32BIT) + { + int is_minus = GET_CODE (x) == MINUS; + + if (GET_CODE (x) == REG) + asm_fprintf (stream, "[%r, #0]", REGNO (x)); + else if (GET_CODE (x) == PLUS || is_minus) + { + rtx base = XEXP (x, 0); + rtx index = XEXP (x, 1); + HOST_WIDE_INT offset = 0; + if (GET_CODE (base) != REG + || (GET_CODE (index) == REG && REGNO (index) == SP_REGNUM)) + { + /* Ensure that BASE is a register. */ + /* (one of them must be). */ + /* Also ensure the SP is not used as in index register. */ + rtx temp = base; + base = index; + index = temp; + } + switch (GET_CODE (index)) + { + case CONST_INT: + offset = INTVAL (index); + if (is_minus) + offset = -offset; + asm_fprintf (stream, "[%r, #%wd]", + REGNO (base), offset); + break; + + case REG: + asm_fprintf (stream, "[%r, %s%r]", + REGNO (base), is_minus ? "-" : "", + REGNO (index)); + break; + + case MULT: + case ASHIFTRT: + case LSHIFTRT: + case ASHIFT: + case ROTATERT: + { + asm_fprintf (stream, "[%r, %s%r", + REGNO (base), is_minus ? 
"-" : "", + REGNO (XEXP (index, 0))); + arm_print_operand (stream, index, 'S'); + fputs ("]", stream); + break; + } + + default: + gcc_unreachable (); + } + } + else if (GET_CODE (x) == PRE_INC || GET_CODE (x) == POST_INC + || GET_CODE (x) == PRE_DEC || GET_CODE (x) == POST_DEC) + { + extern enum machine_mode output_memory_reference_mode; + + gcc_assert (GET_CODE (XEXP (x, 0)) == REG); + + if (GET_CODE (x) == PRE_DEC || GET_CODE (x) == PRE_INC) + asm_fprintf (stream, "[%r, #%s%d]!", + REGNO (XEXP (x, 0)), + GET_CODE (x) == PRE_DEC ? "-" : "", + GET_MODE_SIZE (output_memory_reference_mode)); + else + asm_fprintf (stream, "[%r], #%s%d", + REGNO (XEXP (x, 0)), + GET_CODE (x) == POST_DEC ? "-" : "", + GET_MODE_SIZE (output_memory_reference_mode)); + } + else if (GET_CODE (x) == PRE_MODIFY) + { + asm_fprintf (stream, "[%r, ", REGNO (XEXP (x, 0))); + if (GET_CODE (XEXP (XEXP (x, 1), 1)) == CONST_INT) + asm_fprintf (stream, "#%wd]!", + INTVAL (XEXP (XEXP (x, 1), 1))); + else + asm_fprintf (stream, "%r]!", + REGNO (XEXP (XEXP (x, 1), 1))); + } + else if (GET_CODE (x) == POST_MODIFY) + { + asm_fprintf (stream, "[%r], ", REGNO (XEXP (x, 0))); + if (GET_CODE (XEXP (XEXP (x, 1), 1)) == CONST_INT) + asm_fprintf (stream, "#%wd", + INTVAL (XEXP (XEXP (x, 1), 1))); + else + asm_fprintf (stream, "%r", + REGNO (XEXP (XEXP (x, 1), 1))); + } + else output_addr_const (stream, x); + } + else + { + if (GET_CODE (x) == REG) + asm_fprintf (stream, "[%r]", REGNO (x)); + else if (GET_CODE (x) == POST_INC) + asm_fprintf (stream, "%r!", REGNO (XEXP (x, 0))); + else if (GET_CODE (x) == PLUS) + { + gcc_assert (GET_CODE (XEXP (x, 0)) == REG); + if (GET_CODE (XEXP (x, 1)) == CONST_INT) + asm_fprintf (stream, "[%r, #%wd]", + REGNO (XEXP (x, 0)), + INTVAL (XEXP (x, 1))); + else + asm_fprintf (stream, "[%r, %r]", + REGNO (XEXP (x, 0)), + REGNO (XEXP (x, 1))); + } + else + output_addr_const (stream, x); + } +} + +/* Target hook for indicating whether a punctuation character for + TARGET_PRINT_OPERAND is valid. */ +static bool +arm_print_operand_punct_valid_p (unsigned char code) +{ + return (code == '@' || code == '|' || code == '.' + || code == '(' || code == ')' || code == '#' + || (TARGET_32BIT && (code == '?')) + || (TARGET_THUMB2 && (code == '!')) + || (TARGET_THUMB && (code == '_'))); +} + /* Target hook for assembling integer objects. The ARM version needs to handle word-sized values specially. */ static bool @@ -13281,28 +16640,16 @@ arm_assemble_integer (rtx x, unsigned int size, int aligned_p) if (arm_vector_mode_supported_p (mode)) { int i, units; - unsigned int invmask = 0, parts_per_word; gcc_assert (GET_CODE (x) == CONST_VECTOR); units = CONST_VECTOR_NUNITS (x); size = GET_MODE_SIZE (GET_MODE_INNER (mode)); - /* For big-endian Neon vectors, we must permute the vector to the form - which, when loaded by a VLDR or VLDM instruction, will give a vector - with the elements in the right order. */ - if (TARGET_NEON && WORDS_BIG_ENDIAN) - { - parts_per_word = UNITS_PER_WORD / size; - /* FIXME: This might be wrong for 64-bit vector elements, but we don't - support those anywhere yet. */ - invmask = (parts_per_word == 0) ? 0 : (1 << (parts_per_word - 1)) - 1; - } - if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) for (i = 0; i < units; i++) { - rtx elt = CONST_VECTOR_ELT (x, i ^ invmask); + rtx elt = CONST_VECTOR_ELT (x, i); assemble_integer (elt, size, i == 0 ? 
BIGGEST_ALIGNMENT : size * BITS_PER_UNIT, 1); } @@ -13420,7 +16767,7 @@ static enum arm_cond_code get_arm_condition_code (rtx comparison) { enum machine_mode mode = GET_MODE (XEXP (comparison, 0)); - int code; + enum arm_cond_code code; enum rtx_code comp_code = GET_CODE (comparison); if (GET_MODE_CLASS (mode) != MODE_CC) @@ -13516,11 +16863,33 @@ get_arm_condition_code (rtx comparison) case CC_Cmode: switch (comp_code) - { - case LTU: return ARM_CS; - case GEU: return ARM_CC; - default: gcc_unreachable (); - } + { + case LTU: return ARM_CS; + case GEU: return ARM_CC; + default: gcc_unreachable (); + } + + case CC_CZmode: + switch (comp_code) + { + case NE: return ARM_NE; + case EQ: return ARM_EQ; + case GEU: return ARM_CS; + case GTU: return ARM_HI; + case LEU: return ARM_LS; + case LTU: return ARM_CC; + default: gcc_unreachable (); + } + + case CC_NCVmode: + switch (comp_code) + { + case GE: return ARM_GE; + case LT: return ARM_LT; + case GEU: return ARM_CS; + case LTU: return ARM_CC; + default: gcc_unreachable (); + } case CCmode: switch (comp_code) @@ -13631,12 +17000,6 @@ arm_final_prescan_insn (rtx insn) reversed if it appears to fail. */ int reverse = 0; - /* JUMP_CLOBBERS will be one implies that the conditions if a branch is - taken are clobbered, even if the rtl suggests otherwise. It also - means that we have to grub around within the jump expression to find - out what the conditions are when the jump isn't taken. */ - int jump_clobbers = 0; - /* If we start with a return insn, we only succeed if we find another one. */ int seeking_return = 0; @@ -13715,14 +17078,6 @@ arm_final_prescan_insn (rtx insn) int then_not_else = TRUE; rtx this_insn = start_insn, label = 0; - /* If the jump cannot be done with one instruction, we cannot - conditionally execute the instruction in the inverse case. */ - if (get_attr_conds (insn) == CONDS_JUMP_CLOB) - { - jump_clobbers = 1; - return; - } - /* Register the insn jumped to. */ if (reverse) { @@ -13765,13 +17120,7 @@ arm_final_prescan_insn (rtx insn) control falls in from somewhere else. */ if (this_insn == label) { - if (jump_clobbers) - { - arm_ccfsm_state = 2; - this_insn = next_nonnote_insn (this_insn); - } - else - arm_ccfsm_state = 1; + arm_ccfsm_state = 1; succeed = TRUE; } else @@ -13786,13 +17135,7 @@ arm_final_prescan_insn (rtx insn) this_insn = next_nonnote_insn (this_insn); if (this_insn && this_insn == label) { - if (jump_clobbers) - { - arm_ccfsm_state = 2; - this_insn = next_nonnote_insn (this_insn); - } - else - arm_ccfsm_state = 1; + arm_ccfsm_state = 1; succeed = TRUE; } else @@ -13820,13 +17163,7 @@ arm_final_prescan_insn (rtx insn) if (this_insn && this_insn == label && insns_skipped < max_insns_skipped) { - if (jump_clobbers) - { - arm_ccfsm_state = 2; - this_insn = next_nonnote_insn (this_insn); - } - else - arm_ccfsm_state = 1; + arm_ccfsm_state = 1; succeed = TRUE; } else @@ -13932,25 +17269,11 @@ arm_final_prescan_insn (rtx insn) } arm_target_insn = this_insn; } - if (jump_clobbers) - { - gcc_assert (!reverse); - arm_current_cc = - get_arm_condition_code (XEXP (XEXP (XEXP (SET_SRC (body), - 0), 0), 1)); - if (GET_CODE (XEXP (XEXP (SET_SRC (body), 0), 0)) == AND) - arm_current_cc = ARM_INVERSE_CONDITION_CODE (arm_current_cc); - if (GET_CODE (XEXP (SET_SRC (body), 0)) == NE) - arm_current_cc = ARM_INVERSE_CONDITION_CODE (arm_current_cc); - } - else - { - /* If REVERSE is true, ARM_CURRENT_CC needs to be inverted from - what it was. 
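/* Editorial note: a standalone reminder of how the condition codes selected
   for CC_CZmode and CC_NCVmode above are evaluated from the NZCV flags
   (standard ARM condition semantics, shown here as tiny predicates).  */

#include <stdbool.h>

static bool eq_cond_sketch (bool z)         { return z;       }  /* EQ            */
static bool cs_cond_sketch (bool c)         { return c;       }  /* CS/HS: u>=    */
static bool hi_cond_sketch (bool c, bool z) { return c && !z; }  /* HI:   u>      */
static bool ls_cond_sketch (bool c, bool z) { return !c || z; }  /* LS:   u<=     */
static bool ge_cond_sketch (bool n, bool v) { return n == v;  }  /* GE: signed >= */
static bool lt_cond_sketch (bool n, bool v) { return n != v;  }  /* LT: signed <  */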
*/ - if (!reverse) - arm_current_cc = get_arm_condition_code (XEXP (SET_SRC (body), - 0)); - } + + /* If REVERSE is true, ARM_CURRENT_CC needs to be inverted from + what it was. */ + if (!reverse) + arm_current_cc = get_arm_condition_code (XEXP (SET_SRC (body), 0)); if (reverse || then_not_else) arm_current_cc = ARM_INVERSE_CONDITION_CODE (arm_current_cc); @@ -14017,6 +17340,11 @@ arm_hard_regno_mode_ok (unsigned int regno, enum machine_mode mode) if (mode == DFmode) return VFP_REGNO_OK_FOR_DOUBLE (regno); + /* VFP registers can hold HFmode values, but there is no point in + putting them there unless we have hardware conversion insns. */ + if (mode == HFmode) + return TARGET_FP16 && VFP_REGNO_OK_FOR_SINGLE (regno); + if (TARGET_NEON) return (VALID_NEON_DREG_MODE (mode) && VFP_REGNO_OK_FOR_DOUBLE (regno)) || (VALID_NEON_QREG_MODE (mode) @@ -14039,13 +17367,13 @@ arm_hard_regno_mode_ok (unsigned int regno, enum machine_mode mode) return VALID_IWMMXT_REG_MODE (mode); } - /* We allow any value to be stored in the general registers. + /* We allow almost any value to be stored in the general registers. Restrict doubleword quantities to even register pairs so that we can - use ldrd. Do not allow Neon structure opaque modes in general registers; - they would use too many. */ + use ldrd. Do not allow very large Neon structure opaque modes in + general registers; they would use too many. */ if (regno <= LAST_ARM_REGNUM) return !(TARGET_LDRD && GET_MODE_SIZE (mode) > 4 && (regno & 1) != 0) - && !VALID_NEON_STRUCT_MODE (mode); + && ARM_NUM_REGS (mode) <= 4; if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM) @@ -14062,7 +17390,8 @@ arm_hard_regno_mode_ok (unsigned int regno, enum machine_mode mode) /* For efficiency and historical reasons LO_REGS, HI_REGS and CC_REGS are not used in arm mode. */ -int + +enum reg_class arm_regno_class (int regno) { if (TARGET_THUMB1) @@ -14216,7 +17545,7 @@ static const struct builtin_description bdesc_2arg[] = { #define IWMMXT_BUILTIN(code, string, builtin) \ { FL_IWMMXT, CODE_FOR_##code, "__builtin_arm_" string, \ - ARM_BUILTIN_##builtin, 0, 0 }, + ARM_BUILTIN_##builtin, UNKNOWN, 0 }, IWMMXT_BUILTIN (addv8qi3, "waddb", WADDB) IWMMXT_BUILTIN (addv4hi3, "waddh", WADDH) @@ -14278,7 +17607,7 @@ static const struct builtin_description bdesc_2arg[] = IWMMXT_BUILTIN (iwmmxt_wmaddu, "wmaddu", WMADDU) #define IWMMXT_BUILTIN2(code, builtin) \ - { FL_IWMMXT, CODE_FOR_##code, NULL, ARM_BUILTIN_##builtin, 0, 0 }, + { FL_IWMMXT, CODE_FOR_##code, NULL, ARM_BUILTIN_##builtin, UNKNOWN, 0 }, IWMMXT_BUILTIN2 (iwmmxt_wpackhss, WPACKHSS) IWMMXT_BUILTIN2 (iwmmxt_wpackwss, WPACKWSS) @@ -14675,7 +18004,7 @@ arm_init_tls_builtins (void) TREE_READONLY (decl) = 1; } -typedef enum { +enum neon_builtin_type_bits { T_V8QI = 0x0001, T_V4HI = 0x0002, T_V2SI = 0x0004, @@ -14689,7 +18018,7 @@ typedef enum { T_TI = 0x0400, T_EI = 0x0800, T_OI = 0x1000 -} neon_builtin_type_bits; +}; #define v8qi_UP T_V8QI #define v4hi_UP T_V4HI @@ -14752,7 +18081,7 @@ typedef enum { typedef struct { const char *name; const neon_itype itype; - const neon_builtin_type_bits bits; + const int bits; const enum insn_code codes[T_MAX]; const unsigned int num_vars; unsigned int base_fcode; @@ -15090,6 +18419,24 @@ arm_init_neon_builtins (void) TYPE_PRECISION (neon_float_type_node) = FLOAT_TYPE_SIZE; layout_type (neon_float_type_node); + /* Define typedefs which exactly correspond to the modes we are basing vector + types on. If you change these names you'll need to change + the table used by arm_mangle_type too. 
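/* Editorial note: a standalone sketch of the relaxed core-register test in
   arm_hard_regno_mode_ok above: multi-word values must start on an even
   register so ldrd/strd remain usable, and nothing wider than four words
   (16 bytes) is now allowed in the general registers.  */

#include <stdbool.h>

static bool
core_regno_mode_ok_sketch (int regno, int mode_size_bytes, bool have_ldrd)
{
  int nregs = (mode_size_bytes + 3) / 4;   /* cf. ARM_NUM_REGS */

  if (have_ldrd && mode_size_bytes > 4 && (regno & 1) != 0)
    return false;
  return nregs <= 4;
}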
*/ + (*lang_hooks.types.register_builtin_type) (neon_intQI_type_node, + "__builtin_neon_qi"); + (*lang_hooks.types.register_builtin_type) (neon_intHI_type_node, + "__builtin_neon_hi"); + (*lang_hooks.types.register_builtin_type) (neon_intSI_type_node, + "__builtin_neon_si"); + (*lang_hooks.types.register_builtin_type) (neon_float_type_node, + "__builtin_neon_sf"); + (*lang_hooks.types.register_builtin_type) (neon_intDI_type_node, + "__builtin_neon_di"); + (*lang_hooks.types.register_builtin_type) (neon_polyQI_type_node, + "__builtin_neon_poly8"); + (*lang_hooks.types.register_builtin_type) (neon_polyHI_type_node, + "__builtin_neon_poly16"); + intQI_pointer_node = build_pointer_type (neon_intQI_type_node); intHI_pointer_node = build_pointer_type (neon_intHI_type_node); intSI_pointer_node = build_pointer_type (neon_intSI_type_node); @@ -15142,12 +18489,32 @@ arm_init_neon_builtins (void) intUSI_type_node = make_unsigned_type (GET_MODE_PRECISION (SImode)); intUDI_type_node = make_unsigned_type (GET_MODE_PRECISION (DImode)); + (*lang_hooks.types.register_builtin_type) (intUQI_type_node, + "__builtin_neon_uqi"); + (*lang_hooks.types.register_builtin_type) (intUHI_type_node, + "__builtin_neon_uhi"); + (*lang_hooks.types.register_builtin_type) (intUSI_type_node, + "__builtin_neon_usi"); + (*lang_hooks.types.register_builtin_type) (intUDI_type_node, + "__builtin_neon_udi"); + /* Opaque integer types for structures of vectors. */ intEI_type_node = make_signed_type (GET_MODE_PRECISION (EImode)); intOI_type_node = make_signed_type (GET_MODE_PRECISION (OImode)); intCI_type_node = make_signed_type (GET_MODE_PRECISION (CImode)); intXI_type_node = make_signed_type (GET_MODE_PRECISION (XImode)); + (*lang_hooks.types.register_builtin_type) (intTI_type_node, + "__builtin_neon_ti"); + (*lang_hooks.types.register_builtin_type) (intEI_type_node, + "__builtin_neon_ei"); + (*lang_hooks.types.register_builtin_type) (intOI_type_node, + "__builtin_neon_oi"); + (*lang_hooks.types.register_builtin_type) (intCI_type_node, + "__builtin_neon_ci"); + (*lang_hooks.types.register_builtin_type) (intXI_type_node, + "__builtin_neon_xi"); + /* Pointers to vector types. */ V8QI_pointer_node = build_pointer_type (V8QI_type_node); V4HI_pointer_node = build_pointer_type (V4HI_type_node); @@ -15191,44 +18558,6 @@ arm_init_neon_builtins (void) build_function_type_list (void_type_node, V2DI_pointer_node, V2DI_type_node, V2DI_type_node, NULL); - /* Define typedefs which exactly correspond to the modes we are basing vector - types on. If you change these names you'll need to change - the table used by arm_mangle_type too. 
*/ - (*lang_hooks.types.register_builtin_type) (neon_intQI_type_node, - "__builtin_neon_qi"); - (*lang_hooks.types.register_builtin_type) (neon_intHI_type_node, - "__builtin_neon_hi"); - (*lang_hooks.types.register_builtin_type) (neon_intSI_type_node, - "__builtin_neon_si"); - (*lang_hooks.types.register_builtin_type) (neon_float_type_node, - "__builtin_neon_sf"); - (*lang_hooks.types.register_builtin_type) (neon_intDI_type_node, - "__builtin_neon_di"); - - (*lang_hooks.types.register_builtin_type) (neon_polyQI_type_node, - "__builtin_neon_poly8"); - (*lang_hooks.types.register_builtin_type) (neon_polyHI_type_node, - "__builtin_neon_poly16"); - (*lang_hooks.types.register_builtin_type) (intUQI_type_node, - "__builtin_neon_uqi"); - (*lang_hooks.types.register_builtin_type) (intUHI_type_node, - "__builtin_neon_uhi"); - (*lang_hooks.types.register_builtin_type) (intUSI_type_node, - "__builtin_neon_usi"); - (*lang_hooks.types.register_builtin_type) (intUDI_type_node, - "__builtin_neon_udi"); - - (*lang_hooks.types.register_builtin_type) (intTI_type_node, - "__builtin_neon_ti"); - (*lang_hooks.types.register_builtin_type) (intEI_type_node, - "__builtin_neon_ei"); - (*lang_hooks.types.register_builtin_type) (intOI_type_node, - "__builtin_neon_oi"); - (*lang_hooks.types.register_builtin_type) (intCI_type_node, - "__builtin_neon_ci"); - (*lang_hooks.types.register_builtin_type) (intXI_type_node, - "__builtin_neon_xi"); - dreg_types[0] = V8QI_type_node; dreg_types[1] = V4HI_type_node; dreg_types[2] = V2SI_type_node; @@ -15502,6 +18831,15 @@ arm_init_neon_builtins (void) } static void +arm_init_fp16_builtins (void) +{ + tree fp16_type = make_node (REAL_TYPE); + TYPE_PRECISION (fp16_type) = 16; + layout_type (fp16_type); + (*lang_hooks.types.register_builtin_type) (fp16_type, "__fp16"); +} + +static void arm_init_builtins (void) { arm_init_tls_builtins (); @@ -15511,6 +18849,71 @@ arm_init_builtins (void) if (TARGET_NEON) arm_init_neon_builtins (); + + if (arm_fp16_format) + arm_init_fp16_builtins (); +} + +/* Implement TARGET_INVALID_PARAMETER_TYPE. */ + +static const char * +arm_invalid_parameter_type (const_tree t) +{ + if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16) + return N_("function parameters cannot have __fp16 type"); + return NULL; +} + +/* Implement TARGET_INVALID_PARAMETER_TYPE. */ + +static const char * +arm_invalid_return_type (const_tree t) +{ + if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16) + return N_("functions cannot return __fp16 type"); + return NULL; +} + +/* Implement TARGET_PROMOTED_TYPE. */ + +static tree +arm_promoted_type (const_tree t) +{ + if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16) + return float_type_node; + return NULL_TREE; +} + +/* Implement TARGET_CONVERT_TO_TYPE. + Specifically, this hook implements the peculiarity of the ARM + half-precision floating-point C semantics that requires conversions between + __fp16 to or from double to do an intermediate conversion to float. */ + +static tree +arm_convert_to_type (tree type, tree expr) +{ + tree fromtype = TREE_TYPE (expr); + if (!SCALAR_FLOAT_TYPE_P (fromtype) || !SCALAR_FLOAT_TYPE_P (type)) + return NULL_TREE; + if ((TYPE_PRECISION (fromtype) == 16 && TYPE_PRECISION (type) > 32) + || (TYPE_PRECISION (type) == 16 && TYPE_PRECISION (fromtype) > 32)) + return convert (type, convert (float_type_node, expr)); + return NULL_TREE; +} + +/* Implement TARGET_SCALAR_MODE_SUPPORTED_P. 
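Taken together, the hooks above give __fp16 its storage-only semantics: values promote to float for arithmetic, conversions between __fp16 and double are forced through an intermediate float, and the type is rejected as a parameter or return type. A user-level sketch of those rules, assuming the type has been enabled (for example with something like -mfp16-format=ieee):

/* Sketch of the user-visible __fp16 rules enforced by the hooks above
   (assumes an ARM target with a half-precision format enabled).  */
__fp16 h = 2.5;                            /* storage-only half value   */

float  use_h (void) { return h * 2.0f; }   /* promoted to float         */
double widen (void) { return h; }          /* converted via float first */

/* Rejected by arm_invalid_parameter_type / arm_invalid_return_type:
     __fp16 bad (__fp16 x) { return x; }                                */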
+ This simply adds HFmode as a supported mode; even though we don't + implement arithmetic on this type directly, it's supported by + optabs conversions, much the way the double-word arithmetic is + special-cased in the default hook. */ + +static bool +arm_scalar_mode_supported_p (enum machine_mode mode) +{ + if (mode == HFmode) + return (arm_fp16_format != ARM_FP16_FORMAT_NONE); + else + return default_scalar_mode_supported_p (mode); } /* Errors in the source file can cause expand_expr to return const0_rtx @@ -15605,8 +19008,8 @@ arm_expand_unop_builtin (enum insn_code icode, static int neon_builtin_compare (const void *a, const void *b) { - const neon_builtin_datum *key = a; - const neon_builtin_datum *memb = b; + const neon_builtin_datum *const key = (const neon_builtin_datum *) a; + const neon_builtin_datum *const memb = (const neon_builtin_datum *) b; unsigned int soughtcode = key->base_fcode; if (soughtcode >= memb->base_fcode @@ -15625,7 +19028,8 @@ locate_neon_builtin_icode (int fcode, neon_itype *itype) int idx; key.base_fcode = fcode; - found = bsearch (&key, &neon_builtin_data[0], ARRAY_SIZE (neon_builtin_data), + found = (neon_builtin_datum *) + bsearch (&key, &neon_builtin_data[0], ARRAY_SIZE (neon_builtin_data), sizeof (neon_builtin_data[0]), neon_builtin_compare); gcc_assert (found); idx = fcode - (int) found->base_fcode; @@ -15668,7 +19072,7 @@ arm_expand_neon_args (rtx target, int icode, int have_retval, for (;;) { - builtin_arg thisarg = va_arg (ap, int); + builtin_arg thisarg = (builtin_arg) va_arg (ap, int); if (thisarg == NEON_ARG_STOP) break; @@ -16213,7 +19617,7 @@ thumb_pushpop (FILE *f, unsigned long mask, int push, int *cfa_offset, return; } - if (ARM_EABI_UNWIND_TABLES && push) + if (push && arm_except_unwind_info () == UI_TARGET) { fprintf (f, "\t.save\t{"); for (regno = 0; regno < 15; regno++) @@ -16281,7 +19685,7 @@ thumb_pushpop (FILE *f, unsigned long mask, int push, int *cfa_offset, if (push && pushed_words && dwarf2out_do_frame ()) { - char *l = dwarf2out_cfi_label (); + char *l = dwarf2out_cfi_label (false); int pushed_mask = real_regs; *cfa_offset += pushed_words * 4; @@ -16573,14 +19977,45 @@ thumb_exit (FILE *f, int reg_containing_return_addr) /* Return to caller. */ asm_fprintf (f, "\tbx\t%r\n", reg_containing_return_addr); } - +/* Scan INSN just before assembler is output for it. + For Thumb-1, we track the status of the condition codes; this + information is used in the cbranchsi4_insn pattern. */ void thumb1_final_prescan_insn (rtx insn) { if (flag_print_asm_name) asm_fprintf (asm_out_file, "%@ 0x%04x\n", INSN_ADDRESSES (INSN_UID (insn))); + /* Don't overwrite the previous setter when we get to a cbranch. 
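locate_neon_builtin_icode above finds a builtin by binary-searching a table in which each entry owns the half-open range [base_fcode, base_fcode + num_vars). The same range-lookup pattern in isolation, with hypothetical entry and function names:

#include <stdlib.h>

struct range_entry
{
  unsigned int base;     /* first code owned by this entry      */
  unsigned int count;    /* number of consecutive codes it owns */
};

/* Comparator in the style of neon_builtin_compare above: equal when
   the key's code falls inside the member's range.  */
static int
range_compare (const void *a, const void *b)
{
  const struct range_entry *key = (const struct range_entry *) a;
  const struct range_entry *memb = (const struct range_entry *) b;

  if (key->base >= memb->base && key->base < memb->base + memb->count)
    return 0;
  return key->base < memb->base ? -1 : 1;
}

static const struct range_entry *
find_entry (unsigned int code, const struct range_entry *table, size_t n)
{
  struct range_entry key;

  key.base = code;
  key.count = 0;
  return (const struct range_entry *)
    bsearch (&key, table, n, sizeof (table[0]), range_compare);
}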
*/ + if (INSN_CODE (insn) != CODE_FOR_cbranchsi4_insn) + { + enum attr_conds conds; + + if (cfun->machine->thumb1_cc_insn) + { + if (modified_in_p (cfun->machine->thumb1_cc_op0, insn) + || modified_in_p (cfun->machine->thumb1_cc_op1, insn)) + CC_STATUS_INIT; + } + conds = get_attr_conds (insn); + if (conds == CONDS_SET) + { + rtx set = single_set (insn); + cfun->machine->thumb1_cc_insn = insn; + cfun->machine->thumb1_cc_op0 = SET_DEST (set); + cfun->machine->thumb1_cc_op1 = const0_rtx; + cfun->machine->thumb1_cc_mode = CC_NOOVmode; + if (INSN_CODE (insn) == CODE_FOR_thumb1_subsi3_insn) + { + rtx src1 = XEXP (SET_SRC (set), 1); + if (src1 == const0_rtx) + cfun->machine->thumb1_cc_mode = CCmode; + } + } + else if (conds != CONDS_NOCOND) + cfun->machine->thumb1_cc_insn = NULL_RTX; + } } int @@ -16589,6 +20024,7 @@ thumb_shiftable_const (unsigned HOST_WIDE_INT val) unsigned HOST_WIDE_INT mask = 0xff; int i; + val = val & (unsigned HOST_WIDE_INT)0xffffffffu; if (val == 0) /* XXX */ return 0; @@ -16687,6 +20123,81 @@ is_called_in_ARM_mode (tree func) #endif } +/* Given the stack offsets and register mask in OFFSETS, decide how + many additional registers to push instead of subtracting a constant + from SP. For epilogues the principle is the same except we use pop. + FOR_PROLOGUE indicates which we're generating. */ +static int +thumb1_extra_regs_pushed (arm_stack_offsets *offsets, bool for_prologue) +{ + HOST_WIDE_INT amount; + unsigned long live_regs_mask = offsets->saved_regs_mask; + /* Extract a mask of the ones we can give to the Thumb's push/pop + instruction. */ + unsigned long l_mask = live_regs_mask & (for_prologue ? 0x40ff : 0xff); + /* Then count how many other high registers will need to be pushed. */ + unsigned long high_regs_pushed = bit_count (live_regs_mask & 0x0f00); + int n_free, reg_base; + + if (!for_prologue && frame_pointer_needed) + amount = offsets->locals_base - offsets->saved_regs; + else + amount = offsets->outgoing_args - offsets->saved_regs; + + /* If the stack frame size is 512 exactly, we can save one load + instruction, which should make this a win even when optimizing + for speed. */ + if (!optimize_size && amount != 512) + return 0; + + /* Can't do this if there are high registers to push. */ + if (high_regs_pushed != 0) + return 0; + + /* Shouldn't do it in the prologue if no registers would normally + be pushed at all. In the epilogue, also allow it if we'll have + a pop insn for the PC. */ + if (l_mask == 0 + && (for_prologue + || TARGET_BACKTRACE + || (live_regs_mask & 1 << LR_REGNUM) == 0 + || TARGET_INTERWORK + || crtl->args.pretend_args_size != 0)) + return 0; + + /* Don't do this if thumb_expand_prologue wants to emit instructions + between the push and the stack frame allocation. */ + if (for_prologue + && ((flag_pic && arm_pic_register != INVALID_REGNUM) + || (!frame_pointer_needed && CALLER_INTERWORKING_SLOT_SIZE > 0))) + return 0; + + reg_base = 0; + n_free = 0; + if (!for_prologue) + { + reg_base = arm_size_return_regs () / UNITS_PER_WORD; + live_regs_mask >>= reg_base; + } + + while (reg_base + n_free < 8 && !(live_regs_mask & 1) + && (for_prologue || call_used_regs[reg_base + n_free])) + { + live_regs_mask >>= 1; + n_free++; + } + + if (n_free == 0) + return 0; + gcc_assert (amount / 4 * 4 == amount); + + if (amount >= 512 && (amount - n_free * 4) < 512) + return (amount - 508) / 4; + if (amount <= n_free * 4) + return amount / 4; + return 0; +} + /* The bits which aren't usefully expanded as rtl. 
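thumb1_extra_regs_pushed above exploits the fact that every extra (otherwise unused) low register in a push or pop moves SP by another four bytes, so a small frame allocation can sometimes be folded into the push/pop and the separate SP adjustment dropped. A much simplified sketch of the core size test, ignoring the high-register, interworking and pretend-args restrictions handled above:

/* Simplified sketch only, not the full set of checks made above.  */
static int
extra_regs_to_push (int frame_bytes, int free_low_regs)
{
  if (frame_bytes <= 4 * free_low_regs)
    return frame_bytes / 4;   /* fold the whole adjustment into the push */
  return 0;                   /* keep the explicit sp adjustment         */
}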
*/ const char * thumb_unexpanded_epilogue (void) @@ -16695,10 +20206,11 @@ thumb_unexpanded_epilogue (void) int regno; unsigned long live_regs_mask = 0; int high_regs_pushed = 0; + int extra_pop; int had_to_push_lr; int size; - if (return_used_this_function) + if (cfun->machine->return_used_this_function != 0) return ""; if (IS_NAKED (arm_current_func_type ())) @@ -16714,6 +20226,13 @@ thumb_unexpanded_epilogue (void) the register is used to hold a return value. */ size = arm_size_return_regs (); + extra_pop = thumb1_extra_regs_pushed (offsets, false); + if (extra_pop > 0) + { + unsigned long extra_mask = (1 << extra_pop) - 1; + live_regs_mask |= extra_mask << (size / UNITS_PER_WORD); + } + /* The prolog may have pushed some high registers to use as work registers. e.g. the testsuite file: gcc/testsuite/gcc/gcc.c-torture/execute/complex-2.c @@ -16797,7 +20316,9 @@ thumb_unexpanded_epilogue (void) live_regs_mask); /* We have either just popped the return address into the - PC or it is was kept in LR for the entire function. */ + PC or it is was kept in LR for the entire function. + Note that thumb_pushpop has already called thumb_exit if the + PC was in the list. */ if (!had_to_push_lr) thumb_exit (asm_out_file, LR_REGNUM); } @@ -16853,7 +20374,7 @@ static struct machine_function * arm_init_machine_status (void) { struct machine_function *machine; - machine = (machine_function *) ggc_alloc_cleared (sizeof (machine_function)); + machine = ggc_alloc_cleared_machine_function (); #if ARM_FT_UNKNOWN != 0 machine->func_type = ARM_FT_UNKNOWN; @@ -16979,6 +20500,7 @@ thumb1_expand_prologue (void) stack_pointer_rtx); amount = offsets->outgoing_args - offsets->saved_regs; + amount -= 4 * thumb1_extra_regs_pushed (offsets, true); if (amount) { if (amount < 512) @@ -17006,62 +20528,23 @@ thumb1_expand_prologue (void) been pushed at the start of the prologue and so we can corrupt it now. */ for (regno = LAST_ARG_REGNUM + 1; regno <= LAST_LO_REGNUM; regno++) - if (live_regs_mask & (1 << regno) - && !(frame_pointer_needed - && (regno == THUMB_HARD_FRAME_POINTER_REGNUM))) + if (live_regs_mask & (1 << regno)) break; - if (regno > LAST_LO_REGNUM) /* Very unlikely. */ - { - rtx spare = gen_rtx_REG (SImode, IP_REGNUM); - - /* Choose an arbitrary, non-argument low register. */ - reg = gen_rtx_REG (SImode, LAST_LO_REGNUM); - - /* Save it by copying it into a high, scratch register. */ - emit_insn (gen_movsi (spare, reg)); - /* Add a USE to stop propagate_one_insn() from barfing. */ - emit_insn (gen_prologue_use (spare)); + gcc_assert(regno <= LAST_LO_REGNUM); - /* Decrement the stack. */ - emit_insn (gen_movsi (reg, GEN_INT (- amount))); - insn = emit_insn (gen_addsi3 (stack_pointer_rtx, - stack_pointer_rtx, reg)); - RTX_FRAME_RELATED_P (insn) = 1; - dwarf = gen_rtx_SET (VOIDmode, stack_pointer_rtx, - plus_constant (stack_pointer_rtx, - -amount)); - RTX_FRAME_RELATED_P (dwarf) = 1; - REG_NOTES (insn) - = gen_rtx_EXPR_LIST (REG_FRAME_RELATED_EXPR, dwarf, - REG_NOTES (insn)); - - /* Restore the low register's original value. */ - emit_insn (gen_movsi (reg, spare)); - - /* Emit a USE of the restored scratch register, so that flow - analysis will not consider the restore redundant. The - register won't be used again in this function and isn't - restored by the epilogue. 
*/ - emit_insn (gen_prologue_use (reg)); - } - else - { - reg = gen_rtx_REG (SImode, regno); + reg = gen_rtx_REG (SImode, regno); - emit_insn (gen_movsi (reg, GEN_INT (- amount))); + emit_insn (gen_movsi (reg, GEN_INT (- amount))); - insn = emit_insn (gen_addsi3 (stack_pointer_rtx, - stack_pointer_rtx, reg)); - RTX_FRAME_RELATED_P (insn) = 1; - dwarf = gen_rtx_SET (VOIDmode, stack_pointer_rtx, - plus_constant (stack_pointer_rtx, - -amount)); - RTX_FRAME_RELATED_P (dwarf) = 1; - REG_NOTES (insn) - = gen_rtx_EXPR_LIST (REG_FRAME_RELATED_EXPR, dwarf, - REG_NOTES (insn)); - } + insn = emit_insn (gen_addsi3 (stack_pointer_rtx, + stack_pointer_rtx, reg)); + RTX_FRAME_RELATED_P (insn) = 1; + dwarf = gen_rtx_SET (VOIDmode, stack_pointer_rtx, + plus_constant (stack_pointer_rtx, + -amount)); + RTX_FRAME_RELATED_P (dwarf) = 1; + add_reg_note (insn, REG_FRAME_RELATED_EXPR, dwarf); } } @@ -17074,7 +20557,8 @@ thumb1_expand_prologue (void) using the EABI unwinder, to prevent faulting instructions from being swapped with a stack adjustment. */ if (crtl->profile || !TARGET_SCHED_PROLOG - || (ARM_EABI_UNWIND_TABLES && flag_non_call_exceptions)) + || (arm_except_unwind_info () == UI_TARGET + && cfun->can_throw_non_call_exceptions)) emit_insn (gen_blockage ()); cfun->machine->lr_save_eliminated = !thumb_force_lr_save (); @@ -17102,6 +20586,7 @@ thumb1_expand_epilogue (void) emit_insn (gen_movsi (stack_pointer_rtx, hard_frame_pointer_rtx)); amount = offsets->locals_base - offsets->saved_regs; } + amount -= 4 * thumb1_extra_regs_pushed (offsets, false); gcc_assert (amount >= 0); if (amount) @@ -17186,7 +20671,7 @@ thumb1_output_function_prologue (FILE *f, HOST_WIDE_INT size ATTRIBUTE_UNUSED) if (crtl->args.pretend_args_size) { /* Output unwind directive for the stack adjustment. */ - if (ARM_EABI_UNWIND_TABLES) + if (arm_except_unwind_info () == UI_TARGET) fprintf (f, "\t.pad #%d\n", crtl->args.pretend_args_size); @@ -17216,7 +20701,7 @@ thumb1_output_function_prologue (FILE *f, HOST_WIDE_INT size ATTRIBUTE_UNUSED) the stack pointer. */ if (dwarf2out_do_frame ()) { - char *l = dwarf2out_cfi_label (); + char *l = dwarf2out_cfi_label (false); cfa_offset = cfa_offset + crtl->args.pretend_args_size; dwarf2out_def_cfa (l, SP_REGNUM, cfa_offset); @@ -17256,7 +20741,7 @@ thumb1_output_function_prologue (FILE *f, HOST_WIDE_INT size ATTRIBUTE_UNUSED) work_register = thumb_find_work_register (live_regs_mask); - if (ARM_EABI_UNWIND_TABLES) + if (arm_except_unwind_info () == UI_TARGET) asm_fprintf (f, "\t.pad #16\n"); asm_fprintf @@ -17265,7 +20750,7 @@ thumb1_output_function_prologue (FILE *f, HOST_WIDE_INT size ATTRIBUTE_UNUSED) if (dwarf2out_do_frame ()) { - char *l = dwarf2out_cfi_label (); + char *l = dwarf2out_cfi_label (false); cfa_offset = cfa_offset + 16; dwarf2out_def_cfa (l, SP_REGNUM, cfa_offset); @@ -17322,7 +20807,11 @@ thumb1_output_function_prologue (FILE *f, HOST_WIDE_INT size ATTRIBUTE_UNUSED) register. 
*/ else if ((l_mask & 0xff) != 0 || (high_regs_pushed == 0 && l_mask)) - thumb_pushpop (f, l_mask, 1, &cfa_offset, l_mask); + { + unsigned long mask = l_mask; + mask |= (1 << thumb1_extra_regs_pushed (offsets, true)) - 1; + thumb_pushpop (f, mask, 1, &cfa_offset, mask); + } if (high_regs_pushed) { @@ -17696,13 +21185,10 @@ arm_file_start (void) if (TARGET_BPABI) { const char *fpu_name; - if (arm_select[0].string) - asm_fprintf (asm_out_file, "\t.cpu %s\n", arm_select[0].string); - else if (arm_select[1].string) - asm_fprintf (asm_out_file, "\t.arch %s\n", arm_select[1].string); + if (arm_selected_arch) + asm_fprintf (asm_out_file, "\t.arch %s\n", arm_selected_arch->name); else - asm_fprintf (asm_out_file, "\t.cpu %s\n", - all_cores[arm_default_cpu].name); + asm_fprintf (asm_out_file, "\t.cpu %s\n", arm_selected_cpu->name); if (TARGET_SOFT_FLOAT) { @@ -17713,37 +21199,8 @@ arm_file_start (void) } else { - int set_float_abi_attributes = 0; - switch (arm_fpu_arch) - { - case FPUTYPE_FPA: - fpu_name = "fpa"; - break; - case FPUTYPE_FPA_EMU2: - fpu_name = "fpe2"; - break; - case FPUTYPE_FPA_EMU3: - fpu_name = "fpe3"; - break; - case FPUTYPE_MAVERICK: - fpu_name = "maverick"; - break; - case FPUTYPE_VFP: - fpu_name = "vfp"; - set_float_abi_attributes = 1; - break; - case FPUTYPE_VFP3: - fpu_name = "vfp3"; - set_float_abi_attributes = 1; - break; - case FPUTYPE_NEON: - fpu_name = "neon"; - set_float_abi_attributes = 1; - break; - default: - abort(); - } - if (set_float_abi_attributes) + fpu_name = arm_fpu_desc->name; + if (arm_fpu_desc->model == ARM_FP_MODEL_VFP) { if (TARGET_HARD_FLOAT) asm_fprintf (asm_out_file, "\t.eabi_attribute 27, 3\n"); @@ -17793,6 +21250,11 @@ arm_file_start (void) val = 6; asm_fprintf (asm_out_file, "\t.eabi_attribute 30, %d\n", val); + /* Tag_ABI_FP_16bit_format. */ + if (arm_fp16_format) + asm_fprintf (asm_out_file, "\t.eabi_attribute 38, %d\n", + (int)arm_fp16_format); + if (arm_lang_output_object_attributes_hook) arm_lang_output_object_attributes_hook(); } @@ -18022,6 +21484,23 @@ arm_emit_vector_const (FILE *file, rtx x) return 1; } +/* Emit a fp16 constant appropriately padded to occupy a 4-byte word. + HFmode constant pool entries are actually loaded with ldr. */ +void +arm_emit_fp16_const (rtx c) +{ + REAL_VALUE_TYPE r; + long bits; + + REAL_VALUE_FROM_CONST_DOUBLE (r, c); + bits = real_to_target (NULL, &r, HFmode); + if (WORDS_BIG_ENDIAN) + assemble_zeros (2); + assemble_integer (GEN_INT (bits), 2, BITS_PER_WORD, 1); + if (!WORDS_BIG_ENDIAN) + assemble_zeros (2); +} + const char * arm_output_load_gr (rtx *operands) { @@ -18059,19 +21538,24 @@ arm_output_load_gr (rtx *operands) that way. 
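arm_emit_fp16_const above widens a 16-bit constant to a full constant-pool word so it can be fetched with an ordinary ldr, padding on the side dictated by the word endianness. The same packing expressed as plain C, with a hypothetical helper name (byte order within the halfword is shown little-endian for simplicity):

#include <string.h>

/* Sketch: place a half-precision bit pattern in a 4-byte pool word,
   zero-padding the other half as arm_emit_fp16_const does above.  */
static void
pack_fp16_word (unsigned char out[4], unsigned int bits, int words_big_endian)
{
  int pos = words_big_endian ? 2 : 0;   /* zeros come first on big-endian */

  memset (out, 0, 4);
  out[pos] = bits & 0xff;
  out[pos + 1] = (bits >> 8) & 0xff;
}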
*/ static void -arm_setup_incoming_varargs (CUMULATIVE_ARGS *cum, +arm_setup_incoming_varargs (CUMULATIVE_ARGS *pcum, enum machine_mode mode, tree type, int *pretend_size, int second_time ATTRIBUTE_UNUSED) { - int nregs = cum->nregs; - if (nregs & 1 - && ARM_DOUBLEWORD_ALIGN - && arm_needs_doubleword_align (mode, type)) - nregs++; - + int nregs; + cfun->machine->uses_anonymous_args = 1; + if (pcum->pcs_variant <= ARM_PCS_AAPCS_LOCAL) + { + nregs = pcum->aapcs_ncrn; + if ((nregs & 1) && arm_needs_doubleword_align (mode, type)) + nregs++; + } + else + nregs = pcum->nregs; + if (nregs < NUM_ARG_REGS) *pretend_size = (NUM_ARG_REGS - nregs) * UNITS_PER_WORD; } @@ -18099,6 +21583,38 @@ arm_no_early_store_addr_dep (rtx producer, rtx consumer) return !reg_overlap_mentioned_p (value, addr); } +/* Return nonzero if the CONSUMER instruction (a store) does need + PRODUCER's value to calculate the address. */ + +int +arm_early_store_addr_dep (rtx producer, rtx consumer) +{ + return !arm_no_early_store_addr_dep (producer, consumer); +} + +/* Return nonzero if the CONSUMER instruction (a load) does need + PRODUCER's value to calculate the address. */ + +int +arm_early_load_addr_dep (rtx producer, rtx consumer) +{ + rtx value = PATTERN (producer); + rtx addr = PATTERN (consumer); + + if (GET_CODE (value) == COND_EXEC) + value = COND_EXEC_CODE (value); + if (GET_CODE (value) == PARALLEL) + value = XVECEXP (value, 0, 0); + value = XEXP (value, 0); + if (GET_CODE (addr) == COND_EXEC) + addr = COND_EXEC_CODE (addr); + if (GET_CODE (addr) == PARALLEL) + addr = XVECEXP (addr, 0, 0); + addr = XEXP (addr, 1); + + return reg_overlap_mentioned_p (value, addr); +} + /* Return nonzero if the CONSUMER instruction (an ALU op) does not have an early register shift value or amount dependency on the result of PRODUCER. */ @@ -18185,8 +21701,15 @@ arm_no_early_mul_dep (rtx producer, rtx consumer) op = XVECEXP (op, 0, 0); op = XEXP (op, 1); - return (GET_CODE (op) == PLUS - && !reg_overlap_mentioned_p (value, XEXP (op, 0))); + if (GET_CODE (op) == PLUS || GET_CODE (op) == MINUS) + { + if (GET_CODE (XEXP (op, 0)) == MULT) + return !reg_overlap_mentioned_p (value, XEXP (op, 0)); + else + return !reg_overlap_mentioned_p (value, XEXP (op, 1)); + } + + return 0; } /* We can't rely on the caller doing the proper promotion when @@ -18198,6 +21721,19 @@ arm_promote_prototypes (const_tree t ATTRIBUTE_UNUSED) return !TARGET_AAPCS_BASED; } +static enum machine_mode +arm_promote_function_mode (const_tree type ATTRIBUTE_UNUSED, + enum machine_mode mode, + int *punsignedp ATTRIBUTE_UNUSED, + const_tree fntype ATTRIBUTE_UNUSED, + int for_return ATTRIBUTE_UNUSED) +{ + if (GET_MODE_CLASS (mode) == MODE_INT + && GET_MODE_SIZE (mode) < 4) + return SImode; + + return mode; +} /* AAPCS based ABIs use short enums by default. 
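arm_setup_incoming_varargs above works out how many of the argument registers still have to be dumped onto the stack so that named and anonymous arguments form one contiguous block for va_arg to walk. An ordinary variadic function that relies on exactly that spill (plain C, nothing target-specific):

#include <stdarg.h>

/* The register spill arranged above is what lets va_arg step from the
   register-passed arguments straight into the stack-passed ones.  */
int
sum_ints (int count, ...)
{
  va_list ap;
  int i, total = 0;

  va_start (ap, count);
  for (i = 0; i < count; i++)
    total += va_arg (ap, int);
  va_end (ap);
  return total;
}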
*/ @@ -18313,7 +21849,8 @@ arm_cxx_key_method_may_be_inline (void) static void arm_cxx_determine_class_data_visibility (tree decl) { - if (!TARGET_AAPCS_BASED) + if (!TARGET_AAPCS_BASED + || !TARGET_DLLIMPORT_DECL_ATTRIBUTES) return; /* In general, \S 3.2.5.5 of the ARM EABI requires that class data @@ -18447,14 +21984,80 @@ arm_vector_mode_supported_p (enum machine_mode mode) || mode == V16QImode || mode == V4SFmode || mode == V2DImode)) return true; - if ((mode == V2SImode) - || (mode == V4HImode) - || (mode == V8QImode)) + if ((TARGET_NEON || TARGET_IWMMXT) + && ((mode == V2SImode) + || (mode == V4HImode) + || (mode == V8QImode))) + return true; + + return false; +} + +/* Use the option -mvectorize-with-neon-quad to override the use of doubleword + registers when autovectorizing for Neon, at least until multiple vector + widths are supported properly by the middle-end. */ + +static enum machine_mode +arm_preferred_simd_mode (enum machine_mode mode) +{ + if (TARGET_NEON) + switch (mode) + { + case SFmode: + return TARGET_NEON_VECTORIZE_QUAD ? V4SFmode : V2SFmode; + case SImode: + return TARGET_NEON_VECTORIZE_QUAD ? V4SImode : V2SImode; + case HImode: + return TARGET_NEON_VECTORIZE_QUAD ? V8HImode : V4HImode; + case QImode: + return TARGET_NEON_VECTORIZE_QUAD ? V16QImode : V8QImode; + case DImode: + if (TARGET_NEON_VECTORIZE_QUAD) + return V2DImode; + break; + + default:; + } + + if (TARGET_REALLY_IWMMXT) + switch (mode) + { + case SImode: + return V2SImode; + case HImode: + return V4HImode; + case QImode: + return V8QImode; + + default:; + } + + return word_mode; +} + +/* Implement TARGET_CLASS_LIKELY_SPILLED_P. + + We need to define this for LO_REGS on thumb. Otherwise we can end up + using r0-r4 for function arguments, r7 for the stack frame and don't + have enough left over to do doubleword arithmetic. */ + +static bool +arm_class_likely_spilled_p (reg_class_t rclass) +{ + if ((TARGET_THUMB && rclass == LO_REGS) + || rclass == CC_REG) return true; return false; } +/* Implements target hook small_register_classes_for_mode_p. */ +bool +arm_small_register_classes_for_mode_p (enum machine_mode mode ATTRIBUTE_UNUSED) +{ + return TARGET_THUMB1; +} + /* Implement TARGET_SHIFT_TRUNCATION_MASK. SImode shifts use normal ARM insns and therefore guarantee that the shift count is modulo 256. DImode shifts (those implemented by lib1funcs.asm or by optabs.c) @@ -18480,9 +22083,14 @@ arm_dbx_register_number (unsigned int regno) if (IS_FPA_REGNUM (regno)) return (TARGET_AAPCS_BASED ? 96 : 16) + regno - FIRST_FPA_REGNUM; - /* FIXME: VFPv3 register numbering. */ if (IS_VFP_REGNUM (regno)) - return 64 + regno - FIRST_VFP_REGNUM; + { + /* See comment in arm_dwarf_register_span. */ + if (VFP_REGNO_OK_FOR_SINGLE (regno)) + return 64 + regno - FIRST_VFP_REGNUM; + else + return 256 + (regno - FIRST_VFP_REGNUM) / 2; + } if (IS_IWMMXT_GR_REGNUM (regno)) return 104 + regno - FIRST_IWMMXT_GR_REGNUM; @@ -18493,8 +22101,41 @@ arm_dbx_register_number (unsigned int regno) gcc_unreachable (); } +/* Dwarf models VFPv3 registers as 32 64-bit registers. + GCC models tham as 64 32-bit registers, so we need to describe this to + the DWARF generation code. Other registers can use the default. 
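arm_preferred_simd_mode above selects doubleword Neon vectors by default and quadword ones under -mvectorize-with-neon-quad (the option named in its comment). A loop of the kind that choice governs; the command line is an assumption, but something like -O2 -ftree-vectorize -mfpu=neon would exercise it:

/* Sketch: with the hook above, this loop would be vectorized with
   V2SF (64-bit) vectors by default, or V4SF (128-bit) ones when the
   quadword option is given.  */
void
scale (float *restrict out, const float *restrict in, float k, int n)
{
  int i;

  for (i = 0; i < n; i++)
    out[i] = in[i] * k;
}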
*/ +static rtx +arm_dwarf_register_span (rtx rtl) +{ + unsigned regno; + int nregs; + int i; + rtx p; + + regno = REGNO (rtl); + if (!IS_VFP_REGNUM (regno)) + return NULL_RTX; + + /* XXX FIXME: The EABI defines two VFP register ranges: + 64-95: Legacy VFPv2 numbering for S0-S31 (obsolescent) + 256-287: D0-D31 + The recommended encoding for S0-S31 is a DW_OP_bit_piece of the + corresponding D register. Until GDB supports this, we shall use the + legacy encodings. We also use these encodings for D0-D15 for + compatibility with older debuggers. */ + if (VFP_REGNO_OK_FOR_SINGLE (regno)) + return NULL_RTX; + + nregs = GET_MODE_SIZE (GET_MODE (rtl)) / 8; + p = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (nregs)); + regno = (regno - FIRST_VFP_REGNUM) / 2; + for (i = 0; i < nregs; i++) + XVECEXP (p, 0, i) = gen_rtx_REG (DImode, 256 + regno + i); + + return p; +} -#ifdef TARGET_UNWIND_INFO +#if ARM_UNWIND_INFO /* Emit unwind directives for a store-multiple instruction or stack pointer push during alignment. These should only ever be generated by the function prologue code, so @@ -18659,7 +22300,7 @@ arm_unwind_emit_set (FILE * asm_out_file, rtx p) offset = INTVAL (XEXP (e1, 1)); asm_fprintf (asm_out_file, "\t.setfp %r, %r, #%wd\n", HARD_FRAME_POINTER_REGNUM, reg, - INTVAL (XEXP (e1, 1))); + offset); } else if (GET_CODE (e1) == REG) { @@ -18708,7 +22349,7 @@ arm_unwind_emit (FILE * asm_out_file, rtx insn) { rtx pat; - if (!ARM_EABI_UNWIND_TABLES) + if (arm_except_unwind_info () != UI_TARGET) return; if (!(flag_unwind_tables || crtl->uses_eh_lsda) @@ -18758,7 +22399,52 @@ arm_output_ttype (rtx x) return TRUE; } -#endif /* TARGET_UNWIND_INFO */ + +/* Implement TARGET_ASM_EMIT_EXCEPT_PERSONALITY. */ + +static void +arm_asm_emit_except_personality (rtx personality) +{ + fputs ("\t.personality\t", asm_out_file); + output_addr_const (asm_out_file, personality); + fputc ('\n', asm_out_file); +} + +/* Implement TARGET_ASM_INITIALIZE_SECTIONS. */ + +static void +arm_asm_init_sections (void) +{ + exception_section = get_unnamed_section (0, output_section_asm_op, + "\t.handlerdata"); +} +#endif /* ARM_UNWIND_INFO */ + +/* Implement TARGET_EXCEPT_UNWIND_INFO. */ + +static enum unwind_info_type +arm_except_unwind_info (void) +{ + /* Honor the --enable-sjlj-exceptions configure switch. */ +#ifdef CONFIG_SJLJ_EXCEPTIONS + if (CONFIG_SJLJ_EXCEPTIONS) + return UI_SJLJ; +#endif + + /* If not using ARM EABI unwind tables... */ + if (ARM_UNWIND_INFO) + { + /* For simplicity elsewhere in this file, indicate that all unwind + info is disabled if we're not emitting unwind tables. */ + if (!flag_exceptions && !flag_unwind_tables) + return UI_NONE; + else + return UI_TARGET; + } + + /* ... we use sjlj exceptions for backwards compatibility. */ + return UI_SJLJ; +} /* Handle UNSPEC DWARF call frame instructions. These are needed for dynamic @@ -18790,7 +22476,7 @@ arm_dwarf_handle_frame_unspec (const char *label, rtx pattern, int index) void arm_output_fn_unwind (FILE * f, bool prologue) { - if (!ARM_EABI_UNWIND_TABLES) + if (arm_except_unwind_info () != UI_TARGET) return; if (prologue) @@ -18816,7 +22502,7 @@ arm_emit_tls_decoration (FILE *fp, rtx x) rtx val; val = XVECEXP (x, 0, 0); - reloc = INTVAL (XVECEXP (x, 0, 1)); + reloc = (enum tls_reloc) INTVAL (XVECEXP (x, 0, 1)); output_addr_const (fp, val); @@ -18870,7 +22556,9 @@ arm_output_dwarf_dtprel (FILE *file, int size, rtx x) fputs ("(tlsldo)", file); } -bool +/* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. 
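Between them, arm_dbx_register_number and arm_dwarf_register_span above implement the numbering described in their comments: the first 32 single registers keep the legacy DWARF numbers 64-95 (also covering D0-D15), while D16-D31 use the 256-based range. That mapping, condensed into a stand-alone sketch over GCC's 64-register model:

/* Sketch of the DWARF numbering implied by the comments above; the
   argument indexes GCC's model of VFPv3 as 64 single registers.  */
static int
vfp_dwarf_number (int s_index)       /* 0..63 */
{
  return s_index < 32 ? 64 + s_index        /* legacy S0-S31 encoding */
                      : 256 + s_index / 2;  /* D16-D31                */
}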
*/ + +static bool arm_output_addr_const_extra (FILE *fp, rtx x) { if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS) @@ -18885,6 +22573,26 @@ arm_output_addr_const_extra (FILE *fp, rtx x) return TRUE; } + else if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_GOTSYM_OFF) + { + assemble_name (fp, "_GLOBAL_OFFSET_TABLE_"); + if (GOT_PCREL) + fputs ("+.", fp); + fputs ("-(", fp); + output_addr_const (fp, XVECEXP (x, 0, 0)); + fputc (')', fp); + return TRUE; + } + else if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SYMBOL_OFFSET) + { + output_addr_const (fp, XVECEXP (x, 0, 0)); + if (GOT_PCREL) + fputs ("+.", fp); + fputs ("-(", fp); + output_addr_const (fp, XVECEXP (x, 0, 1)); + fputc (')', fp); + return TRUE; + } else if (GET_CODE (x) == CONST_VECTOR) return arm_emit_vector_const (fp, x); @@ -18924,6 +22632,29 @@ arm_output_shift(rtx * operands, int set_flags) return ""; } +/* Output a Thumb-1 casesi dispatch sequence. */ +const char * +thumb1_output_casesi (rtx *operands) +{ + rtx diff_vec = PATTERN (next_real_insn (operands[0])); + + gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC); + + switch (GET_MODE(diff_vec)) + { + case QImode: + return (ADDR_DIFF_VEC_FLAGS (diff_vec).offset_unsigned ? + "bl\t%___gnu_thumb1_case_uqi" : "bl\t%___gnu_thumb1_case_sqi"); + case HImode: + return (ADDR_DIFF_VEC_FLAGS (diff_vec).offset_unsigned ? + "bl\t%___gnu_thumb1_case_uhi" : "bl\t%___gnu_thumb1_case_shi"); + case SImode: + return "bl\t%___gnu_thumb1_case_si"; + default: + gcc_unreachable (); + } +} + /* Output a Thumb-2 casesi instruction. */ const char * thumb2_output_casesi (rtx *operands) @@ -18966,7 +22697,10 @@ arm_issue_rate (void) switch (arm_tune) { case cortexr4: + case cortexr4f: + case cortexa5: case cortexa8: + case cortexa9: return 2; default: @@ -19016,6 +22750,25 @@ arm_mangle_type (const_tree type) { arm_mangle_map_entry *pos = arm_mangle_map; + /* The ARM ABI documents (10th October 2008) say that "__va_list" + has to be managled as if it is in the "std" namespace. */ + if (TARGET_AAPCS_BASED + && lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type)) + { + static bool warned; + if (!warned && warn_psabi && !in_system_header) + { + warned = true; + inform (input_location, + "the mangling of % has changed in GCC 4.4"); + } + return "St9__va_list"; + } + + /* Half-precision float. */ + if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16) + return "Dh"; + if (TREE_CODE (type) != VECTOR_TYPE) return NULL; @@ -19039,4 +22792,464 @@ arm_mangle_type (const_tree type) return NULL; } +/* Order of allocation of core registers for Thumb: this allocation is + written over the corresponding initial entries of the array + initialized with REG_ALLOC_ORDER. We allocate all low registers + first. Saving and restoring a low register is usually cheaper than + using a call-clobbered high register. */ + +static const int thumb_core_reg_alloc_order[] = +{ + 3, 2, 1, 0, 4, 5, 6, 7, + 14, 12, 8, 9, 10, 11, 13, 15 +}; + +/* Adjust register allocation order when compiling for Thumb. */ + +void +arm_order_regs_for_local_alloc (void) +{ + const int arm_reg_alloc_order[] = REG_ALLOC_ORDER; + memcpy(reg_alloc_order, arm_reg_alloc_order, sizeof (reg_alloc_order)); + if (TARGET_THUMB) + memcpy (reg_alloc_order, thumb_core_reg_alloc_order, + sizeof (thumb_core_reg_alloc_order)); +} + +/* Set default optimization options. */ +static void +arm_option_optimization (int level, int size ATTRIBUTE_UNUSED) +{ + /* Enable section anchors by default at -O1 or higher. 
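arm_mangle_type above makes two user-visible mangling guarantees: the AAPCS va_list is mangled as if it lived in namespace std (St9__va_list), and __fp16 is mangled with the half-precision code Dh. Declarations that exercise both when compiled as C++ on an AAPCS target; the mangled names in the comments are the expected results implied by the strings returned above, and assume the fp16 format is enabled:

/* Sketch: compile as C++ on an AAPCS target.  */
void takes_half (__fp16 x);              /* _Z10takes_halfDh             */
void takes_valist (__builtin_va_list v); /* _Z12takes_valistSt9__va_list */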
+ Use 2 to distinguish from an explicit -fsection-anchors + given on the command line. */ + if (level > 0) + flag_section_anchors = 2; +} + +/* Implement TARGET_FRAME_POINTER_REQUIRED. */ + +bool +arm_frame_pointer_required (void) +{ + return (cfun->has_nonlocal_label + || SUBTARGET_FRAME_POINTER_REQUIRED + || (TARGET_ARM && TARGET_APCS_FRAME && ! leaf_function_p ())); +} + +/* Only thumb1 can't support conditional execution, so return true if + the target is not thumb1. */ +static bool +arm_have_conditional_execution (void) +{ + return !TARGET_THUMB1; +} + +/* Legitimize a memory reference for sync primitive implemented using + ldrex / strex. We currently force the form of the reference to be + indirect without offset. We do not yet support the indirect offset + addressing supported by some ARM targets for these + instructions. */ +static rtx +arm_legitimize_sync_memory (rtx memory) +{ + rtx addr = force_reg (Pmode, XEXP (memory, 0)); + rtx legitimate_memory = gen_rtx_MEM (GET_MODE (memory), addr); + + set_mem_alias_set (legitimate_memory, ALIAS_SET_MEMORY_BARRIER); + MEM_VOLATILE_P (legitimate_memory) = MEM_VOLATILE_P (memory); + return legitimate_memory; +} + +/* An instruction emitter. */ +typedef void (* emit_f) (int label, const char *, rtx *); + +/* An instruction emitter that emits via the conventional + output_asm_insn. */ +static void +arm_emit (int label ATTRIBUTE_UNUSED, const char *pattern, rtx *operands) +{ + output_asm_insn (pattern, operands); +} + +/* Count the number of emitted synchronization instructions. */ +static unsigned arm_insn_count; + +/* An emitter that counts emitted instructions but does not actually + emit instruction into the the instruction stream. */ +static void +arm_count (int label, + const char *pattern ATTRIBUTE_UNUSED, + rtx *operands ATTRIBUTE_UNUSED) +{ + if (! label) + ++ arm_insn_count; +} + +/* Construct a pattern using conventional output formatting and feed + it to output_asm_insn. Provides a mechanism to construct the + output pattern on the fly. Note the hard limit on the pattern + buffer size. */ +static void +arm_output_asm_insn (emit_f emit, int label, rtx *operands, + const char *pattern, ...) +{ + va_list ap; + char buffer[256]; + + va_start (ap, pattern); + vsprintf (buffer, pattern, ap); + va_end (ap); + emit (label, buffer, operands); +} + +/* Emit the memory barrier instruction, if any, provided by this + target to a specified emitter. */ +static void +arm_process_output_memory_barrier (emit_f emit, rtx *operands) +{ + if (TARGET_HAVE_DMB) + { + /* Note we issue a system level barrier. We should consider + issuing a inner shareabilty zone barrier here instead, ie. + "DMB ISH". */ + emit (0, "dmb\tsy", operands); + return; + } + + if (TARGET_HAVE_DMB_MCR) + { + emit (0, "mcr\tp15, 0, r0, c7, c10, 5", operands); + return; + } + + gcc_unreachable (); +} + +/* Emit the memory barrier instruction, if any, provided by this + target. */ +const char * +arm_output_memory_barrier (rtx *operands) +{ + arm_process_output_memory_barrier (arm_emit, operands); + return ""; +} + +/* Helper to figure out the instruction suffix required on ldrex/strex + for operations on an object of the specified mode. */ +static const char * +arm_ldrex_suffix (enum machine_mode mode) +{ + switch (mode) + { + case QImode: return "b"; + case HImode: return "h"; + case SImode: return ""; + case DImode: return "d"; + default: + gcc_unreachable (); + } + return ""; +} + +/* Emit an ldrex{b,h,d, } instruction appropriate for the specified + mode. 
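arm_process_output_memory_barrier above emits either the ARMv7 dmb instruction or the older CP15 data-memory-barrier write, depending on what the target provides. The same two sequences as a user-level inline-asm helper; the architecture-macro test and the function name are assumptions for the sketch, not taken from this file:

/* Sketch of a full barrier matching the sequences emitted above.  */
static inline void
full_memory_barrier (void)
{
#if defined (__ARM_ARCH_7A__)
  __asm__ __volatile__ ("dmb\tsy" : : : "memory");
#else
  __asm__ __volatile__ ("mcr\tp15, 0, %0, c7, c10, 5" : : "r" (0) : "memory");
#endif
}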
*/ +static void +arm_output_ldrex (emit_f emit, + enum machine_mode mode, + rtx target, + rtx memory) +{ + const char *suffix = arm_ldrex_suffix (mode); + rtx operands[2]; + + operands[0] = target; + operands[1] = memory; + arm_output_asm_insn (emit, 0, operands, "ldrex%s\t%%0, %%C1", suffix); +} + +/* Emit a strex{b,h,d, } instruction appropriate for the specified + mode. */ +static void +arm_output_strex (emit_f emit, + enum machine_mode mode, + const char *cc, + rtx result, + rtx value, + rtx memory) +{ + const char *suffix = arm_ldrex_suffix (mode); + rtx operands[3]; + + operands[0] = result; + operands[1] = value; + operands[2] = memory; + arm_output_asm_insn (emit, 0, operands, "strex%s%s\t%%0, %%1, %%C2", suffix, + cc); +} + +/* Helper to emit a two operand instruction. */ +static void +arm_output_op2 (emit_f emit, const char *mnemonic, rtx d, rtx s) +{ + rtx operands[2]; + + operands[0] = d; + operands[1] = s; + arm_output_asm_insn (emit, 0, operands, "%s\t%%0, %%1", mnemonic); +} + +/* Helper to emit a three operand instruction. */ +static void +arm_output_op3 (emit_f emit, const char *mnemonic, rtx d, rtx a, rtx b) +{ + rtx operands[3]; + + operands[0] = d; + operands[1] = a; + operands[2] = b; + arm_output_asm_insn (emit, 0, operands, "%s\t%%0, %%1, %%2", mnemonic); +} + +/* Emit a load store exclusive synchronization loop. + + do + old_value = [mem] + if old_value != required_value + break; + t1 = sync_op (old_value, new_value) + [mem] = t1, t2 = [0|1] + while ! t2 + + Note: + t1 == t2 is not permitted + t1 == old_value is permitted + + required_value: + + RTX register or const_int representing the required old_value for + the modify to continue, if NULL no comparsion is performed. */ +static void +arm_output_sync_loop (emit_f emit, + enum machine_mode mode, + rtx old_value, + rtx memory, + rtx required_value, + rtx new_value, + rtx t1, + rtx t2, + enum attr_sync_op sync_op, + int early_barrier_required) +{ + rtx operands[1]; + + gcc_assert (t1 != t2); + + if (early_barrier_required) + arm_process_output_memory_barrier (emit, NULL); + + arm_output_asm_insn (emit, 1, operands, "%sLSYT%%=:", LOCAL_LABEL_PREFIX); + + arm_output_ldrex (emit, mode, old_value, memory); + + if (required_value) + { + rtx operands[2]; + + operands[0] = old_value; + operands[1] = required_value; + arm_output_asm_insn (emit, 0, operands, "cmp\t%%0, %%1"); + arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYB%%=", LOCAL_LABEL_PREFIX); + } + + switch (sync_op) + { + case SYNC_OP_ADD: + arm_output_op3 (emit, "add", t1, old_value, new_value); + break; + + case SYNC_OP_SUB: + arm_output_op3 (emit, "sub", t1, old_value, new_value); + break; + + case SYNC_OP_IOR: + arm_output_op3 (emit, "orr", t1, old_value, new_value); + break; + + case SYNC_OP_XOR: + arm_output_op3 (emit, "eor", t1, old_value, new_value); + break; + + case SYNC_OP_AND: + arm_output_op3 (emit,"and", t1, old_value, new_value); + break; + + case SYNC_OP_NAND: + arm_output_op3 (emit, "and", t1, old_value, new_value); + arm_output_op2 (emit, "mvn", t1, t1); + break; + + case SYNC_OP_NONE: + t1 = new_value; + break; + } + + arm_output_strex (emit, mode, "", t2, t1, memory); + operands[0] = t2; + arm_output_asm_insn (emit, 0, operands, "teq\t%%0, #0"); + arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYT%%=", LOCAL_LABEL_PREFIX); + + arm_process_output_memory_barrier (emit, NULL); + arm_output_asm_insn (emit, 1, operands, "%sLSYB%%=:", LOCAL_LABEL_PREFIX); +} + +static rtx +arm_get_sync_operand (rtx *operands, int index, rtx default_value) +{ + 
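arm_output_sync_loop above expands the retry loop sketched in its comment: load-exclusive, an optional compare against the required value, the arithmetic step, store-exclusive, and a branch back while the store-exclusive fails, bracketed by barriers. For reference, the same shape as a hand-written routine; this is an illustrative ARMv7 sketch with hypothetical names, and the barriers are omitted:

/* Sketch of the ldrex/strex retry loop generated above (SImode case).  */
static inline int
atomic_fetch_add_si (int *mem, int val)
{
  int old, tmp;
  unsigned int fail;

  __asm__ __volatile__ (
    "1:\tldrex\t%0, [%3]\n\t"
    "add\t%1, %0, %4\n\t"
    "strex\t%2, %1, [%3]\n\t"
    "teq\t%2, #0\n\t"
    "bne\t1b"
    : "=&r" (old), "=&r" (tmp), "=&r" (fail)
    : "r" (mem), "r" (val)
    : "memory", "cc");

  return old;
}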
if (index > 0) + default_value = operands[index - 1]; + + return default_value; +} + +#define FETCH_SYNC_OPERAND(NAME, DEFAULT) \ + arm_get_sync_operand (operands, (int) get_attr_sync_##NAME (insn), DEFAULT); + +/* Extract the operands for a synchroniztion instruction from the + instructions attributes and emit the instruction. */ +static void +arm_process_output_sync_insn (emit_f emit, rtx insn, rtx *operands) +{ + rtx result, memory, required_value, new_value, t1, t2; + int early_barrier; + enum machine_mode mode; + enum attr_sync_op sync_op; + + result = FETCH_SYNC_OPERAND(result, 0); + memory = FETCH_SYNC_OPERAND(memory, 0); + required_value = FETCH_SYNC_OPERAND(required_value, 0); + new_value = FETCH_SYNC_OPERAND(new_value, 0); + t1 = FETCH_SYNC_OPERAND(t1, 0); + t2 = FETCH_SYNC_OPERAND(t2, 0); + early_barrier = + get_attr_sync_release_barrier (insn) == SYNC_RELEASE_BARRIER_YES; + sync_op = get_attr_sync_op (insn); + mode = GET_MODE (memory); + + arm_output_sync_loop (emit, mode, result, memory, required_value, + new_value, t1, t2, sync_op, early_barrier); +} + +/* Emit a synchronization instruction loop. */ +const char * +arm_output_sync_insn (rtx insn, rtx *operands) +{ + arm_process_output_sync_insn (arm_emit, insn, operands); + return ""; +} + +/* Count the number of machine instruction that will be emitted for a + synchronization instruction. Note that the emitter used does not + emit instructions, it just counts instructions being carefull not + to count labels. */ +unsigned int +arm_sync_loop_insns (rtx insn, rtx *operands) +{ + arm_insn_count = 0; + arm_process_output_sync_insn (arm_count, insn, operands); + return arm_insn_count; +} + +/* Helper to call a target sync instruction generator, dealing with + the variation in operands required by the different generators. */ +static rtx +arm_call_generator (struct arm_sync_generator *generator, rtx old_value, + rtx memory, rtx required_value, rtx new_value) +{ + switch (generator->op) + { + case arm_sync_generator_omn: + gcc_assert (! required_value); + return generator->u.omn (old_value, memory, new_value); + + case arm_sync_generator_omrn: + gcc_assert (required_value); + return generator->u.omrn (old_value, memory, required_value, new_value); + } + + return NULL; +} + +/* Expand a synchronization loop. The synchronization loop is expanded + as an opaque block of instructions in order to ensure that we do + not subsequently get extraneous memory accesses inserted within the + critical region. The exclusive access property of ldrex/strex is + only guaranteed in there are no intervening memory accesses. 
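arm_sync_loop_insns above runs the very same expansion code as arm_output_sync_insn, only with a counting emitter substituted for the printing one, so the reported length can never drift out of sync with what is actually emitted. The callback idea in isolation, with hypothetical names:

#include <stdio.h>

/* Sketch of the emit-or-count callback pattern used above.  */
typedef void (*emit_fn) (const char *text, void *aux);

static void
emit_print (const char *text, void *aux)
{
  (void) aux;
  puts (text);
}

static void
emit_count (const char *text, void *aux)
{
  (void) text;
  ++*(unsigned int *) aux;
}

static void
walk_sequence (emit_fn emit, void *aux)
{
  emit ("first insn", aux);
  emit ("second insn", aux);
}

Calling walk_sequence (emit_count, &n) and later walk_sequence (emit_print, NULL) keeps the counted and printed sequences identical by construction, which is the point of the pattern.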
*/ +void +arm_expand_sync (enum machine_mode mode, + struct arm_sync_generator *generator, + rtx target, rtx memory, rtx required_value, rtx new_value) +{ + if (target == NULL) + target = gen_reg_rtx (mode); + + memory = arm_legitimize_sync_memory (memory); + if (mode != SImode) + { + rtx load_temp = gen_reg_rtx (SImode); + + if (required_value) + required_value = convert_modes (SImode, mode, required_value, true); + + new_value = convert_modes (SImode, mode, new_value, true); + emit_insn (arm_call_generator (generator, load_temp, memory, + required_value, new_value)); + emit_move_insn (target, gen_lowpart (mode, load_temp)); + } + else + { + emit_insn (arm_call_generator (generator, target, memory, required_value, + new_value)); + } +} + +static bool +arm_vector_alignment_reachable (const_tree type, bool is_packed) +{ + /* Vectors which aren't in packed structures will not be less aligned than + the natural alignment of their element type, so this is safe. */ + if (TARGET_NEON && !BYTES_BIG_ENDIAN) + return !is_packed; + + return default_builtin_vector_alignment_reachable (type, is_packed); +} + +static bool +arm_builtin_support_vector_misalignment (enum machine_mode mode, + const_tree type, int misalignment, + bool is_packed) +{ + if (TARGET_NEON && !BYTES_BIG_ENDIAN) + { + HOST_WIDE_INT align = TYPE_ALIGN_UNIT (type); + + if (is_packed) + return align == 1; + + /* If the misalignment is unknown, we should be able to handle the access + so long as it is not to a member of a packed data structure. */ + if (misalignment == -1) + return true; + + /* Return true if the misalignment is a multiple of the natural alignment + of the vector's element type. This is probably always going to be + true in practice, since we've already established that this isn't a + packed access. */ + return ((misalignment % align) == 0); + } + + return default_builtin_support_vector_misalignment (mode, type, misalignment, + is_packed); +} + #include "gt-arm.h"
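arm_builtin_support_vector_misalignment above only reports unaligned vector accesses as handled on little-endian Neon, and applies a stricter rule when the access is into a packed object. The kind of source construct that decision governs (plain C; the struct and function are illustrative):

#include <stdint.h>

/* Accesses to 'data' are misaligned by one byte because of the packed
   attribute; whether the vectorizer may still use vector loads here is
   what the hook above decides.  */
struct __attribute__ ((packed)) sample
{
  uint8_t tag;
  int32_t data[8];
};

int32_t
sum_data (const struct sample *s)
{
  int i;
  int32_t total = 0;

  for (i = 0; i < 8; i++)
    total += s->data[i];
  return total;
}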