X-Git-Url: http://git.sourceforge.jp/view?a=blobdiff_plain;f=gcc%2Fconfig%2Fi386%2Fi386.c;h=cdd35b2bf41136891c3ca356f8f1474a3bc465ab;hb=6b195b535c9b98b364e7d6cea3a1ad104811a420;hp=16dfe454b3976f59ea11432c211ff49ee033d21e;hpb=cbf586882d7da8e8e9b3f98e03d0c501f1b842ed;p=pf3gnuchains%2Fgcc-fork.git diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 16dfe454b39..cdd35b2bf41 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -30,7 +30,6 @@ Boston, MA 02111-1307, USA. */ #include "real.h" #include "insn-config.h" #include "conditions.h" -#include "insn-flags.h" #include "output.h" #include "insn-attr.h" #include "flags.h" @@ -41,6 +40,8 @@ Boston, MA 02111-1307, USA. */ #include "toplev.h" #include "basic-block.h" #include "ggc.h" +#include "target.h" +#include "target-def.h" #ifndef CHECK_STACK_LIMIT #define CHECK_STACK_LIMIT -1 @@ -291,6 +292,7 @@ const int x86_use_bit_test = m_386; const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON | m_K6; const int x86_cmove = m_PPRO | m_ATHLON | m_PENT4; const int x86_deep_branch = m_PPRO | m_K6 | m_ATHLON | m_PENT4; +const int x86_branch_hints = m_PENT4; const int x86_use_sahf = m_PPRO | m_K6 | m_PENT4; const int x86_partial_reg_stall = m_PPRO; const int x86_use_loop = m_K6; @@ -313,6 +315,9 @@ const int x86_add_esp_8 = m_ATHLON | m_PPRO | m_K6 | m_386 | m_486 | m_PENT4; const int x86_integer_DFmode_moves = ~(m_ATHLON | m_PENT4); const int x86_partial_reg_dependency = m_ATHLON | m_PENT4; const int x86_memory_mismatch_stall = m_ATHLON | m_PENT4; +const int x86_accumulate_outgoing_args = m_ATHLON | m_PENT4 | m_PPRO; +const int x86_prologue_using_move = m_ATHLON | m_PENT4 | m_PPRO; +const int x86_epilogue_using_move = m_ATHLON | m_PENT4 | m_PPRO; #define AT_BP(mode) (gen_rtx_MEM ((mode), hard_frame_pointer_rtx)) @@ -442,16 +447,20 @@ int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] = struct rtx_def *ix86_compare_op0 = NULL_RTX; struct rtx_def *ix86_compare_op1 = NULL_RTX; -#define MAX_386_STACK_LOCALS 2 +#define MAX_386_STACK_LOCALS 3 +/* Size of the register save area. */ +#define X86_64_VARARGS_SIZE (REGPARM_MAX * UNITS_PER_WORD + SSE_REGPARM_MAX * 16) /* Define the structure for the machine field in struct function. */ struct machine_function { rtx stack_locals[(int) MAX_MACHINE_MODE][MAX_386_STACK_LOCALS]; + int save_varrargs_registers; int accesses_prev_frame; }; #define ix86_stack_locals (cfun->machine->stack_locals) +#define ix86_save_varrargs_registers (cfun->machine->save_varrargs_registers) /* Structure describing stack frame layout. Stack grows downward: @@ -476,9 +485,11 @@ struct ix86_frame { int nregs; int padding1; + int va_arg_size; HOST_WIDE_INT frame; int padding2; int outgoing_arguments_size; + int red_zone_size; HOST_WIDE_INT to_allocate; /* The offsets relative to ARG_POINTER. */ @@ -527,14 +538,7 @@ int ix86_branch_cost; const char *ix86_branch_cost_string; /* Power of two alignment for functions. */ -int ix86_align_funcs; const char *ix86_align_funcs_string; - -/* Power of two alignment for loops. */ -int ix86_align_loops; - -/* Power of two alignment for non-loop jumps. 
*/ -int ix86_align_jumps; static void output_pic_addr_const PARAMS ((FILE *, rtx, int)); static void put_condition_code PARAMS ((enum rtx_code, enum machine_mode, @@ -561,12 +565,16 @@ static int ix86_split_to_parts PARAMS ((rtx, rtx *, enum machine_mode)); static int ix86_safe_length_prefix PARAMS ((rtx)); static int ix86_nsaved_regs PARAMS((void)); static void ix86_emit_save_regs PARAMS((void)); -static void ix86_emit_restore_regs_using_mov PARAMS ((rtx, int)); -static void ix86_emit_epilogue_esp_adjustment PARAMS((int)); +static void ix86_emit_save_regs_using_mov PARAMS ((rtx, HOST_WIDE_INT)); +static void ix86_emit_restore_regs_using_mov PARAMS ((rtx, int, int)); static void ix86_set_move_mem_attrs_1 PARAMS ((rtx, rtx, rtx, rtx, rtx)); static void ix86_sched_reorder_pentium PARAMS((rtx *, rtx *)); static void ix86_sched_reorder_ppro PARAMS((rtx *, rtx *)); static HOST_WIDE_INT ix86_GOT_alias_set PARAMS ((void)); +static void ix86_adjust_counter PARAMS ((rtx, HOST_WIDE_INT)); +static rtx ix86_zero_extend_to_Pmode PARAMS ((rtx)); +static rtx ix86_expand_aligntest PARAMS ((rtx, int)); +static void ix86_expand_strlensi_unroll_1 PARAMS ((rtx, rtx)); struct ix86_address { @@ -597,8 +605,26 @@ static int ix86_fp_comparison_arithmetics_cost PARAMS ((enum rtx_code code)); static int ix86_fp_comparison_fcomi_cost PARAMS ((enum rtx_code code)); static int ix86_fp_comparison_sahf_cost PARAMS ((enum rtx_code code)); static int ix86_fp_comparison_cost PARAMS ((enum rtx_code code)); -static int ix86_save_reg PARAMS ((int)); +static int ix86_save_reg PARAMS ((int, int)); static void ix86_compute_frame_layout PARAMS ((struct ix86_frame *)); +static int ix86_comp_type_attributes PARAMS ((tree, tree)); + +/* Initialize the GCC target structure. */ +#undef TARGET_VALID_TYPE_ATTRIBUTE +#ifdef TARGET_DLLIMPORT_DECL_ATTRIBUTES +# define TARGET_VALID_TYPE_ATTRIBUTE i386_pe_valid_type_attribute_p +# undef TARGET_VALID_DECL_ATTRIBUTE +# define TARGET_VALID_DECL_ATTRIBUTE i386_pe_valid_decl_attribute_p +# undef TARGET_MERGE_DECL_ATTRIBUTES +# define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes +#else +# define TARGET_VALID_TYPE_ATTRIBUTE ix86_valid_type_attribute_p +#endif + +#undef TARGET_COMP_TYPE_ATTRIBUTES +#define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes + +struct gcc_target target = TARGET_INITIALIZER; /* Sometimes certain combinations of command options do not make sense on a particular target machine. You can define a macro @@ -692,6 +718,9 @@ override_options () ix86_cmodel_string, TARGET_64BIT ? "64" : "32"); if (ix86_cmodel == CM_LARGE) sorry ("Code model `large' not supported yet."); + if ((TARGET_64BIT != 0) != ((target_flags & MASK_64BIT) != 0)) + sorry ("%i-bit mode not compiled in.", + (target_flags & MASK_64BIT) ? 64 : 32); if (ix86_arch_string != 0) { @@ -738,49 +767,70 @@ override_options () else ix86_regparm = i; } + else + if (TARGET_64BIT) + ix86_regparm = REGPARM_MAX; - /* Validate -malign-loops= value, or provide default. */ - ix86_align_loops = processor_target_table[ix86_cpu].align_loop; + /* If the user has provided any of the -malign-* options, + warn and use that value only if -falign-* is not set. + Remove this code in GCC 3.2 or later. 
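+     (The old -malign-* options took the log2 of the alignment as their
+     argument, hence the 1 << i conversion below; the -falign-* options
+     take the byte alignment directly.)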
*/ if (ix86_align_loops_string) { - i = atoi (ix86_align_loops_string); - if (i < 0 || i > MAX_CODE_ALIGN) - error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN); - else - ix86_align_loops = i; + warning ("-malign-loops is obsolete, use -falign-loops"); + if (align_loops == 0) + { + i = atoi (ix86_align_loops_string); + if (i < 0 || i > MAX_CODE_ALIGN) + error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN); + else + align_loops = 1 << i; + } } - /* Validate -malign-jumps= value, or provide default. */ - ix86_align_jumps = processor_target_table[ix86_cpu].align_jump; if (ix86_align_jumps_string) { - i = atoi (ix86_align_jumps_string); - if (i < 0 || i > MAX_CODE_ALIGN) - error ("-malign-jumps=%d is not between 0 and %d", i, MAX_CODE_ALIGN); - else - ix86_align_jumps = i; + warning ("-malign-jumps is obsolete, use -falign-jumps"); + if (align_jumps == 0) + { + i = atoi (ix86_align_jumps_string); + if (i < 0 || i > MAX_CODE_ALIGN) + error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN); + else + align_jumps = 1 << i; + } } - /* Validate -malign-functions= value, or provide default. */ - ix86_align_funcs = processor_target_table[ix86_cpu].align_func; if (ix86_align_funcs_string) { - i = atoi (ix86_align_funcs_string); - if (i < 0 || i > MAX_CODE_ALIGN) - error ("-malign-functions=%d is not between 0 and %d", - i, MAX_CODE_ALIGN); - else - ix86_align_funcs = i; + warning ("-malign-functions is obsolete, use -falign-functions"); + if (align_functions == 0) + { + i = atoi (ix86_align_funcs_string); + if (i < 0 || i > MAX_CODE_ALIGN) + error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN); + else + align_functions = 1 << i; + } } + /* Default align_* from the processor table. */ +#define abs(n) (n < 0 ? -n : n) + if (align_loops == 0) + align_loops = 1 << abs (processor_target_table[ix86_cpu].align_loop); + if (align_jumps == 0) + align_jumps = 1 << abs (processor_target_table[ix86_cpu].align_jump); + if (align_functions == 0) + align_functions = 1 << abs (processor_target_table[ix86_cpu].align_func); + /* Validate -mpreferred-stack-boundary= value, or provide default. The default of 128 bits is for Pentium III's SSE __m128. */ ix86_preferred_stack_boundary = 128; if (ix86_preferred_stack_boundary_string) { i = atoi (ix86_preferred_stack_boundary_string); - if (i < 2 || i > 31) - error ("-mpreferred-stack-boundary=%d is not between 2 and 31", i); + if (i < (TARGET_64BIT ? 3 : 2) || i > 31) + error ("-mpreferred-stack-boundary=%d is not between %d and 31", i, + TARGET_64BIT ? 3 : 2); else ix86_preferred_stack_boundary = (1 << i) * BITS_PER_UNIT; } @@ -809,6 +859,11 @@ override_options () on by -msse. */ if (TARGET_SSE) target_flags |= MASK_MMX; + + if ((x86_accumulate_outgoing_args & CPUMASK) + && !(target_flags & MASK_NO_ACCUMULATE_OUTGOING_ARGS) + && !optimize_size) + target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; } void @@ -825,20 +880,6 @@ optimization_options (level, size) } /* Return nonzero if IDENTIFIER with arguments ARGS is a valid machine specific - attribute for DECL. The attributes in ATTRIBUTES have previously been - assigned to DECL. */ - -int -ix86_valid_decl_attribute_p (decl, attributes, identifier, args) - tree decl ATTRIBUTE_UNUSED; - tree attributes ATTRIBUTE_UNUSED; - tree identifier ATTRIBUTE_UNUSED; - tree args ATTRIBUTE_UNUSED; -{ - return 0; -} - -/* Return nonzero if IDENTIFIER with arguments ARGS is a valid machine specific attribute for TYPE. 
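   (The attributes handled here are stdcall, cdecl and regparm; stdcall
   and cdecl are rejected on TARGET_64BIT, which has a single calling
   convention.)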
The attributes in ATTRIBUTES have previously been assigned to TYPE. */ @@ -857,11 +898,13 @@ ix86_valid_type_attribute_p (type, attributes, identifier, args) /* Stdcall attribute says callee is responsible for popping arguments if they are not variable. */ - if (is_attribute_p ("stdcall", identifier)) + if (is_attribute_p ("stdcall", identifier) + && !TARGET_64BIT) return (args == NULL_TREE); /* Cdecl attribute says the callee is a normal C declaration. */ - if (is_attribute_p ("cdecl", identifier)) + if (is_attribute_p ("cdecl", identifier) + && !TARGET_64BIT) return (args == NULL_TREE); /* Regparm attribute specifies how many integer arguments are to be @@ -892,7 +935,7 @@ ix86_valid_type_attribute_p (type, attributes, identifier, args) are compatible, and 2 if they are nearly compatible (which causes a warning to be generated). */ -int +static int ix86_comp_type_attributes (type1, type2) tree type1; tree type2; @@ -950,7 +993,8 @@ ix86_return_pops_args (fundecl, funtype, size) } /* Lose any fake structure return argument. */ - if (aggregate_value_p (TREE_TYPE (funtype))) + if (aggregate_value_p (TREE_TYPE (funtype)) + && !TARGET_64BIT) return GET_MODE_SIZE (Pmode); return 0; @@ -1089,6 +1133,9 @@ function_arg (cum, mode, type, named) (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode); int words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD; + if (mode == VOIDmode) + return constm1_rtx; + switch (mode) { /* For now, pass fp/complex values on the stack. */ @@ -1127,6 +1174,101 @@ function_arg (cum, mode, type, named) } +/* Return nonzero if OP is general operand representable on x86_64. */ + +int +x86_64_general_operand (op, mode) + rtx op; + enum machine_mode mode; +{ + if (!TARGET_64BIT) + return general_operand (op, mode); + if (nonimmediate_operand (op, mode)) + return 1; + return x86_64_sign_extended_value (op); +} + +/* Return nonzero if OP is general operand representable on x86_64 + as eighter sign extended or zero extended constant. */ + +int +x86_64_szext_general_operand (op, mode) + rtx op; + enum machine_mode mode; +{ + if (!TARGET_64BIT) + return general_operand (op, mode); + if (nonimmediate_operand (op, mode)) + return 1; + return x86_64_sign_extended_value (op) || x86_64_zero_extended_value (op); +} + +/* Return nonzero if OP is nonmemory operand representable on x86_64. */ + +int +x86_64_nonmemory_operand (op, mode) + rtx op; + enum machine_mode mode; +{ + if (!TARGET_64BIT) + return nonmemory_operand (op, mode); + if (register_operand (op, mode)) + return 1; + return x86_64_sign_extended_value (op); +} + +/* Return nonzero if OP is nonmemory operand acceptable by movabs patterns. */ + +int +x86_64_movabs_operand (op, mode) + rtx op; + enum machine_mode mode; +{ + if (!TARGET_64BIT || !flag_pic) + return nonmemory_operand (op, mode); + if (register_operand (op, mode) || x86_64_sign_extended_value (op)) + return 1; + if (CONSTANT_P (op) && !symbolic_reference_mentioned_p (op)) + return 1; + return 0; +} + +/* Return nonzero if OP is nonmemory operand representable on x86_64. */ + +int +x86_64_szext_nonmemory_operand (op, mode) + rtx op; + enum machine_mode mode; +{ + if (!TARGET_64BIT) + return nonmemory_operand (op, mode); + if (register_operand (op, mode)) + return 1; + return x86_64_sign_extended_value (op) || x86_64_zero_extended_value (op); +} + +/* Return nonzero if OP is immediate operand representable on x86_64. 
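+   That is, a constant acceptable as a sign-extended 32-bit immediate
+   in the instruction encoding.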
*/ + +int +x86_64_immediate_operand (op, mode) + rtx op; + enum machine_mode mode; +{ + if (!TARGET_64BIT) + return immediate_operand (op, mode); + return x86_64_sign_extended_value (op); +} + +/* Return nonzero if OP is immediate operand representable on x86_64. */ + +int +x86_64_zext_immediate_operand (op, mode) + rtx op; + enum machine_mode mode ATTRIBUTE_UNUSED; +{ + return x86_64_zero_extended_value (op); +} + /* Return nonzero if OP is (const_int 1), else return zero. */ int @@ -1287,23 +1429,27 @@ const248_operand (op, mode) int incdec_operand (op, mode) register rtx op; - enum machine_mode mode; + enum machine_mode mode ATTRIBUTE_UNUSED; { /* On Pentium4, the inc and dec operations causes extra dependancy on flag registers, since carry flag is not set. */ if (TARGET_PENTIUM4 && !optimize_size) return 0; - if (op == const1_rtx || op == constm1_rtx) - return 1; - if (GET_CODE (op) != CONST_INT) - return 0; - if (mode == SImode && INTVAL (op) == (HOST_WIDE_INT) 0xffffffff) - return 1; - if (mode == HImode && INTVAL (op) == (HOST_WIDE_INT) 0xffff) - return 1; - if (mode == QImode && INTVAL (op) == (HOST_WIDE_INT) 0xff) - return 1; - return 0; + return op == const1_rtx || op == constm1_rtx; +} + +/* Return nonzero if OP is acceptable as operand of DImode shift + expander. */ + +int +shiftdi_operand (op, mode) + rtx op; + enum machine_mode mode ATTRIBUTE_UNUSED; +{ + if (TARGET_64BIT) + return nonimmediate_operand (op, mode); + else + return register_operand (op, mode); } /* Return false if this is the stack pointer, or any other fake @@ -1350,6 +1496,10 @@ general_no_elim_operand (op, mode) || t == virtual_incoming_args_rtx || t == virtual_stack_vars_rtx || t == virtual_stack_dynamic_rtx) return 0; + if (REG_P (t) + && REGNO (t) >= FIRST_VIRTUAL_REGISTER + && REGNO (t) <= LAST_VIRTUAL_REGISTER) + return 0; return general_operand (op, mode); } @@ -1559,9 +1709,17 @@ ext_register_operand (op, mode) register rtx op; enum machine_mode mode ATTRIBUTE_UNUSED; { - if (GET_MODE (op) != SImode && GET_MODE (op) != HImode) + int regno; + if ((!TARGET_64BIT || GET_MODE (op) != DImode) + && GET_MODE (op) != SImode && GET_MODE (op) != HImode) + return 0; + + if (!register_operand (op, VOIDmode)) return 0; - return register_operand (op, VOIDmode); + + /* Be curefull to accept only registers having upper parts. */ + regno = REG_P (op) ? REGNO (op) : REGNO (SUBREG_REG (op)); + return (regno > LAST_VIRTUAL_REGISTER || regno < 4); } /* Return 1 if this is a valid binary floating-point operation. @@ -1643,7 +1801,7 @@ cmpsi_operand (op, mode) rtx op; enum machine_mode mode; { - if (general_operand (op, mode)) + if (nonimmediate_operand (op, mode)) return 1; if (GET_CODE (op) == AND @@ -1844,7 +2002,7 @@ x86_64_sign_extended_value (value) else { HOST_WIDE_INT val = trunc_int_for_mode (INTVAL (value), DImode); - return (HOST_WIDE_INT)(int)val == val; + return trunc_int_for_mode (val, SImode) == val; } break; @@ -2084,6 +2242,9 @@ load_pic_register () { rtx gotsym, pclab; + if (TARGET_64BIT) + abort(); + gotsym = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_"); if (TARGET_DEEP_BRANCH_PREDICTION) @@ -2105,31 +2266,50 @@ load_pic_register () emit_insn (gen_prologue_set_got (pic_offset_table_rtx, gotsym, pclab)); } -/* Generate an SImode "push" pattern for input ARG. */ +/* Generate an "push" pattern for input ARG. 
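+   The MEM is created in Pmode so the same pattern serves both 32-bit
+   and 64-bit targets.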
*/ static rtx gen_push (arg) rtx arg; { return gen_rtx_SET (VOIDmode, - gen_rtx_MEM (SImode, - gen_rtx_PRE_DEC (SImode, + gen_rtx_MEM (Pmode, + gen_rtx_PRE_DEC (Pmode, stack_pointer_rtx)), arg); } /* Return 1 if we need to save REGNO. */ static int -ix86_save_reg (regno) - int regno; -{ - int pic_reg_used = flag_pic && (current_function_uses_pic_offset_table - || current_function_uses_const_pool); - return ((regs_ever_live[regno] && !call_used_regs[regno] - && !fixed_regs[regno] - && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed)) - || (regno == PIC_OFFSET_TABLE_REGNUM && pic_reg_used)); +ix86_save_reg (regno, maybe_eh_return) + int regno; + int maybe_eh_return; +{ + if (flag_pic + && ! TARGET_64BIT + && regno == PIC_OFFSET_TABLE_REGNUM + && (current_function_uses_pic_offset_table + || current_function_uses_const_pool + || current_function_calls_eh_return)) + return 1; + if (current_function_calls_eh_return && maybe_eh_return) + { + unsigned i; + for (i = 0; ; i++) + { + unsigned test = EH_RETURN_DATA_REGNO(i); + if (test == INVALID_REGNUM) + break; + if (test == (unsigned) regno) + return 1; + } + } + + return (regs_ever_live[regno] + && !call_used_regs[regno] + && !fixed_regs[regno] + && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed)); } /* Return number of registers to be saved on the stack. */ @@ -2141,7 +2321,7 @@ ix86_nsaved_regs () int regno; for (regno = FIRST_PSEUDO_REGISTER - 1; regno >= 0; regno--) - if (ix86_save_reg (regno)) + if (ix86_save_reg (regno, true)) nregs++; return nregs; } @@ -2214,6 +2394,15 @@ ix86_compute_frame_layout (frame) /* Register save area */ offset += frame->nregs * UNITS_PER_WORD; + /* Va-arg area */ + if (ix86_save_varrargs_registers) + { + offset += X86_64_VARARGS_SIZE; + frame->va_arg_size = X86_64_VARARGS_SIZE; + } + else + frame->va_arg_size = 0; + /* Align start of frame for local function. */ frame->padding1 = ((offset + stack_alignment_needed - 1) & -stack_alignment_needed) - offset; @@ -2246,15 +2435,28 @@ ix86_compute_frame_layout (frame) /* Size prologue needs to allocate. 
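     That is the local frame plus both paddings, the va_arg register save
     area and any accumulated outgoing arguments; the part that can be
     served from the x86-64 red zone is subtracted again just below.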
*/ frame->to_allocate = (size + frame->padding1 + frame->padding2 - + frame->outgoing_arguments_size); + + frame->outgoing_arguments_size + frame->va_arg_size); + if (TARGET_64BIT && TARGET_RED_ZONE && current_function_sp_is_unchanging + && current_function_is_leaf) + { + frame->red_zone_size = frame->to_allocate; + if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE) + frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE; + } + else + frame->red_zone_size = 0; + frame->to_allocate -= frame->red_zone_size; + frame->stack_pointer_offset -= frame->red_zone_size; #if 0 fprintf (stderr, "nregs: %i\n", frame->nregs); fprintf (stderr, "size: %i\n", size); fprintf (stderr, "alignment1: %i\n", stack_alignment_needed); fprintf (stderr, "padding1: %i\n", frame->padding1); + fprintf (stderr, "va_arg: %i\n", frame->va_arg_size); fprintf (stderr, "padding2: %i\n", frame->padding2); fprintf (stderr, "to_allocate: %i\n", frame->to_allocate); + fprintf (stderr, "red_zone_size: %i\n", frame->red_zone_size); fprintf (stderr, "frame_pointer_offset: %i\n", frame->frame_pointer_offset); fprintf (stderr, "hard_frame_pointer_offset: %i\n", frame->hard_frame_pointer_offset); @@ -2271,10 +2473,32 @@ ix86_emit_save_regs () rtx insn; for (regno = FIRST_PSEUDO_REGISTER - 1; regno >= 0; regno--) - if (ix86_save_reg (regno)) + if (ix86_save_reg (regno, true)) + { + insn = emit_insn (gen_push (gen_rtx_REG (Pmode, regno))); + RTX_FRAME_RELATED_P (insn) = 1; + } +} + +/* Emit code to save registers using MOV insns. First register + is restored from POINTER + OFFSET. */ +static void +ix86_emit_save_regs_using_mov (pointer, offset) + rtx pointer; + HOST_WIDE_INT offset; +{ + int regno; + rtx insn; + + for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) + if (ix86_save_reg (regno, true)) { - insn = emit_insn (gen_push (gen_rtx_REG (SImode, regno))); + insn = emit_move_insn (adj_offsettable_operand (gen_rtx_MEM (Pmode, + pointer), + offset), + gen_rtx_REG (Pmode, regno)); RTX_FRAME_RELATED_P (insn) = 1; + offset += UNITS_PER_WORD; } } @@ -2284,9 +2508,12 @@ void ix86_expand_prologue () { rtx insn; - int pic_reg_used = flag_pic && (current_function_uses_pic_offset_table - || current_function_uses_const_pool); + int pic_reg_used = (flag_pic && (current_function_uses_pic_offset_table + || current_function_uses_const_pool) + && !TARGET_64BIT); struct ix86_frame frame; + int use_mov = (TARGET_PROLOGUE_USING_MOVE && !optimize_size); + HOST_WIDE_INT allocate; ix86_compute_frame_layout (&frame); @@ -2302,19 +2529,24 @@ ix86_expand_prologue () RTX_FRAME_RELATED_P (insn) = 1; } - ix86_emit_save_regs (); + allocate = frame.to_allocate; + /* In case we are dealing only with single register and empty frame, + push is equivalent of the mov+add sequence. */ + if (allocate == 0 && frame.nregs <= 1) + use_mov = 0; - if (frame.to_allocate == 0) + if (!use_mov) + ix86_emit_save_regs (); + else + allocate += frame.nregs * UNITS_PER_WORD; + + if (allocate == 0) ; - else if (! TARGET_STACK_PROBE || frame.to_allocate < CHECK_STACK_LIMIT) + else if (! 
TARGET_STACK_PROBE || allocate < CHECK_STACK_LIMIT) { - if (frame_pointer_needed) - insn = emit_insn (gen_pro_epilogue_adjust_stack - (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (-frame.to_allocate), hard_frame_pointer_rtx)); - else - insn = emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (-frame.to_allocate))); + insn = emit_insn (gen_pro_epilogue_adjust_stack + (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (-allocate))); RTX_FRAME_RELATED_P (insn) = 1; } else @@ -2323,17 +2555,28 @@ ix86_expand_prologue () rtx arg0, sym; + if (TARGET_64BIT) + abort(); + arg0 = gen_rtx_REG (SImode, 0); - emit_move_insn (arg0, GEN_INT (frame.to_allocate)); + emit_move_insn (arg0, GEN_INT (allocate)); sym = gen_rtx_MEM (FUNCTION_MODE, gen_rtx_SYMBOL_REF (Pmode, "_alloca")); - insn = emit_call_insn (gen_call (sym, const0_rtx)); + insn = emit_call_insn (gen_call (sym, const0_rtx, constm1_rtx)); CALL_INSN_FUNCTION_USAGE (insn) = gen_rtx_EXPR_LIST (VOIDmode, gen_rtx_USE (VOIDmode, arg0), CALL_INSN_FUNCTION_USAGE (insn)); } + if (use_mov) + { + if (!frame_pointer_needed || !frame.to_allocate) + ix86_emit_save_regs_using_mov (stack_pointer_rtx, frame.to_allocate); + else + ix86_emit_save_regs_using_mov (hard_frame_pointer_rtx, + -frame.nregs * UNITS_PER_WORD); + } #ifdef SUBTARGET_PROLOGUE SUBTARGET_PROLOGUE; @@ -2349,36 +2592,18 @@ ix86_expand_prologue () emit_insn (gen_blockage ()); } -/* Emit code to add TSIZE to esp value. Use POP instruction when - profitable. */ - -static void -ix86_emit_epilogue_esp_adjustment (tsize) - int tsize; -{ - /* If a frame pointer is present, we must be sure to tie the sp - to the fp so that we don't mis-schedule. */ - if (frame_pointer_needed) - emit_insn (gen_pro_epilogue_adjust_stack (stack_pointer_rtx, - stack_pointer_rtx, - GEN_INT (tsize), - hard_frame_pointer_rtx)); - else - emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (tsize))); -} - /* Emit code to restore saved registers using MOV insns. First register is restored from POINTER + OFFSET. */ static void -ix86_emit_restore_regs_using_mov (pointer, offset) - rtx pointer; - int offset; +ix86_emit_restore_regs_using_mov (pointer, offset, maybe_eh_return) + rtx pointer; + int offset; + int maybe_eh_return; { int regno; for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (ix86_save_reg (regno)) + if (ix86_save_reg (regno, maybe_eh_return)) { emit_move_insn (gen_rtx_REG (Pmode, regno), adj_offsettable_operand (gen_rtx_MEM (Pmode, @@ -2391,8 +2616,8 @@ ix86_emit_restore_regs_using_mov (pointer, offset) /* Restore function stack, frame, and registers. */ void -ix86_expand_epilogue (emit_return) - int emit_return; +ix86_expand_epilogue (style) + int style; { int regno; int sp_valid = !frame_pointer_needed || current_function_sp_is_unchanging; @@ -2401,8 +2626,14 @@ ix86_expand_epilogue (emit_return) ix86_compute_frame_layout (&frame); - /* Calculate start of saved registers relative to ebp. */ - offset = -frame.nregs * UNITS_PER_WORD; + /* Calculate start of saved registers relative to ebp. Special care + must be taken for the normal return case of a function using + eh_return: the eax and edx registers are marked as saved, but not + restored along this path. */ + offset = frame.nregs; + if (current_function_calls_eh_return && style != 2) + offset -= 2; + offset *= -UNITS_PER_WORD; #ifdef FUNCTION_BLOCK_PROFILER_EXIT if (profile_block_flag == 2) @@ -2422,9 +2653,12 @@ ix86_expand_epilogue (emit_return) and there is exactly one register to pop. 
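     The mov-based path is also forced for eh_return epilogues
     (style == 2), which must address the saved registers explicitly and
     additionally adjust the stack pointer by EH_RETURN_STACKADJ_RTX.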
This heruistic may need some tuning in future. */ if ((!sp_valid && frame.nregs <= 1) + || (TARGET_EPILOGUE_USING_MOVE && !optimize_size + && (frame.nregs > 1 || frame.to_allocate)) || (frame_pointer_needed && !frame.nregs && frame.to_allocate) || (frame_pointer_needed && TARGET_USE_LEAVE && !optimize_size - && frame.nregs == 1)) + && frame.nregs == 1) + || style == 2) { /* Restore registers. We can use ebp or esp to address the memory locations. If both are available, default to ebp, since offsets @@ -2433,23 +2667,54 @@ ix86_expand_epilogue (emit_return) mode. */ if (!frame_pointer_needed || (sp_valid && !frame.to_allocate)) - ix86_emit_restore_regs_using_mov (stack_pointer_rtx, frame.to_allocate); + ix86_emit_restore_regs_using_mov (stack_pointer_rtx, + frame.to_allocate, style == 2); else - ix86_emit_restore_regs_using_mov (hard_frame_pointer_rtx, offset); + ix86_emit_restore_regs_using_mov (hard_frame_pointer_rtx, + offset, style == 2); + + /* eh_return epilogues need %ecx added to the stack pointer. */ + if (style == 2) + { + rtx tmp, sa = EH_RETURN_STACKADJ_RTX; - if (!frame_pointer_needed) - ix86_emit_epilogue_esp_adjustment (frame.to_allocate - + frame.nregs * UNITS_PER_WORD); + if (frame_pointer_needed) + { + tmp = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa); + tmp = plus_constant (tmp, UNITS_PER_WORD); + emit_insn (gen_rtx_SET (VOIDmode, sa, tmp)); + + tmp = gen_rtx_MEM (Pmode, hard_frame_pointer_rtx); + emit_move_insn (hard_frame_pointer_rtx, tmp); + + emit_insn (gen_pro_epilogue_adjust_stack + (stack_pointer_rtx, sa, const0_rtx)); + } + else + { + tmp = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa); + tmp = plus_constant (tmp, (frame.to_allocate + + frame.nregs * UNITS_PER_WORD)); + emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, tmp)); + } + } + else if (!frame_pointer_needed) + emit_insn (gen_pro_epilogue_adjust_stack + (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (frame.to_allocate + + frame.nregs * UNITS_PER_WORD))); /* If not an i386, mov & pop is faster than "leave". */ else if (TARGET_USE_LEAVE || optimize_size) - emit_insn (gen_leave ()); + emit_insn (TARGET_64BIT ? gen_leave_rex64 () : gen_leave ()); else { emit_insn (gen_pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx, - const0_rtx, - hard_frame_pointer_rtx)); - emit_insn (gen_popsi1 (hard_frame_pointer_rtx)); + const0_rtx)); + if (TARGET_64BIT) + emit_insn (gen_popdi1 (hard_frame_pointer_rtx)); + else + emit_insn (gen_popsi1 (hard_frame_pointer_rtx)); } } else @@ -2462,21 +2727,32 @@ ix86_expand_epilogue (emit_return) abort (); emit_insn (gen_pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx, - GEN_INT (offset), - hard_frame_pointer_rtx)); + GEN_INT (offset))); } else if (frame.to_allocate) - ix86_emit_epilogue_esp_adjustment (frame.to_allocate); + emit_insn (gen_pro_epilogue_adjust_stack + (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (frame.to_allocate))); for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (ix86_save_reg (regno)) - emit_insn (gen_popsi1 (gen_rtx_REG (SImode, regno))); + if (ix86_save_reg (regno, false)) + { + if (TARGET_64BIT) + emit_insn (gen_popdi1 (gen_rtx_REG (Pmode, regno))); + else + emit_insn (gen_popsi1 (gen_rtx_REG (Pmode, regno))); + } if (frame_pointer_needed) - emit_insn (gen_popsi1 (hard_frame_pointer_rtx)); + { + if (TARGET_64BIT) + emit_insn (gen_popdi1 (hard_frame_pointer_rtx)); + else + emit_insn (gen_popsi1 (hard_frame_pointer_rtx)); + } } /* Sibcall epilogues don't want a return instruction. */ - if (! 
emit_return) + if (style == 0) return; if (current_function_pops_args && current_function_args_size) @@ -2491,6 +2767,10 @@ ix86_expand_epilogue (emit_return) { rtx ecx = gen_rtx_REG (SImode, 2); + /* There are is no "pascal" calling convention in 64bit ABI. */ + if (TARGET_64BIT) + abort(); + emit_insn (gen_popsi1 (ecx)); emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx, popc)); emit_jump_insn (gen_return_indirect_internal (ecx)); @@ -2881,14 +3161,30 @@ legitimate_address_p (mode, addr, strict) goto report_error; } - if (GET_CODE (disp) == CONST_DOUBLE) + if (TARGET_64BIT) { - reason = "displacement is a const_double"; - goto report_error; + if (!x86_64_sign_extended_value (disp)) + { + reason = "displacement is out of range"; + goto report_error; + } + } + else + { + if (GET_CODE (disp) == CONST_DOUBLE) + { + reason = "displacement is a const_double"; + goto report_error; + } } if (flag_pic && SYMBOLIC_CONST (disp)) { + if (TARGET_64BIT && (index || base)) + { + reason = "non-constant pic memory reference"; + goto report_error; + } if (! legitimate_pic_address_disp_p (disp)) { reason = "displacement is an invalid pic construct"; @@ -3528,7 +3824,7 @@ print_reg (x, code, file) if (ASSEMBLER_DIALECT == 0 || USER_LABEL_PREFIX[0] == 0) putc ('%', file); - if (code == 'w') + if (code == 'w' || MMX_REG_P (x)) code = 2; else if (code == 'b') code = 1; @@ -3540,8 +3836,6 @@ print_reg (x, code, file) code = 3; else if (code == 'h') code = 0; - else if (code == 'm' || MMX_REG_P (x)) - code = 5; else code = GET_MODE_SIZE (GET_MODE (x)); @@ -3549,9 +3843,11 @@ print_reg (x, code, file) from the normal registers. */ if (REX_INT_REG_P (x)) { + if (!TARGET_64BIT) + abort (); switch (code) { - case 5: + case 0: error ("Extended registers have no high halves\n"); break; case 1: @@ -3574,9 +3870,6 @@ print_reg (x, code, file) } switch (code) { - case 5: - fputs (hi_reg_name[REGNO (x)], file); - break; case 3: if (STACK_TOP_P (x)) { @@ -3588,7 +3881,7 @@ print_reg (x, code, file) case 4: case 12: if (! ANY_FP_REG_P (x)) - putc (code == 8 ? 'r' : 'e', file); + putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file); /* FALLTHRU */ case 16: case 2: @@ -3609,6 +3902,7 @@ print_reg (x, code, file) L,W,B,Q,S,T -- print the opcode suffix for specified size of operand. C -- print opcode suffix for set/cmov insn. c -- like C, but print reversed condition + F,f -- likewise, but for floating-point. R -- print the prefix for register names. z -- print the opcode suffix for the size of the current operand. * -- print a star (in certain assembler syntax) @@ -3621,10 +3915,11 @@ print_reg (x, code, file) w -- likewise, print the HImode name of the register. k -- likewise, print the SImode name of the register. q -- likewise, print the DImode name of the register. - h -- print the QImode name for a "high" register, either ah, bh, ch or dh. - y -- print "st(0)" instead of "st" as a register. - m -- print "st(n)" as an mmx register. + h -- print the QImode name for a "high" register, either ah, bh, ch or dh. + y -- print "st(0)" instead of "st" as a register. D -- print condition for SSE cmp instruction. + P -- if PIC, print an @PLT suffix. + X -- don't print any sort of PIC '@' suffix for a symbol. 
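+   + -- print a branch prediction hint prefix: "ds" before a branch
+        predicted taken, "cs" before one predicted not taken; emitted
+        only when the CPU's default prediction would disagree
+        (TARGET_BRANCH_PREDICTION_HINTS).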
*/ void @@ -3747,7 +4042,6 @@ print_operand (file, x, code) case 'q': case 'h': case 'y': - case 'm': case 'X': case 'P': break; @@ -3815,7 +4109,39 @@ print_operand (file, x, code) case 'f': put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file); return; + case '+': + { + rtx x; + if (!optimize || optimize_size || !TARGET_BRANCH_PREDICTION_HINTS) + return; + + x = find_reg_note (current_output_insn, REG_BR_PROB, 0); + if (x) + { + int pred_val = INTVAL (XEXP (x, 0)); + + if (pred_val < REG_BR_PROB_BASE * 45 / 100 + || pred_val > REG_BR_PROB_BASE * 55 / 100) + { + int taken = pred_val > REG_BR_PROB_BASE / 2; + int cputaken = final_forward_branch_p (current_output_insn) == 0; + + /* Emit hints only in the case default branch prediction + heruistics would fail. */ + if (taken != cputaken) + { + /* We use 3e (DS) prefix for taken branches and + 2e (CS) prefix for not taken branches. */ + if (taken) + fputs ("ds ; ", file); + else + fputs ("cs ; ", file); + } + } + } + return; + } default: { char str[50]; @@ -3863,6 +4189,10 @@ print_operand (file, x, code) x = XEXP (x, 0); if (flag_pic && CONSTANT_ADDRESS_P (x)) output_pic_addr_const (file, x, code); + /* Avoid (%rip) for call operands. */ + else if (CONSTANT_ADDRESS_P (x) && code =='P' + && GET_CODE (x) != CONST_INT) + output_addr_const (file, x); else output_address (x); } @@ -3965,6 +4295,10 @@ print_operand_address (file, addr) output_pic_addr_const (file, addr, 0); else output_addr_const (file, addr); + + /* Use one byte shorter RIP relative addressing for 64bit mode. */ + if (GET_CODE (disp) != CONST_INT && TARGET_64BIT) + fputs ("(%rip)", file); } else { @@ -4070,14 +4404,16 @@ split_di (operands, num, lo_half, hi_half) } else if (GET_CODE (op) == REG) { + if (TARGET_64BIT) + abort(); lo_half[num] = gen_rtx_REG (SImode, REGNO (op)); hi_half[num] = gen_rtx_REG (SImode, REGNO (op) + 1); } else if (offsettable_memref_p (op)) { - rtx lo_addr = XEXP (op, 0); rtx hi_addr = XEXP (adj_offsettable_operand (op, 4), 0); - lo_half[num] = change_address (op, SImode, lo_addr); + + lo_half[num] = adjust_address (op, SImode, 0); hi_half[num] = change_address (op, SImode, hi_addr); } else @@ -4309,6 +4645,25 @@ output_387_binary_op (insn, operands) return buf; } +/* Output code to initialize control word copies used by + trunc?f?i patterns. NORMAL is set to current control word, while ROUND_DOWN + is set to control word rounding downwards. */ +void +emit_i387_cw_initialization (normal, round_down) + rtx normal, round_down; +{ + rtx reg = gen_reg_rtx (HImode); + + emit_insn (gen_x86_fnstcw_1 (normal)); + emit_move_insn (reg, normal); + if (!TARGET_PARTIAL_REG_STALL && !optimize_size + && !TARGET_64BIT) + emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc))); + else + emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0xc00))); + emit_move_insn (round_down, reg); +} + /* Output code for INSN to convert a float to a signed int. OPERANDS are the insn operands. The output may be [HSD]Imode and the input operand may be [SDX]Fmode. */ @@ -4328,45 +4683,19 @@ output_fix_trunc (insn, operands) if (dimode_p && !stack_top_dies) output_asm_insn ("fld\t%y1", operands); - if (! 
STACK_TOP_P (operands[1])) + if (!STACK_TOP_P (operands[1])) abort (); - xops[0] = GEN_INT (12); - xops[1] = adj_offsettable_operand (operands[2], 1); - xops[1] = change_address (xops[1], QImode, NULL_RTX); - - xops[2] = operands[0]; if (GET_CODE (operands[0]) != MEM) - xops[2] = operands[3]; - - output_asm_insn ("fnstcw\t%2", operands); - output_asm_insn ("mov{l}\t{%2, %4|%4, %2}", operands); - output_asm_insn ("mov{b}\t{%0, %1|%1, %0}", xops); - output_asm_insn ("fldcw\t%2", operands); - output_asm_insn ("mov{l}\t{%4, %2|%2, %4}", operands); + abort (); + output_asm_insn ("fldcw\t%3", operands); if (stack_top_dies || dimode_p) - output_asm_insn ("fistp%z2\t%2", xops); + output_asm_insn ("fistp%z0\t%0", operands); else - output_asm_insn ("fist%z2\t%2", xops); - + output_asm_insn ("fist%z0\t%0", operands); output_asm_insn ("fldcw\t%2", operands); - if (GET_CODE (operands[0]) != MEM) - { - if (dimode_p) - { - split_di (operands+0, 1, xops+0, xops+1); - split_di (operands+3, 1, xops+2, xops+3); - output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops); - output_asm_insn ("mov{l}\t{%3, %1|%1, %3}", xops); - } - else if (GET_MODE (operands[0]) == SImode) - output_asm_insn ("mov{l}\t{%3, %0|%0, %3}", operands); - else - output_asm_insn ("mov{w}\t{%3, %0|%0, %3}", operands); - } - return ""; } @@ -5631,6 +5960,7 @@ ix86_expand_branch (code, label) case QImode: case HImode: case SImode: + simple: tmp = ix86_expand_compare (code, NULL, NULL); tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp, gen_rtx_LABEL_REF (VOIDmode, label), @@ -5674,6 +6004,8 @@ ix86_expand_branch (code, label) } case DImode: + if (TARGET_64BIT) + goto simple; /* Expand DImode branch into multiple compare+branch. */ { rtx lo[2], hi[2], label2; @@ -5850,7 +6182,8 @@ ix86_expand_setcc (code, dest) rtx second_test, bypass_test; int type; - if (GET_MODE (ix86_compare_op0) == DImode) + if (GET_MODE (ix86_compare_op0) == DImode + && !TARGET_64BIT) return 0; /* FAIL */ /* Three modes of generation: @@ -5967,6 +6300,7 @@ ix86_expand_int_movcc (operands) HImode insns, we'd be swallowed in word prefix ops. */ if (GET_MODE (operands[0]) != HImode + && GET_MODE (operands[0]) != DImode && GET_CODE (operands[2]) == CONST_INT && GET_CODE (operands[3]) == CONST_INT) { @@ -6010,7 +6344,7 @@ ix86_expand_int_movcc (operands) * Size 5 - 8. */ if (ct) - emit_insn (gen_addsi3 (out, out, GEN_INT (ct))); + emit_insn (gen_addsi3 (tmp, tmp, GEN_INT (ct))); } else if (cf == -1) { @@ -6021,7 +6355,7 @@ ix86_expand_int_movcc (operands) * * Size 8. */ - emit_insn (gen_iorsi3 (out, out, GEN_INT (ct))); + emit_insn (gen_iorsi3 (tmp, tmp, GEN_INT (ct))); } else if (diff == -1 && ct) { @@ -6035,7 +6369,7 @@ ix86_expand_int_movcc (operands) */ emit_insn (gen_one_cmplsi2 (tmp, tmp)); if (cf) - emit_insn (gen_addsi3 (out, out, GEN_INT (cf))); + emit_insn (gen_addsi3 (tmp, tmp, GEN_INT (cf))); } else { @@ -6047,9 +6381,10 @@ ix86_expand_int_movcc (operands) * * Size 8 - 11. */ - emit_insn (gen_andsi3 (out, out, GEN_INT (cf - ct))); + emit_insn (gen_andsi3 (tmp, tmp, GEN_INT (trunc_int_for_mode + (cf - ct, SImode)))); if (ct) - emit_insn (gen_addsi3 (out, out, GEN_INT (ct))); + emit_insn (gen_addsi3 (tmp, tmp, GEN_INT (ct))); } if (tmp != out) @@ -6100,28 +6435,46 @@ ix86_expand_int_movcc (operands) ix86_compare_op1, VOIDmode, 0, 1); nops = 0; + /* On x86_64 the lea instruction operates on Pmode, so we need to get arithmetics + done in proper mode to match. 
*/ if (diff == 1) - tmp = out; + { + if (Pmode != SImode) + tmp = gen_lowpart (Pmode, out); + else + tmp = out; + } else { - tmp = gen_rtx_MULT (SImode, out, GEN_INT (diff & ~1)); + rtx out1; + if (Pmode != SImode) + out1 = gen_lowpart (Pmode, out); + else + out1 = out; + tmp = gen_rtx_MULT (Pmode, out1, GEN_INT (diff & ~1)); nops++; if (diff & 1) { - tmp = gen_rtx_PLUS (SImode, tmp, out); + tmp = gen_rtx_PLUS (Pmode, tmp, out1); nops++; } } if (cf != 0) { - tmp = gen_rtx_PLUS (SImode, tmp, GEN_INT (cf)); + tmp = gen_rtx_PLUS (Pmode, tmp, GEN_INT (cf)); nops++; } - if (tmp != out) + if (tmp != out + && (GET_CODE (tmp) != SUBREG || SUBREG_REG (tmp) != out)) { - if (nops == 0) - emit_move_insn (out, tmp); - else if (nops == 1) + if (Pmode != SImode) + tmp = gen_rtx_SUBREG (SImode, tmp, 0); + + /* ??? We should to take care for outputing non-lea arithmetics + for Pmode != SImode case too, but it is quite tricky and not + too important, since all TARGET_64BIT machines support real + conditional moves. */ + if (nops == 1 && Pmode == SImode) { rtx clob; @@ -6186,7 +6539,8 @@ ix86_expand_int_movcc (operands) ix86_compare_op1, VOIDmode, 0, 1); emit_insn (gen_addsi3 (out, out, constm1_rtx)); - emit_insn (gen_andsi3 (out, out, GEN_INT (cf-ct))); + emit_insn (gen_andsi3 (out, out, GEN_INT (trunc_int_for_mode + (cf - ct, SImode)))); if (ct != 0) emit_insn (gen_addsi3 (out, out, GEN_INT (ct))); if (out != operands[0]) @@ -6277,6 +6631,9 @@ ix86_expand_int_movcc (operands) emit_move_insn (tmp, operands[2]); operands[2] = tmp; } + if (! register_operand (operands[2], VOIDmode) + && ! register_operand (operands[3], VOIDmode)) + operands[2] = force_reg (GET_MODE (operands[0]), operands[2]); emit_insn (compare_seq); emit_insn (gen_rtx_SET (VOIDmode, operands[0], @@ -6312,6 +6669,9 @@ ix86_expand_fp_movcc (operands) if (((TARGET_SSE && GET_MODE (operands[0]) == SFmode) || (TARGET_SSE2 && GET_MODE (operands[0]) == DFmode)) && GET_MODE (ix86_compare_op0) == GET_MODE (operands[0]) + /* The SSE comparisons does not support the LTGT/UNEQ pair. */ + && (!TARGET_IEEE_FP + || (GET_CODE (operands[1]) != LTGT && GET_CODE (operands[1]) != UNEQ)) /* We may be called from the post-reload splitter. */ && (!REG_P (operands[0]) || SSE_REG_P (operands[0]) @@ -6371,12 +6731,15 @@ ix86_expand_fp_movcc (operands) ix86_compare_op1); } /* Similary try to manage result to be first operand of conditional - move. */ - if (rtx_equal_p (operands[0], operands[3])) + move. We also don't support the NE comparison on SSE, so try to + avoid it. */ + if ((rtx_equal_p (operands[0], operands[3]) + && (!TARGET_IEEE_FP || GET_CODE (operands[1]) != EQ)) + || (GET_CODE (operands[1]) == NE && TARGET_IEEE_FP)) { rtx tmp = operands[2]; operands[2] = operands[3]; - operands[2] = tmp; + operands[3] = tmp; operands[1] = gen_rtx_fmt_ee (reverse_condition_maybe_unordered (GET_CODE (operands[1])), VOIDmode, ix86_compare_op0, @@ -6458,7 +6821,12 @@ ix86_split_to_parts (operand, parts, mode) rtx *parts; enum machine_mode mode; { - int size = mode == TFmode ? 3 : GET_MODE_SIZE (mode) / 4; + int size; + + if (!TARGET_64BIT) + size = mode == TFmode ? 3 : (GET_MODE_SIZE (mode) / 4); + else + size = (GET_MODE_SIZE (mode) + 4) / 8; if (GET_CODE (operand) == REG && MMX_REGNO_P (REGNO (operand))) abort (); @@ -6479,10 +6847,11 @@ ix86_split_to_parts (operand, parts, mode) if (! 
push_operand (operand, VOIDmode)) abort (); - PUT_MODE (operand, SImode); + operand = copy_rtx (operand); + PUT_MODE (operand, Pmode); parts[0] = parts[1] = parts[2] = operand; } - else + else if (!TARGET_64BIT) { if (mode == DImode) split_di (&operand, 1, &parts[0], &parts[1]); @@ -6499,7 +6868,7 @@ ix86_split_to_parts (operand, parts, mode) } else if (offsettable_memref_p (operand)) { - PUT_MODE (operand, SImode); + operand = adjust_address (operand, SImode, 0); parts[0] = operand; parts[1] = adj_offsettable_operand (operand, 4); if (size == 3) @@ -6531,6 +6900,42 @@ ix86_split_to_parts (operand, parts, mode) abort (); } } + else + { + if (mode == XFmode || mode == TFmode) + { + if (REG_P (operand)) + { + if (!reload_completed) + abort (); + parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0); + parts[1] = gen_rtx_REG (SImode, REGNO (operand) + 1); + } + else if (offsettable_memref_p (operand)) + { + operand = change_address (operand, DImode, XEXP (operand, 0)); + parts[0] = operand; + parts[1] = adj_offsettable_operand (operand, 8); + parts[1] = change_address (parts[1], SImode, XEXP (parts[1], 0)); + } + else if (GET_CODE (operand) == CONST_DOUBLE) + { + REAL_VALUE_TYPE r; + long l[3]; + + REAL_VALUE_FROM_CONST_DOUBLE (r, operand); + REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l); + /* Do not use shift by 32 to avoid warning on 32bit systems. */ + if (HOST_BITS_PER_WIDE_INT >= 64) + parts[0] = GEN_INT (l[0] + ((l[1] << 31) << 1)); + else + parts[0] = immed_double_const (l[0], l[1], DImode); + parts[1] = GEN_INT (l[2]); + } + else + abort (); + } + } return size; } @@ -6540,19 +6945,39 @@ ix86_split_to_parts (operand, parts, mode) insns have been emitted. Operands 2-4 contain the input values int the correct order; operands 5-7 contain the output values. */ -int -ix86_split_long_move (operands1) - rtx operands1[]; +void +ix86_split_long_move (operands) + rtx operands[]; { rtx part[2][3]; - rtx operands[2]; - int size; + int nparts; int push = 0; int collisions = 0; + enum machine_mode mode = GET_MODE (operands[0]); + + /* The DFmode expanders may ask us to move double. + For 64bit target this is single move. By hiding the fact + here we simplify i386.md splitters. */ + if (GET_MODE_SIZE (GET_MODE (operands[0])) == 8 && TARGET_64BIT) + { + /* Optimize constant pool reference to immediates. This is used by fp moves, + that force all constants to memory to allow combining. */ - /* Make our own copy to avoid clobbering the operands. */ - operands[0] = copy_rtx (operands1[0]); - operands[1] = copy_rtx (operands1[1]); + if (GET_CODE (operands[1]) == MEM + && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF + && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0))) + operands[1] = get_pool_constant (XEXP (operands[1], 0)); + if (push_operand (operands[0], VOIDmode)) + { + operands[0] = copy_rtx (operands[0]); + PUT_MODE (operands[0], Pmode); + } + else + operands[0] = gen_lowpart (DImode, operands[0]); + operands[1] = gen_lowpart (DImode, operands[1]); + emit_move_insn (operands[0], operands[1]); + return; + } /* The only non-offsettable memory we handle is push. */ if (push_operand (operands[0], VOIDmode)) @@ -6561,16 +6986,18 @@ ix86_split_long_move (operands1) && ! 
offsettable_memref_p (operands[0])) abort (); - size = ix86_split_to_parts (operands[0], part[0], GET_MODE (operands1[0])); - ix86_split_to_parts (operands[1], part[1], GET_MODE (operands1[0])); + nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0])); + ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0])); /* When emitting push, take care for source operands on the stack. */ if (push && GET_CODE (operands[1]) == MEM && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1])) { - if (size == 3) - part[1][1] = part[1][2]; - part[1][0] = part[1][1]; + if (nparts == 3) + part[1][1] = change_address (part[1][1], GET_MODE (part[1][1]), + XEXP (part[1][2], 0)); + part[1][0] = change_address (part[1][0], GET_MODE (part[1][0]), + XEXP (part[1][1], 0)); } /* We need to do copy in the right order in case an address register @@ -6581,12 +7008,12 @@ ix86_split_long_move (operands1) collisions++; if (reg_overlap_mentioned_p (part[0][1], XEXP (part[1][0], 0))) collisions++; - if (size == 3 + if (nparts == 3 && reg_overlap_mentioned_p (part[0][2], XEXP (part[1][0], 0))) collisions++; /* Collision in the middle part can be handled by reordering. */ - if (collisions == 1 && size == 3 + if (collisions == 1 && nparts == 3 && reg_overlap_mentioned_p (part[0][1], XEXP (part[1][0], 0))) { rtx tmp; @@ -6599,79 +7026,109 @@ ix86_split_long_move (operands1) else if (collisions > 1) { collisions = 1; - emit_insn (gen_rtx_SET (VOIDmode, part[0][size - 1], + emit_insn (gen_rtx_SET (VOIDmode, part[0][nparts - 1], XEXP (part[1][0], 0))); - part[1][0] = change_address (part[1][0], SImode, part[0][size - 1]); - part[1][1] = adj_offsettable_operand (part[1][0], 4); - if (size == 3) + part[1][0] = change_address (part[1][0], + TARGET_64BIT ? DImode : SImode, + part[0][nparts - 1]); + part[1][1] = adj_offsettable_operand (part[1][0], + UNITS_PER_WORD); + part[1][1] = change_address (part[1][1], GET_MODE (part[0][1]), + XEXP (part[1][1], 0)); + if (nparts == 3) part[1][2] = adj_offsettable_operand (part[1][0], 8); } } if (push) { - if (size == 3) + if (!TARGET_64BIT) { - /* We use only first 12 bytes of TFmode value, but for pushing we - are required to adjust stack as if we were pushing real 16byte - value. */ - if (GET_MODE (operands1[0]) == TFmode) - emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx, - GEN_INT (-4))); - emit_insn (gen_push (part[1][2])); + if (nparts == 3) + { + /* We use only first 12 bytes of TFmode value, but for pushing we + are required to adjust stack as if we were pushing real 16byte + value. */ + if (mode == TFmode && !TARGET_64BIT) + emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (-4))); + emit_move_insn (part[0][2], part[1][2]); + } } - emit_insn (gen_push (part[1][1])); - emit_insn (gen_push (part[1][0])); - return 1; + else + { + /* In 64bit mode we don't have 32bit push available. In case this is + register, it is OK - we will just use larger counterpart. We also + retype memory - these comes from attempt to avoid REX prefix on + moving of second half of TFmode value. 
*/ + if (GET_MODE (part[1][1]) == SImode) + { + if (GET_CODE (part[1][1]) == MEM) + part[1][1] = adjust_address (part[1][1], DImode, 0); + else if (REG_P (part[1][1])) + part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1])); + else + abort(); + if (GET_MODE (part[1][0]) == SImode) + part[1][0] = part[1][1]; + } + } + emit_move_insn (part[0][1], part[1][1]); + emit_move_insn (part[0][0], part[1][0]); + return; } /* Choose correct order to not overwrite the source before it is copied. */ if ((REG_P (part[0][0]) && REG_P (part[1][1]) && (REGNO (part[0][0]) == REGNO (part[1][1]) - || (size == 3 + || (nparts == 3 && REGNO (part[0][0]) == REGNO (part[1][2])))) || (collisions > 0 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0)))) { - if (size == 3) + if (nparts == 3) { - operands1[2] = part[0][2]; - operands1[3] = part[0][1]; - operands1[4] = part[0][0]; - operands1[5] = part[1][2]; - operands1[6] = part[1][1]; - operands1[7] = part[1][0]; + operands[2] = part[0][2]; + operands[3] = part[0][1]; + operands[4] = part[0][0]; + operands[5] = part[1][2]; + operands[6] = part[1][1]; + operands[7] = part[1][0]; } else { - operands1[2] = part[0][1]; - operands1[3] = part[0][0]; - operands1[5] = part[1][1]; - operands1[6] = part[1][0]; + operands[2] = part[0][1]; + operands[3] = part[0][0]; + operands[5] = part[1][1]; + operands[6] = part[1][0]; } } else { - if (size == 3) + if (nparts == 3) { - operands1[2] = part[0][0]; - operands1[3] = part[0][1]; - operands1[4] = part[0][2]; - operands1[5] = part[1][0]; - operands1[6] = part[1][1]; - operands1[7] = part[1][2]; + operands[2] = part[0][0]; + operands[3] = part[0][1]; + operands[4] = part[0][2]; + operands[5] = part[1][0]; + operands[6] = part[1][1]; + operands[7] = part[1][2]; } else { - operands1[2] = part[0][0]; - operands1[3] = part[0][1]; - operands1[5] = part[1][0]; - operands1[6] = part[1][1]; + operands[2] = part[0][0]; + operands[3] = part[0][1]; + operands[5] = part[1][0]; + operands[6] = part[1][1]; } } + emit_move_insn (operands[2], operands[5]); + emit_move_insn (operands[3], operands[6]); + if (nparts == 3) + emit_move_insn (operands[4], operands[7]); - return 0; + return; } void @@ -6840,6 +7297,542 @@ ix86_split_lshrdi (operands, scratch) } } +/* Helper function for the string operations bellow. Dest VARIABLE whether + it is aligned to VALUE bytes. If true, jump to the label. */ +static rtx +ix86_expand_aligntest (variable, value) + rtx variable; + int value; +{ + rtx label = gen_label_rtx (); + rtx tmpcount = gen_reg_rtx (GET_MODE (variable)); + if (GET_MODE (variable) == DImode) + emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value))); + else + emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value))); + emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable), + 1, 0, label); + return label; +} + +/* Adjust COUNTER by the VALUE. */ +static void +ix86_adjust_counter (countreg, value) + rtx countreg; + HOST_WIDE_INT value; +{ + if (GET_MODE (countreg) == DImode) + emit_insn (gen_adddi3 (countreg, countreg, GEN_INT (-value))); + else + emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-value))); +} + +/* Zero extend possibly SImode EXP to Pmode register. */ +static rtx +ix86_zero_extend_to_Pmode (exp) + rtx exp; +{ + rtx r; + if (GET_MODE (exp) == VOIDmode) + return force_reg (Pmode, exp); + if (GET_MODE (exp) == Pmode) + return copy_to_mode_reg (Pmode, exp); + r = gen_reg_rtx (Pmode); + emit_insn (gen_zero_extendsidi2 (r, exp)); + return r; +} + +/* Expand string move (memcpy) operation. 
Use i386 string operations when + profitable. expand_clrstr contains similar code. */ +int +ix86_expand_movstr (dst, src, count_exp, align_exp) + rtx dst, src, count_exp, align_exp; +{ + rtx srcreg, destreg, countreg; + enum machine_mode counter_mode; + HOST_WIDE_INT align = 0; + unsigned HOST_WIDE_INT count = 0; + rtx insns; + + start_sequence (); + + if (GET_CODE (align_exp) == CONST_INT) + align = INTVAL (align_exp); + + /* This simple hack avoids all inlining code and simplifies code bellow. */ + if (!TARGET_ALIGN_STRINGOPS) + align = 64; + + if (GET_CODE (count_exp) == CONST_INT) + count = INTVAL (count_exp); + + /* Figure out proper mode for counter. For 32bits it is always SImode, + for 64bits use SImode when possible, otherwise DImode. + Set count to number of bytes copied when known at compile time. */ + if (!TARGET_64BIT || GET_MODE (count_exp) == SImode + || x86_64_zero_extended_value (count_exp)) + counter_mode = SImode; + else + counter_mode = DImode; + + if (counter_mode != SImode && counter_mode != DImode) + abort (); + + destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0)); + srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0)); + + emit_insn (gen_cld ()); + + /* When optimizing for size emit simple rep ; movsb instruction for + counts not divisible by 4. */ + + if ((!optimize || optimize_size) && (count == 0 || (count & 0x03))) + { + countreg = ix86_zero_extend_to_Pmode (count_exp); + if (TARGET_64BIT) + emit_insn (gen_rep_movqi_rex64 (destreg, srcreg, countreg, + destreg, srcreg, countreg)); + else + emit_insn (gen_rep_movqi (destreg, srcreg, countreg, + destreg, srcreg, countreg)); + } + + /* For constant aligned (or small unaligned) copies use rep movsl + followed by code copying the rest. For PentiumPro ensure 8 byte + alignment to allow rep movsl acceleration. */ + + else if (count != 0 + && (align >= 8 + || (!TARGET_PENTIUMPRO && !TARGET_64BIT && align >= 4) + || optimize_size || count < (unsigned int)64)) + { + int size = TARGET_64BIT && !optimize_size ? 8 : 4; + if (count & ~(size - 1)) + { + countreg = copy_to_mode_reg (counter_mode, + GEN_INT ((count >> (size == 4 ? 2 : 3)) + & (TARGET_64BIT ? -1 : 0x3fffffff))); + countreg = ix86_zero_extend_to_Pmode (countreg); + if (size == 4) + { + if (TARGET_64BIT) + emit_insn (gen_rep_movsi_rex64 (destreg, srcreg, countreg, + destreg, srcreg, countreg)); + else + emit_insn (gen_rep_movsi (destreg, srcreg, countreg, + destreg, srcreg, countreg)); + } + else + emit_insn (gen_rep_movdi_rex64 (destreg, srcreg, countreg, + destreg, srcreg, countreg)); + } + if (size == 8 && (count & 0x04)) + emit_insn (gen_strmovsi (destreg, srcreg)); + if (count & 0x02) + emit_insn (gen_strmovhi (destreg, srcreg)); + if (count & 0x01) + emit_insn (gen_strmovqi (destreg, srcreg)); + } + /* The generic code based on the glibc implementation: + - align destination to 4 bytes (8 byte alignment is used for PentiumPro + allowing accelerated copying there) + - copy the data using rep movsl + - copy the rest. */ + else + { + rtx countreg2; + rtx label = NULL; + + /* In case we don't know anything about the alignment, default to + library version, since it is usually equally fast and result in + shorter code. 
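+      Only TARGET_INLINE_ALL_STRINGOPS overrides this and forces the
+      inline expansion even for unknown alignment.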
*/ + if (!TARGET_INLINE_ALL_STRINGOPS && align < UNITS_PER_WORD) + { + end_sequence (); + return 0; + } + + if (TARGET_SINGLE_STRINGOP) + emit_insn (gen_cld ()); + + countreg2 = gen_reg_rtx (Pmode); + countreg = copy_to_mode_reg (counter_mode, count_exp); + + /* We don't use loops to align destination and to copy parts smaller + than 4 bytes, because gcc is able to optimize such code better (in + the case the destination or the count really is aligned, gcc is often + able to predict the branches) and also it is friendlier to the + hardware branch prediction. + + Using loops is benefical for generic case, because we can + handle small counts using the loops. Many CPUs (such as Athlon) + have large REP prefix setup costs. + + This is quite costy. Maybe we can revisit this decision later or + add some customizability to this code. */ + + if (count == 0 + && align < (TARGET_PENTIUMPRO && (count == 0 + || count >= (unsigned int)260) + ? 8 : UNITS_PER_WORD)) + { + label = gen_label_rtx (); + emit_cmp_and_jump_insns (countreg, GEN_INT (UNITS_PER_WORD - 1), + LEU, 0, counter_mode, 1, 0, label); + } + if (align <= 1) + { + rtx label = ix86_expand_aligntest (destreg, 1); + emit_insn (gen_strmovqi (destreg, srcreg)); + ix86_adjust_counter (countreg, 1); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align <= 2) + { + rtx label = ix86_expand_aligntest (destreg, 2); + emit_insn (gen_strmovhi (destreg, srcreg)); + ix86_adjust_counter (countreg, 2); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align <= 4 + && ((TARGET_PENTIUMPRO && (count == 0 + || count >= (unsigned int)260)) + || TARGET_64BIT)) + { + rtx label = ix86_expand_aligntest (destreg, 4); + emit_insn (gen_strmovsi (destreg, srcreg)); + ix86_adjust_counter (countreg, 4); + emit_label (label); + LABEL_NUSES (label) = 1; + } + + if (!TARGET_SINGLE_STRINGOP) + emit_insn (gen_cld ()); + if (TARGET_64BIT) + { + emit_insn (gen_lshrdi3 (countreg2, ix86_zero_extend_to_Pmode (countreg), + GEN_INT (3))); + emit_insn (gen_rep_movdi_rex64 (destreg, srcreg, countreg2, + destreg, srcreg, countreg2)); + } + else + { + emit_insn (gen_lshrsi3 (countreg2, countreg, GEN_INT (2))); + emit_insn (gen_rep_movsi (destreg, srcreg, countreg2, + destreg, srcreg, countreg2)); + } + + if (label) + { + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (TARGET_64BIT && align > 4 && count != 0 && (count & 4)) + emit_insn (gen_strmovsi (destreg, srcreg)); + if ((align <= 4 || count == 0) && TARGET_64BIT) + { + rtx label = ix86_expand_aligntest (countreg, 4); + emit_insn (gen_strmovsi (destreg, srcreg)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align > 2 && count != 0 && (count & 2)) + emit_insn (gen_strmovhi (destreg, srcreg)); + if (align <= 2 || count == 0) + { + rtx label = ix86_expand_aligntest (countreg, 2); + emit_insn (gen_strmovhi (destreg, srcreg)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align > 1 && count != 0 && (count & 1)) + emit_insn (gen_strmovqi (destreg, srcreg)); + if (align <= 1 || count == 0) + { + rtx label = ix86_expand_aligntest (countreg, 1); + emit_insn (gen_strmovqi (destreg, srcreg)); + emit_label (label); + LABEL_NUSES (label) = 1; + } + } + + insns = get_insns (); + end_sequence (); + + ix86_set_move_mem_attrs (insns, dst, src, destreg, srcreg); + emit_insns (insns); + return 1; +} + +/* Expand string clear operation (bzero). Use i386 string operations when + profitable. expand_movstr contains similar code. 
+  else if (count != 0
+	   && (align >= 8
+	       || (!TARGET_PENTIUMPRO && !TARGET_64BIT && align >= 4)
+	       || optimize_size || count < (unsigned int)64))
+    {
+      int size = TARGET_64BIT && !optimize_size ? 8 : 4;
+      zeroreg = copy_to_mode_reg (size == 4 ? SImode : DImode, const0_rtx);
+      if (count & ~(size - 1))
+	{
+	  countreg = copy_to_mode_reg (counter_mode,
+				       GEN_INT ((count >> (size == 4 ? 2 : 3))
+						& (TARGET_64BIT ? -1 : 0x3fffffff)));
+	  countreg = ix86_zero_extend_to_Pmode (countreg);
+	  if (size == 4)
+	    {
+	      if (TARGET_64BIT)
+		emit_insn (gen_rep_stossi_rex64 (destreg, countreg, zeroreg,
+						 destreg, countreg));
+	      else
+		emit_insn (gen_rep_stossi (destreg, countreg, zeroreg,
+					   destreg, countreg));
+	    }
+	  else
+	    emit_insn (gen_rep_stosdi_rex64 (destreg, countreg, zeroreg,
+					     destreg, countreg));
+	}
+      if (size == 8 && (count & 0x04))
+	emit_insn (gen_strsetsi (destreg,
+				 gen_rtx_SUBREG (SImode, zeroreg, 0)));
+      if (count & 0x02)
+	emit_insn (gen_strsethi (destreg,
+				 gen_rtx_SUBREG (HImode, zeroreg, 0)));
+      if (count & 0x01)
+	emit_insn (gen_strsetqi (destreg,
+				 gen_rtx_SUBREG (QImode, zeroreg, 0)));
+    }
+  else
+    {
+      rtx countreg2;
+      rtx label = NULL;
+
+      /* In case we don't know anything about the alignment, default to
+	 the library version, since it is usually equally fast and results
+	 in shorter code.  */
+      if (!TARGET_INLINE_ALL_STRINGOPS && align < UNITS_PER_WORD)
+	return 0;
+
+      if (TARGET_SINGLE_STRINGOP)
+	emit_insn (gen_cld ());
+
+      countreg2 = gen_reg_rtx (Pmode);
+      countreg = copy_to_mode_reg (counter_mode, count_exp);
+      zeroreg = copy_to_mode_reg (Pmode, const0_rtx);
+
+      if (count == 0
+	  && align < (TARGET_PENTIUMPRO && (count == 0
+					    || count >= (unsigned int)260)
+		      ? 8 : UNITS_PER_WORD))
+	{
+	  label = gen_label_rtx ();
+	  emit_cmp_and_jump_insns (countreg, GEN_INT (UNITS_PER_WORD - 1),
+				   LEU, 0, counter_mode, 1, 0, label);
+	}
+      if (align <= 1)
+	{
+	  rtx label = ix86_expand_aligntest (destreg, 1);
+	  emit_insn (gen_strsetqi (destreg,
+				   gen_rtx_SUBREG (QImode, zeroreg, 0)));
+	  ix86_adjust_counter (countreg, 1);
+	  emit_label (label);
+	  LABEL_NUSES (label) = 1;
+	}
+      if (align <= 2)
+	{
+	  rtx label = ix86_expand_aligntest (destreg, 2);
+	  emit_insn (gen_strsethi (destreg,
+				   gen_rtx_SUBREG (HImode, zeroreg, 0)));
+	  ix86_adjust_counter (countreg, 2);
+	  emit_label (label);
+	  LABEL_NUSES (label) = 1;
+	}
+      if (align <= 4 && TARGET_PENTIUMPRO && (count == 0
+					      || count >= (unsigned int)260))
+	{
+	  rtx label = ix86_expand_aligntest (destreg, 4);
+	  emit_insn (gen_strsetsi (destreg, (TARGET_64BIT
+					     ? gen_rtx_SUBREG (SImode, zeroreg, 0)
+					     : zeroreg)));
+	  ix86_adjust_counter (countreg, 4);
+	  emit_label (label);
+	  LABEL_NUSES (label) = 1;
+	}
+
+      if (!TARGET_SINGLE_STRINGOP)
+	emit_insn (gen_cld ());
+      if (TARGET_64BIT)
+	{
+	  emit_insn (gen_lshrdi3 (countreg2, ix86_zero_extend_to_Pmode (countreg),
+				  GEN_INT (3)));
+	  emit_insn (gen_rep_stosdi_rex64 (destreg, countreg2, zeroreg,
+					   destreg, countreg2));
+	}
+      else
+	{
+	  emit_insn (gen_lshrsi3 (countreg2, countreg, GEN_INT (2)));
+	  emit_insn (gen_rep_stossi (destreg, countreg2, zeroreg,
+				     destreg, countreg2));
+	}
+
+      if (label)
+	{
+	  emit_label (label);
+	  LABEL_NUSES (label) = 1;
+	}
+      if (TARGET_64BIT && align > 4 && count != 0 && (count & 4))
+	emit_insn (gen_strsetsi (destreg,
+				 gen_rtx_SUBREG (SImode, zeroreg, 0)));
+      if (TARGET_64BIT && (align <= 4 || count == 0))
+	{
+	  rtx label = ix86_expand_aligntest (destreg, 2);
+	  emit_insn (gen_strsetsi (destreg,
+				   gen_rtx_SUBREG (SImode, zeroreg, 0)));
+	  emit_label (label);
+	  LABEL_NUSES (label) = 1;
+	}
+      if (align > 2 && count != 0 && (count & 2))
+	emit_insn (gen_strsethi (destreg,
+				 gen_rtx_SUBREG (HImode, zeroreg, 0)));
+      if (align <= 2 || count == 0)
+	{
+	  rtx label = ix86_expand_aligntest (destreg, 2);
+	  emit_insn (gen_strsethi (destreg,
+				   gen_rtx_SUBREG (HImode, zeroreg, 0)));
+	  emit_label (label);
+	  LABEL_NUSES (label) = 1;
+	}
+      if (align > 1 && count != 0 && (count & 1))
+	emit_insn (gen_strsetqi (destreg,
+				 gen_rtx_SUBREG (QImode, zeroreg, 0)));
+      if (align <= 1 || count == 0)
+	{
+	  rtx label = ix86_expand_aligntest (destreg, 1);
+	  emit_insn (gen_strsetqi (destreg,
+				   gen_rtx_SUBREG (QImode, zeroreg, 0)));
+	  emit_label (label);
+	  LABEL_NUSES (label) = 1;
+	}
+    }
+  return 1;
+}
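The prologue built with ix86_expand_aligntest above has a direct C analogue: peel 1-, 2- and (for PentiumPro or 64-bit) 4-byte stores until the destination is aligned, adjust the count, run the wide rep stos, then finish the sub-word remainder. A rough sketch, assuming the count is large enough for the peeled stores (which the small-count branch above guarantees); the helper is hypothetical:

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* C rendering of the clear expander's generic path: align, clear the
   body with word stores (memset stands in for rep stosl/stosq), then
   store the 2/1-byte epilogue.  Illustrative only.  */
static void
clear_like_clrstr (unsigned char *dst, size_t count)
{
  if ((uintptr_t) dst & 1)              /* aligntest (destreg, 1) */
    *dst++ = 0, count -= 1;
  if ((uintptr_t) dst & 2)              /* aligntest (destreg, 2) */
    dst[0] = dst[1] = 0, dst += 2, count -= 2;
  if ((uintptr_t) dst & 4)              /* aligntest (destreg, 4) */
    memset (dst, 0, 4), dst += 4, count -= 4;

  memset (dst, 0, count & ~(size_t) 3); /* rep stosl on the body */
  dst += count & ~(size_t) 3;
  if (count & 2)                        /* strsethi epilogue */
    dst[0] = dst[1] = 0, dst += 2;
  if (count & 1)                        /* strsetqi epilogue */
    *dst = 0;
}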
+/* Expand strlen.  */
+int
+ix86_expand_strlen (out, src, eoschar, align)
+     rtx out, src, eoschar, align;
+{
+  rtx addr, scratch1, scratch2, scratch3, scratch4;
+
+  /* The generic case of the strlen expander is long.  Avoid expanding it
+     unless TARGET_INLINE_ALL_STRINGOPS.  */
+
+  if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
+      && !TARGET_INLINE_ALL_STRINGOPS
+      && !optimize_size
+      && (GET_CODE (align) != CONST_INT || INTVAL (align) < 4))
+    return 0;
+
+  addr = force_reg (Pmode, XEXP (src, 0));
+  scratch1 = gen_reg_rtx (Pmode);
+
+  if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
+      && !optimize_size)
+    {
+      /* Well it seems that some optimizer does not combine a call like
+	 foo (strlen (bar), strlen (bar));
+	 when the move and the subtraction are done here.  It does calculate
+	 the length just once when these instructions are done inside of
+	 output_strlen_unroll().  But since &bar[strlen (bar)] is often used
+	 and one fewer register is live for the lifetime of
+	 output_strlen_unroll(), this is better.  */
+
+      emit_move_insn (out, addr);
+
+      ix86_expand_strlensi_unroll_1 (out, align);
+
+      /* strlensi_unroll_1 returns the address of the zero at the end of
+	 the string, like memchr(), so compute the length by subtracting
+	 the start address.  */
+      if (TARGET_64BIT)
+	emit_insn (gen_subdi3 (out, out, addr));
+      else
+	emit_insn (gen_subsi3 (out, out, addr));
+    }
+  else
+    {
+      scratch2 = gen_reg_rtx (Pmode);
+      scratch3 = gen_reg_rtx (Pmode);
+      scratch4 = force_reg (Pmode, constm1_rtx);
+
+      emit_move_insn (scratch3, addr);
+      eoschar = force_reg (QImode, eoschar);
+
+      emit_insn (gen_cld ());
+      if (TARGET_64BIT)
+	{
+	  emit_insn (gen_strlenqi_rex_1 (scratch1, scratch3, eoschar,
+					 align, scratch4, scratch3));
+	  emit_insn (gen_one_cmpldi2 (scratch2, scratch1));
+	  emit_insn (gen_adddi3 (out, scratch2, constm1_rtx));
+	}
+      else
+	{
+	  emit_insn (gen_strlenqi_1 (scratch1, scratch3, eoschar,
+				     align, scratch4, scratch3));
+	  emit_insn (gen_one_cmplsi2 (scratch2, scratch1));
+	  emit_insn (gen_addsi3 (out, scratch2, constm1_rtx));
+	}
+    }
+  return 1;
+}
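The arithmetic in the scasb branch deserves a note: the count register is seeded with -1 and repne scasb decrements it once per byte scanned, terminator included, so the final value is -(len + 2), and the one's complement followed by adding -1 recovers len. A self-contained check, with a plain loop standing in for the string instruction; the helper is hypothetical, not the emitted code:

#include <assert.h>

/* Why ~count - 1 is the string length after a repne scasb scan seeded
   with count = -1.  */
static unsigned long
strlen_like_scasb (const char *s)
{
  unsigned long count = -1UL;           /* scratch4 = constm1_rtx */
  const char *p = s;

  do
    count--;                            /* one decrement per byte, NUL included */
  while (*p++ != '\0');

  return ~count - 1;                    /* gen_one_cmpl*2, then add -1 */
}

int
main (void)
{
  assert (strlen_like_scasb ("") == 0);
  assert (strlen_like_scasb ("gcc") == 3);
  return 0;
}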
+
 /* Expand the appropriate insns for doing strlen if not just doing
    repnz; scasb
 
@@ -6851,9 +7844,9 @@ ix86_split_lshrdi (operands, scratch)
    This is just the body.  It needs the initialisations mentioned above and
    some address computing at the end.  These things are done in i386.md.  */
-void
-ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch)
-     rtx out, align_rtx, scratch;
+static void
+ix86_expand_strlensi_unroll_1 (out, align_rtx)
+     rtx out, align_rtx;
 {
   int align;
   rtx tmp;
@@ -6863,6 +7856,7 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch)
   rtx end_0_label = gen_label_rtx ();
   rtx mem;
   rtx tmpreg = gen_reg_rtx (SImode);
+  rtx scratch = gen_reg_rtx (SImode);
 
   align = 0;
   if (GET_CODE (align_rtx) == CONST_INT)
@@ -6873,6 +7867,8 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch)
   /* Is there a known alignment and is it less than 4?  */
   if (align < 4)
     {
+      rtx scratch1 = gen_reg_rtx (Pmode);
+      emit_move_insn (scratch1, out);
       /* Is there a known alignment and is it not 2? */
       if (align != 2)
 	{
@@ -6880,26 +7876,26 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch)
 	  align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
 
 	  /* Leave just the 3 lower bits.  */
-	  align_rtx = expand_binop (SImode, and_optab, scratch, GEN_INT (3),
+	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
 				    NULL_RTX, 0, OPTAB_WIDEN);
 
 	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
-				   SImode, 1, 0, align_4_label);
+				   Pmode, 1, 0, align_4_label);
 	  emit_cmp_and_jump_insns (align_rtx, GEN_INT (2), EQ, NULL,
-				   SImode, 1, 0, align_2_label);
+				   Pmode, 1, 0, align_2_label);
 	  emit_cmp_and_jump_insns (align_rtx, GEN_INT (2), GTU, NULL,
-				   SImode, 1, 0, align_3_label);
+				   Pmode, 1, 0, align_3_label);
 	}
       else
 	{
 	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
	     check if it is aligned to a 4-byte boundary.  */
 
-	  align_rtx = expand_binop (SImode, and_optab, scratch, GEN_INT (2),
+	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (2),
 				    NULL_RTX, 0, OPTAB_WIDEN);
 
 	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
-				   SImode, 1, 0, align_4_label);
+				   Pmode, 1, 0, align_4_label);
 	}
 
       mem = gen_rtx_MEM (QImode, out);
@@ -6911,7 +7907,10 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch)
 			       QImode, 1, 0, end_0_label);
 
       /* Increment the address.  */
-      emit_insn (gen_addsi3 (out, out, const1_rtx));
+      if (TARGET_64BIT)
+	emit_insn (gen_adddi3 (out, out, const1_rtx));
+      else
+	emit_insn (gen_addsi3 (out, out, const1_rtx));
 
       /* Not needed with an alignment of 2 */
       if (align != 2)
@@ -6921,7 +7920,10 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch)
 	  emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
 				   QImode, 1, 0, end_0_label);
 
-	  emit_insn (gen_addsi3 (out, out, const1_rtx));
+	  if (TARGET_64BIT)
+	    emit_insn (gen_adddi3 (out, out, const1_rtx));
+	  else
+	    emit_insn (gen_addsi3 (out, out, const1_rtx));
 
 	  emit_label (align_3_label);
 	}
@@ -6929,7 +7931,10 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch)
       emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
 			       QImode, 1, 0, end_0_label);
 
-      emit_insn (gen_addsi3 (out, out, const1_rtx));
+      if (TARGET_64BIT)
+	emit_insn (gen_adddi3 (out, out, const1_rtx));
+      else
+	emit_insn (gen_addsi3 (out, out, const1_rtx));
     }
 
   /* Generate loop to check 4 bytes at a time.  It is not a good idea to
@@ -6939,7 +7944,10 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch)
       mem = gen_rtx_MEM (SImode, out);
       emit_move_insn (scratch, mem);
-      emit_insn (gen_addsi3 (out, out, GEN_INT (4)));
+      if (TARGET_64BIT)
+	emit_insn (gen_adddi3 (out, out, GEN_INT (4)));
+      else
+	emit_insn (gen_addsi3 (out, out, GEN_INT (4)));
 
       /* This formula yields a nonzero result iff one of the bytes is zero.
	 This saves three branches inside the loop and many cycles.  */
 
@@ -6947,13 +7955,16 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch)
       emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
       emit_insn (gen_one_cmplsi2 (scratch, scratch));
       emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
-      emit_insn (gen_andsi3 (tmpreg, tmpreg, GEN_INT (0x80808080)));
+      emit_insn (gen_andsi3 (tmpreg, tmpreg,
+			     GEN_INT (trunc_int_for_mode
+				      (0x80808080, SImode))));
       emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1, 0,
 			       align_4_label);
 
       if (TARGET_CMOVE)
 	{
 	  rtx reg = gen_reg_rtx (SImode);
+	  rtx reg2 = gen_reg_rtx (Pmode);
 	  emit_move_insn (reg, tmpreg);
 	  emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
 
@@ -6966,15 +7977,15 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch)
 				     reg, tmpreg)));
 
 	  /* Emit lea manually to avoid clobbering of flags.  */
-	  emit_insn (gen_rtx_SET (SImode, reg,
-				  gen_rtx_PLUS (SImode, out, GEN_INT (2))));
+	  emit_insn (gen_rtx_SET (SImode, reg2,
+				  gen_rtx_PLUS (Pmode, out, GEN_INT (2))));
 
 	  tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
 	  tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
 	  emit_insn (gen_rtx_SET (VOIDmode, out,
-				  gen_rtx_IF_THEN_ELSE (SImode, tmp,
-							reg,
-							out)));
+				  gen_rtx_IF_THEN_ELSE (Pmode, tmp,
+							reg2,
+							out)));
 
 	}
       else
@@ -6993,7 +8004,10 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch)
 
 	   /* Not in the first two.  Move two bytes forward.  */
 	   emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
-	   emit_insn (gen_addsi3 (out, out, GEN_INT (2)));
+	   if (TARGET_64BIT)
+	     emit_insn (gen_adddi3 (out, out, GEN_INT (2)));
+	   else
+	     emit_insn (gen_addsi3 (out, out, GEN_INT (2)));
 
 	   emit_label (end_2_label);
 
@@ -7002,7 +8016,10 @@ ix86_expand_strlensi_unroll_1 (out, align_rtx, scratch)
 
   /* Avoid branch in fixing the byte.  */
   tmpreg = gen_lowpart (QImode, tmpreg);
   emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
-  emit_insn (gen_subsi3_carry (out, out, GEN_INT (3)));
+  if (TARGET_64BIT)
+    emit_insn (gen_subdi3_carry_rex64 (out, out, GEN_INT (3)));
+  else
+    emit_insn (gen_subsi3_carry (out, out, GEN_INT (3)));
 
   emit_label (end_0_label);
 }
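The hunk above preserves the classic branch-free zero-byte test: (v - 0x01010101) & ~v & 0x80808080 is nonzero exactly when some byte of v is zero, since the subtraction borrows into a byte's sign bit only when that byte ran out, and the & ~v masks off bytes whose sign bit was already set. A standalone self-test of the predicate, in plain C rather than RTL:

#include <assert.h>
#include <stdint.h>

/* The zero-byte predicate built by the add/one_cmpl/and sequence in
   ix86_expand_strlensi_unroll_1.  */
static int
has_zero_byte (uint32_t v)
{
  return ((v - 0x01010101u) & ~v & 0x80808080u) != 0;
}

int
main (void)
{
  assert (has_zero_byte (0x44434200u));   /* zero in the low byte */
  assert (has_zero_byte (0x00414243u));   /* zero in the high byte */
  assert (!has_zero_byte (0x01010101u));  /* no zero bytes */
  assert (!has_zero_byte (0xffffffffu));
  return 0;
}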
@@ -7078,7 +8095,9 @@ memory_address_length (addr)
   int len;
 
   if (GET_CODE (addr) == PRE_DEC
-      || GET_CODE (addr) == POST_INC)
+      || GET_CODE (addr) == POST_INC
+      || GET_CODE (addr) == PRE_MODIFY
+      || GET_CODE (addr) == POST_MODIFY)
     return 0;
 
   if (! ix86_decompose_address (addr, &parts))
@@ -7309,14 +8328,6 @@ ix86_adjust_cost (insn, link, dep_insn, cost)
   insn_type = get_attr_type (insn);
   dep_insn_type = get_attr_type (dep_insn);
 
-  /* Prologue and epilogue allocators can have a false dependency on ebp.
-     This results in one cycle extra stall on Pentium prologue scheduling,
-     so handle this important case manually. */
-  if (dep_insn_code_number == CODE_FOR_pro_epilogue_adjust_stack
-      && dep_insn_type == TYPE_ALU
-      && !reg_mentioned_p (stack_pointer_rtx, insn))
-    return 0;
-
   switch (ix86_cpu)
     {
     case PROCESSOR_PENTIUM:
@@ -7885,6 +8896,18 @@ ix86_data_alignment (type, align)
 	  || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 256)
     return 256;
 
+  /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
+     to a 16-byte boundary.  */
+  if (TARGET_64BIT)
+    {
+      if (AGGREGATE_TYPE_P (type)
+	  && TYPE_SIZE (type)
+	  && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
+	  && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
+	      || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
+	return 128;
+    }
+
   if (TREE_CODE (type) == ARRAY_TYPE)
     {
       if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
@@ -7932,6 +8955,17 @@ ix86_local_alignment (type, align)
     tree type;
     int align;
 {
+  /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
+     to a 16-byte boundary.  */
+  if (TARGET_64BIT)
+    {
+      if (AGGREGATE_TYPE_P (type)
+	  && TYPE_SIZE (type)
+	  && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
+	  && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
+	      || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
+	return 128;
+    }
   if (TREE_CODE (type) == ARRAY_TYPE)
     {
       if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
@@ -7967,9 +9001,69 @@ ix86_local_alignment (type, align)
     }
   return align;
 }
+
+/* Emit RTL insns to initialize the variable parts of a trampoline.
+   FNADDR is an RTX for the address of the function's pure code.
+   CXT is an RTX for the static chain value for the function.  */
+void
+x86_initialize_trampoline (tramp, fnaddr, cxt)
+     rtx tramp, fnaddr, cxt;
+{
+  if (!TARGET_64BIT)
+    {
+      /* Compute offset from the end of the jmp to the target function.  */
+      rtx disp = expand_binop (SImode, sub_optab, fnaddr,
+			       plus_constant (tramp, 10),
+			       NULL_RTX, 1, OPTAB_DIRECT);
+      emit_move_insn (gen_rtx_MEM (QImode, tramp),
+		      GEN_INT (trunc_int_for_mode (0xb9, QImode)));
+      emit_move_insn (gen_rtx_MEM (SImode, plus_constant (tramp, 1)), cxt);
+      emit_move_insn (gen_rtx_MEM (QImode, plus_constant (tramp, 5)),
+		      GEN_INT (trunc_int_for_mode (0xe9, QImode)));
+      emit_move_insn (gen_rtx_MEM (SImode, plus_constant (tramp, 6)), disp);
+    }
+  else
+    {
+      int offset = 0;
+      /* Try to load the address using the shorter movl instead of movabs.
+	 We may want to support movq for kernel mode, but the kernel does
+	 not use trampolines at the moment.  */
+      if (x86_64_zero_extended_value (fnaddr))
+	{
+	  fnaddr = copy_to_mode_reg (DImode, fnaddr);
+	  emit_move_insn (gen_rtx_MEM (HImode, plus_constant (tramp, offset)),
+			  GEN_INT (trunc_int_for_mode (0xbb41, HImode)));
+	  emit_move_insn (gen_rtx_MEM (SImode, plus_constant (tramp, offset + 2)),
+			  gen_lowpart (SImode, fnaddr));
+	  offset += 6;
+	}
+      else
+	{
+	  emit_move_insn (gen_rtx_MEM (HImode, plus_constant (tramp, offset)),
+			  GEN_INT (trunc_int_for_mode (0xbb49, HImode)));
+	  emit_move_insn (gen_rtx_MEM (DImode, plus_constant (tramp, offset + 2)),
+			  fnaddr);
+	  offset += 10;
+	}
+      /* Load static chain using movabs to r10.  */
+      emit_move_insn (gen_rtx_MEM (HImode, plus_constant (tramp, offset)),
+		      GEN_INT (trunc_int_for_mode (0xba49, HImode)));
+      emit_move_insn (gen_rtx_MEM (DImode, plus_constant (tramp, offset + 2)),
+		      cxt);
+      offset += 10;
+      /* Jump to r11.  */
+      emit_move_insn (gen_rtx_MEM (HImode, plus_constant (tramp, offset)),
+		      GEN_INT (trunc_int_for_mode (0xff49, HImode)));
+      emit_move_insn (gen_rtx_MEM (QImode, plus_constant (tramp, offset + 2)),
+		      GEN_INT (trunc_int_for_mode (0xe3, HImode)));
+      offset += 3;
+      if (offset > TRAMPOLINE_SIZE)
+	abort ();
+    }
+}
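Spelled out as bytes, the 64-bit trampoline above is 49 BB <imm64> (movabs into r11, or the shorter 41 BB <imm32> when the address zero-extends), 49 BA <imm64> (movabs into r10), and 49 FF E3 (jmp through r11); the HImode stores hold each opcode pair little-endian, which is why 0xbb49 in the source lands as 49 BB in memory. A hypothetical byte-level writer for the movabs-only form, assuming a little-endian host:

#include <stdint.h>
#include <string.h>

/* Lay out the same 23 trampoline bytes the movabs branch emits.
   Illustration only; the compiler produces these via RTL moves.  */
static void
write_tramp64 (uint8_t *tramp, uint64_t fnaddr, uint64_t cxt)
{
  tramp[0] = 0x49; tramp[1] = 0xbb;     /* movabs $fnaddr, %r11 */
  memcpy (tramp + 2, &fnaddr, 8);
  tramp[10] = 0x49; tramp[11] = 0xba;   /* movabs $cxt, %r10 */
  memcpy (tramp + 12, &cxt, 8);
  tramp[20] = 0x49;                     /* jmp *%r11 */
  tramp[21] = 0xff;
  tramp[22] = 0xe3;
}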
+
 #define def_builtin(NAME, TYPE, CODE) \
-  builtin_function ((NAME), (TYPE), (CODE), BUILT_IN_MD, NULL_PTR)
+  builtin_function ((NAME), (TYPE), (CODE), BUILT_IN_MD, NULL)
 
 struct builtin_description
 {
   enum insn_code icode;
@@ -9094,17 +10188,16 @@ ix86_expand_builtin (exp, target, subtarget, mode, ignore)
     case IX86_BUILTIN_SETPS1:
       target = assign_386_stack_local (SFmode, 0);
       arg0 = TREE_VALUE (arglist);
-      emit_move_insn (change_address (target, SFmode, XEXP (target, 0)),
+      emit_move_insn (adjust_address (target, SFmode, 0),
 		      expand_expr (arg0, NULL_RTX, VOIDmode, 0));
       op0 = gen_reg_rtx (V4SFmode);
-      emit_insn (gen_sse_loadss (op0, change_address (target, V4SFmode,
-						      XEXP (target, 0))));
+      emit_insn (gen_sse_loadss (op0, adjust_address (target, V4SFmode, 0)));
       emit_insn (gen_sse_shufps (op0, op0, op0, GEN_INT (0)));
       return op0;
 
     case IX86_BUILTIN_SETPS:
       target = assign_386_stack_local (V4SFmode, 0);
-      op0 = change_address (target, SFmode, XEXP (target, 0));
+      op0 = adjust_address (target, SFmode, 0);
       arg0 = TREE_VALUE (arglist);
       arg1 = TREE_VALUE (TREE_CHAIN (arglist));
       arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist)));
@@ -9184,45 +10277,79 @@ ix86_force_to_memory (mode, operand)
   enum machine_mode mode;
   rtx operand;
 {
+  rtx result;
   if (!reload_completed)
     abort ();
-  switch (mode)
+  if (TARGET_64BIT && TARGET_RED_ZONE)
+    {
+      result = gen_rtx_MEM (mode,
+			    gen_rtx_PLUS (Pmode,
+					  stack_pointer_rtx,
+					  GEN_INT (-RED_ZONE_SIZE)));
+      emit_move_insn (result, operand);
+    }
+  else if (TARGET_64BIT && !TARGET_RED_ZONE)
     {
-    case DImode:
+      switch (mode)
 	{
-	  rtx operands[2];
-	  split_di (&operand, 1, operands, operands+1);
+	case HImode:
+	case SImode:
+	  operand = gen_lowpart (DImode, operand);
+	  /* FALLTHRU */
+	case DImode:
 	  emit_insn (
-		     gen_rtx_SET (VOIDmode,
-				  gen_rtx_MEM (SImode,
-					       gen_rtx_PRE_DEC (Pmode,
-								stack_pointer_rtx)),
-				  operands[1]));
+		     gen_rtx_SET (VOIDmode,
+				  gen_rtx_MEM (DImode,
+					       gen_rtx_PRE_DEC (DImode,
+								stack_pointer_rtx)),
+				  operand));
+	  break;
+	default:
+	  abort ();
+	}
+      result = gen_rtx_MEM (mode, stack_pointer_rtx);
+    }
+  else
+    {
+      switch (mode)
+	{
+	case DImode:
+	  {
+	    rtx operands[2];
+	    split_di (&operand, 1, operands, operands + 1);
+	    emit_insn (
+		       gen_rtx_SET (VOIDmode,
+				    gen_rtx_MEM (SImode,
+						 gen_rtx_PRE_DEC (Pmode,
+								  stack_pointer_rtx)),
+				    operands[1]));
+	    emit_insn (
+		       gen_rtx_SET (VOIDmode,
+				    gen_rtx_MEM (SImode,
+						 gen_rtx_PRE_DEC (Pmode,
+								  stack_pointer_rtx)),
+				    operands[0]));
+	  }
+	  break;
+	case HImode:
+	  /* It is better to store HImodes as SImodes.  */
+	  if (!TARGET_PARTIAL_REG_STALL)
+	    operand = gen_lowpart (SImode, operand);
+	  /* FALLTHRU */
+	case SImode:
 	  emit_insn (
-		     gen_rtx_SET (VOIDmode,
-				  gen_rtx_MEM (SImode,
-					       gen_rtx_PRE_DEC (Pmode,
-								stack_pointer_rtx)),
-				  operands[1]));
+		     gen_rtx_SET (VOIDmode,
+				  gen_rtx_MEM (GET_MODE (operand),
+					       gen_rtx_PRE_DEC (SImode,
+								stack_pointer_rtx)),
+				  operand));
+	  break;
+	default:
+	  abort ();
 	}
-      break;
-    case HImode:
-      /* It is better to store HImodes as SImodes. */
-      if (!TARGET_PARTIAL_REG_STALL)
-	operand = gen_lowpart (SImode, operand);
-      /* FALLTHRU */
-    case SImode:
-      emit_insn (
-		 gen_rtx_SET (VOIDmode,
-			      gen_rtx_MEM (GET_MODE (operand),
-					   gen_rtx_PRE_DEC (SImode,
-							    stack_pointer_rtx)),
-			      operand));
-      break;
-    default:
-      abort();
+      result = gen_rtx_MEM (mode, stack_pointer_rtx);
     }
-  return gen_rtx_MEM (mode, stack_pointer_rtx);
+  return result;
 }
 
 /* Free operand from the memory. */
@@ -9230,15 +10357,22 @@ ix86_force_to_memory (mode, operand)
 void
 ix86_free_from_memory (mode)
     enum machine_mode mode;
 {
-  /* Use LEA to deallocate stack space. In peephole2 it will be converted
-     to pop or add instruction if registers are available. */
-  emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
-			  gen_rtx_PLUS (Pmode, stack_pointer_rtx,
-					GEN_INT (mode == DImode
-						 ? 8
-						 : mode == HImode && TARGET_PARTIAL_REG_STALL
-						 ? 2
-						 : 4))));
+  if (!TARGET_64BIT || !TARGET_RED_ZONE)
+    {
+      int size;
+
+      if (mode == DImode || TARGET_64BIT)
+	size = 8;
+      else if (mode == HImode && TARGET_PARTIAL_REG_STALL)
+	size = 2;
+      else
+	size = 4;
+      /* Use LEA to deallocate stack space.  In peephole2 it will be
+	 converted to a pop or add instruction if registers are
+	 available.  */
+      emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+			      gen_rtx_PLUS (Pmode, stack_pointer_rtx,
+					    GEN_INT (size))));
+    }
 }
 
 /* Put float CONST_DOUBLE in the constant pool instead of fp regs.
@@ -9330,13 +10464,13 @@ ix86_register_move_cost (mode, class1, class2)
      stall.  Count this as arbitrarily high cost of 20.  */
   if (ix86_secondary_memory_needed (class1, class2, mode, 0))
     {
+      int add_cost = 0;
       if (CLASS_MAX_NREGS (class1, mode) > CLASS_MAX_NREGS (class2, mode))
-	return 10;
+	add_cost = 20;
       return (MEMORY_MOVE_COST (mode, class1, 0)
-	      + MEMORY_MOVE_COST (mode, class2, 1));
+	      + MEMORY_MOVE_COST (mode, class2, 1) + add_cost);
     }
-  /* Moves between SSE/MMX and integer unit are expensive.
-     ??? We should make this cost CPU specific. */
+  /* Moves between SSE/MMX and integer unit are expensive.  */
   if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
       || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
     return ix86_cost->mmxsse_to_integer;
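The cost rule after this change is additive: a move that has to go through memory pays for one store plus one load, plus a flat 20 when the two classes need different numbers of hard registers for the mode (the old code returned a flat 10 there, underselling the stall the comment describes). A toy rendition, with the helper name and its inputs invented for illustration:

/* Toy version of the secondary-memory branch of ix86_register_move_cost
   after this hunk.  */
static int
move_cost_via_memory (int store_cost, int load_cost,
		      int nregs_class1, int nregs_class2)
{
  int add_cost = 0;

  if (nregs_class1 > nregs_class2)      /* mismatched register counts */
    add_cost = 20;
  return store_cost + load_cost + add_cost;
}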