diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index af6ec6764ea..a88c6152a14 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1,6 +1,6 @@
 /* Subroutines used for code generation on IA-32.
    Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
-   2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
+   2002, 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.

 This file is part of GCC.

@@ -50,6 +50,7 @@ Boston, MA 02110-1301, USA.  */
 #include "tree-gimple.h"
 #include "dwarf2.h"
 #include "tm-constrs.h"
+#include "params.h"

 #ifndef CHECK_STACK_LIMIT
 #define CHECK_STACK_LIMIT (-1)
@@ -67,8 +68,10 @@ Boston, MA 02110-1301, USA.  */
 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.  */
 #define COSTS_N_BYTES(N) ((N) * 2)

+#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
+
 static const
-struct processor_costs size_cost = {	/* costs for tunning for size */
+struct processor_costs size_cost = {	/* costs for tuning for size */
   COSTS_N_BYTES (2),			/* cost of an add instruction */
   COSTS_N_BYTES (3),			/* cost of a lea instruction */
   COSTS_N_BYTES (2),			/* variable shift costs */
@@ -118,6 +121,10 @@ struct processor_costs size_cost = {	/* costs for tunning for size */
   COSTS_N_BYTES (2),			/* cost of FABS instruction.  */
   COSTS_N_BYTES (2),			/* cost of FCHS instruction.  */
   COSTS_N_BYTES (2),			/* cost of FSQRT instruction.  */
+  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
+   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
+  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
+   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}}
 };

 /* Processor costs (relative to an add) */
@@ -172,6 +179,10 @@ struct processor_costs i386_cost = {	/* 386 specific costs */
   COSTS_N_INSNS (22),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (24),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (122),			/* cost of FSQRT instruction.  */
+  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
+   DUMMY_STRINGOP_ALGS},
+  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
+   DUMMY_STRINGOP_ALGS},
 };

 static const
@@ -225,6 +236,10 @@ struct processor_costs i486_cost = {	/* 486 specific costs */
   COSTS_N_INSNS (3),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (3),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (83),			/* cost of FSQRT instruction.  */
+  {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
+   DUMMY_STRINGOP_ALGS},
+  {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
+   DUMMY_STRINGOP_ALGS}
 };

 static const
@@ -278,6 +293,10 @@ struct processor_costs pentium_cost = {
   COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (70),			/* cost of FSQRT instruction.  */
+  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+   DUMMY_STRINGOP_ALGS},
+  {{libcall, {{-1, rep_prefix_4_byte}}},
+   DUMMY_STRINGOP_ALGS}
 };

 static const
@@ -331,6 +350,75 @@ struct processor_costs pentiumpro_cost = {
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
+  /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
+     (we ensure the alignment).
+     For small blocks an inline loop is still a noticeable win; for bigger
+     blocks either rep movsl or rep movsb is the way to go.  Rep movsb
+     apparently has a more expensive startup time in the CPU, but after 4K
+     the difference is down in the noise.
+   */
+  {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
+			{8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
+   DUMMY_STRINGOP_ALGS},
+  {{rep_prefix_4_byte, {{1024, unrolled_loop},
+			{8192, rep_prefix_4_byte}, {-1, libcall}}},
+   DUMMY_STRINGOP_ALGS}
+};
+
+static const
+struct processor_costs geode_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (1),			/* cost of a lea instruction */
+  COSTS_N_INSNS (2),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (4),			/*				 HI */
+   COSTS_N_INSNS (7),			/*				 SI */
+   COSTS_N_INSNS (7),			/*				 DI */
+   COSTS_N_INSNS (7)},			/*				 other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (15),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (23),			/*			    HI */
+   COSTS_N_INSNS (39),			/*			    SI */
+   COSTS_N_INSNS (39),			/*			    DI */
+   COSTS_N_INSNS (39)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  4,					/* MOVE_RATIO */
+  1,					/* cost for loading QImode using movzbl */
+  {1, 1, 1},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {1, 1, 1},				/* cost of storing integer registers */
+  1,					/* cost of reg,reg fld/fst */
+  {1, 1, 1},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {4, 6, 6},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+
+  1,					/* cost of moving MMX register */
+  {1, 1},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {1, 1},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  1,					/* cost of moving SSE register */
+  {1, 1, 1},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {1, 1, 1},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  1,					/* MMX or SSE register to integer */
+  32,					/* size of prefetch block */
+  1,					/* number of parallel prefetches */
+  1,					/* Branch cost */
+  COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (11),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (47),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (54),			/* cost of FSQRT instruction.  */
+  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+   DUMMY_STRINGOP_ALGS},
+  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+   DUMMY_STRINGOP_ALGS}
 };

 static const
@@ -384,6 +472,10 @@ struct processor_costs k6_cost = {
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (56),			/* cost of FSQRT instruction.  */
+  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+   DUMMY_STRINGOP_ALGS},
+  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
+   DUMMY_STRINGOP_ALGS}
 };

 static const
@@ -437,6 +529,13 @@ struct processor_costs athlon_cost = {
   COSTS_N_INSNS (2),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (2),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (35),			/* cost of FSQRT instruction.  */
+  /* For some reason, Athlon deals better with REP prefix (relative to loops)
+     compared to K8. Alignment becomes important after 8 bytes for memcpy and
+     128 bytes for memset.
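+     As elsewhere in these tables, each {max, alg} pair requests ALG for
+     blocks of at most MAX bytes, a max of -1 covers everything larger, and
+     the leading algorithm is the fallback when the block size is not known
+     at compile time.  The first stringop_algs of each pair describes 32-bit
+     mode and the second 64-bit mode, which is why chips that never run
+     64-bit code carry DUMMY_STRINGOP_ALGS in the second slot.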
*/ + {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS} }; static const @@ -482,7 +581,85 @@ struct processor_costs k8_cost = { in SImode, DImode and TImode */ 5, /* MMX or SSE register to integer */ 64, /* size of prefetch block */ - 6, /* number of parallel prefetches */ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches */ + 5, /* Branch cost */ + COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (4), /* cost of FMUL instruction. */ + COSTS_N_INSNS (19), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + /* K8 has optimized REP instruction for medium sized blocks, but for very small + blocks it is better to use loop. For large blocks, libcall can do + nontemporary accesses and beat inline considerably. */ + {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}}, + {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + {{libcall, {{8, loop}, {24, unrolled_loop}, + {2048, rep_prefix_4_byte}, {-1, libcall}}}, + {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}} +}; + +struct processor_costs amdfam10_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (2), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (5)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (35), /* HI */ + COSTS_N_INSNS (51), /* SI */ + COSTS_N_INSNS (83), /* DI */ + COSTS_N_INSNS (83)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 4, 3}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 5}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + /* On K8 + MOVD reg64, xmmreg Double FSTORE 4 + MOVD reg32, xmmreg Double FSTORE 4 + On AMDFAM10 + MOVD reg64, xmmreg Double FADD 3 + 1/1 1/1 + MOVD reg32, xmmreg Double FADD 3 + 1/1 1/1 */ + 64, /* size of prefetch block */ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches */ 5, /* Branch cost */ COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ COSTS_N_INSNS (4), /* cost of FMUL instruction. */ @@ -490,6 +667,15 @@ struct processor_costs k8_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + + /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for + very small blocks it is better to use loop. For large blocks, libcall can + do nontemporary accesses and beat inline considerably. */ + {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}}, + {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + {{libcall, {{8, loop}, {24, unrolled_loop}, + {2048, rep_prefix_4_byte}, {-1, libcall}}}, + {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}} }; static const @@ -543,6 +729,11 @@ struct processor_costs pentium4_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (43), /* cost of FSQRT instruction. */ + {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}}, + DUMMY_STRINGOP_ALGS}, + {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte}, + {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, }; static const @@ -596,6 +787,72 @@ struct processor_costs nocona_cost = { COSTS_N_INSNS (3), /* cost of FABS instruction. */ COSTS_N_INSNS (3), /* cost of FCHS instruction. */ COSTS_N_INSNS (44), /* cost of FSQRT instruction. 
*/
+  {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
+   {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
+	      {100000, unrolled_loop}, {-1, libcall}}}},
+  {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
+   {-1, libcall}}},
+   {libcall, {{24, loop}, {64, unrolled_loop},
+	      {8192, rep_prefix_8_byte}, {-1, libcall}}}}
+};
+
+static const
+struct processor_costs core2_cost = {
+  COSTS_N_INSNS (1),			/* cost of an add instruction */
+  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
+  COSTS_N_INSNS (1),			/* variable shift costs */
+  COSTS_N_INSNS (1),			/* constant shift costs */
+  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
+   COSTS_N_INSNS (3),			/*				 HI */
+   COSTS_N_INSNS (3),			/*				 SI */
+   COSTS_N_INSNS (3),			/*				 DI */
+   COSTS_N_INSNS (3)},			/*				 other */
+  0,					/* cost of multiply per each bit set */
+  {COSTS_N_INSNS (22),			/* cost of a divide/mod for QI */
+   COSTS_N_INSNS (22),			/*			    HI */
+   COSTS_N_INSNS (22),			/*			    SI */
+   COSTS_N_INSNS (22),			/*			    DI */
+   COSTS_N_INSNS (22)},			/*			    other */
+  COSTS_N_INSNS (1),			/* cost of movsx */
+  COSTS_N_INSNS (1),			/* cost of movzx */
+  8,					/* "large" insn */
+  16,					/* MOVE_RATIO */
+  2,					/* cost for loading QImode using movzbl */
+  {6, 6, 6},				/* cost of loading integer registers
+					   in QImode, HImode and SImode.
+					   Relative to reg-reg move (2).  */
+  {4, 4, 4},				/* cost of storing integer registers */
+  2,					/* cost of reg,reg fld/fst */
+  {6, 6, 6},				/* cost of loading fp registers
+					   in SFmode, DFmode and XFmode */
+  {4, 4, 4},				/* cost of storing fp registers
+					   in SFmode, DFmode and XFmode */
+  2,					/* cost of moving MMX register */
+  {6, 6},				/* cost of loading MMX registers
+					   in SImode and DImode */
+  {4, 4},				/* cost of storing MMX registers
+					   in SImode and DImode */
+  2,					/* cost of moving SSE register */
+  {6, 6, 6},				/* cost of loading SSE registers
+					   in SImode, DImode and TImode */
+  {4, 4, 4},				/* cost of storing SSE registers
+					   in SImode, DImode and TImode */
+  2,					/* MMX or SSE register to integer */
+  128,					/* size of prefetch block */
+  8,					/* number of parallel prefetches */
+  3,					/* Branch cost */
+  COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns.  */
+  COSTS_N_INSNS (5),			/* cost of FMUL instruction.  */
+  COSTS_N_INSNS (32),			/* cost of FDIV instruction.  */
+  COSTS_N_INSNS (1),			/* cost of FABS instruction.  */
+  COSTS_N_INSNS (1),			/* cost of FCHS instruction.  */
+  COSTS_N_INSNS (58),			/* cost of FSQRT instruction.  */
+  {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
+   {libcall, {{32, loop}, {64, rep_prefix_4_byte},
+	      {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  {{libcall, {{8, loop}, {15, unrolled_loop},
+	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
+   {libcall, {{24, loop}, {32, unrolled_loop},
+	      {8192, rep_prefix_8_byte}, {-1, libcall}}}}
 };

 /* Generic64 should produce code tuned for Nocona and K8.  */
@@ -604,7 +861,7 @@ struct processor_costs generic64_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   /* On all chips taken into consideration lea is 2 cycles and more.  With
      this cost however our current implementation of synth_mult results in
-     use of unnecesary temporary registers causing regression on several
+     use of unnecessary temporary registers causing regression on several
      SPECfp benchmarks.  */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
   COSTS_N_INSNS (1),			/* variable shift costs */
@@ -656,6 +913,10 @@ struct processor_costs generic64_cost = {
   COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
   COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
   COSTS_N_INSNS (40),			/* cost of FSQRT instruction.
*/ + {DUMMY_STRINGOP_ALGS, + {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + {DUMMY_STRINGOP_ALGS, + {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}} }; /* Generic32 should produce code tuned for Athlon, PPro, Pentium4, Nocona and K8. */ @@ -710,6 +971,10 @@ struct processor_costs generic32_cost = { COSTS_N_INSNS (8), /* cost of FABS instruction. */ COSTS_N_INSNS (8), /* cost of FCHS instruction. */ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ + {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, }; const struct processor_costs *ix86_cost = &pentium_cost; @@ -719,39 +984,51 @@ const struct processor_costs *ix86_cost = &pentium_cost; #define m_486 (1< 12) - error ("-mpreferred-stack-boundary=%d is not between %d and 12", i, - TARGET_64BIT ? 4 : 2); - else - ix86_preferred_stack_boundary = (1 << i) * BITS_PER_UNIT; - } - /* Validate -mbranch-cost= value, or provide default. */ ix86_branch_cost = ix86_cost->branch_cost; if (ix86_branch_cost_string) @@ -1853,7 +2271,7 @@ override_options (void) /* If we're doing fast math, we don't care about comparison order wrt NaNs. This lets us use a shorter comparison sequence. */ - if (flag_unsafe_math_optimizations) + if (flag_finite_math_only) target_flags &= ~MASK_IEEE_FP; /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387, @@ -1866,6 +2284,14 @@ override_options (void) if (!TARGET_80387) target_flags |= MASK_NO_FANCY_MATH_387; + /* Turn on SSE3 builtins for -mssse3. */ + if (TARGET_SSSE3) + target_flags |= MASK_SSE3; + + /* Turn on SSE3 builtins for -msse4a. */ + if (TARGET_SSE4A) + target_flags |= MASK_SSE3; + /* Turn on SSE2 builtins for -msse3. */ if (TARGET_SSE3) target_flags |= MASK_SSE2; @@ -1885,6 +2311,10 @@ override_options (void) if (TARGET_3DNOW) target_flags |= MASK_MMX; + /* Turn on POPCNT builtins for -mabm. */ + if (TARGET_ABM) + target_flags |= MASK_POPCNT; + if (TARGET_64BIT) { if (TARGET_ALIGN_DOUBLE) @@ -1907,21 +2337,26 @@ override_options (void) target_flags |= MASK_NO_RED_ZONE; } + /* Validate -mpreferred-stack-boundary= value, or provide default. + The default of 128 bits is for Pentium III's SSE __m128. We can't + change it because of optimize_size. Otherwise, we can't mix object + files compiled with -Os and -On. */ + ix86_preferred_stack_boundary = 128; + if (ix86_preferred_stack_boundary_string) + { + i = atoi (ix86_preferred_stack_boundary_string); + if (i < (TARGET_64BIT ? 4 : 2) || i > 12) + error ("-mpreferred-stack-boundary=%d is not between %d and 12", i, + TARGET_64BIT ? 4 : 2); + else + ix86_preferred_stack_boundary = (1 << i) * BITS_PER_UNIT; + } + /* Accept -msseregparm only if at least SSE support is enabled. */ if (TARGET_SSEREGPARM && ! TARGET_SSE) error ("-msseregparm used without SSE enabled"); - /* Accept -msselibm only if at least SSE support is enabled. */ - if (TARGET_SSELIBM - && ! TARGET_SSE2) - error ("-msselibm used without SSE2 enabled"); - - /* Ignore -msselibm on 64bit targets. */ - if (TARGET_SSELIBM - && TARGET_64BIT) - error ("-msselibm used on a 64bit target"); - ix86_fpmath = TARGET_FPMATH_DEFAULT; if (ix86_fpmath_string != 0) @@ -1995,6 +2430,12 @@ override_options (void) so it won't slow down the compilation and make x87 code slower. 
*/ if (!TARGET_SCHEDULE) flag_schedule_insns_after_reload = flag_schedule_insns = 0; + + if (!PARAM_SET_P (PARAM_SIMULTANEOUS_PREFETCHES)) + set_param_value ("simultaneous-prefetches", + ix86_cost->simultaneous_prefetches); + if (!PARAM_SET_P (PARAM_L1_CACHE_LINE_SIZE)) + set_param_value ("l1-cache-line-size", ix86_cost->prefetch_block); } /* switch to the appropriate section for output of DECL. @@ -2149,7 +2590,7 @@ x86_elf_aligned_common (FILE *file, fprintf (file, ","HOST_WIDE_INT_PRINT_UNSIGNED",%u\n", size, align / BITS_PER_UNIT); } - +#endif /* Utility function for targets to use in implementing ASM_OUTPUT_ALIGNED_BSS. */ @@ -2173,7 +2614,6 @@ x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED, #endif /* ASM_DECLARE_OBJECT_NAME */ ASM_OUTPUT_SKIP (file, size ? size : 1); } -#endif void optimization_options (int level, int size ATTRIBUTE_UNUSED) @@ -2221,6 +2661,9 @@ const struct attribute_spec ix86_attribute_table[] = /* Sseregparm attribute says we are using x86_64 calling conventions for FP arguments. */ { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute }, + /* force_align_arg_pointer says this function realigns the stack at entry. */ + { (const char *)&ix86_force_align_arg_pointer_string, 0, 0, + false, true, true, ix86_handle_cconv_attribute }, #if TARGET_DLLIMPORT_DECL_ATTRIBUTES { "dllimport", 0, 0, false, false, false, handle_dll_attribute }, { "dllexport", 0, 0, false, false, false, handle_dll_attribute }, @@ -2254,7 +2697,7 @@ ix86_function_ok_for_sibcall (tree decl, tree exp) func = decl; else { - func = TREE_TYPE (TREE_OPERAND (exp, 0)); + func = TREE_TYPE (CALL_EXPR_FN (exp)); if (POINTER_TYPE_P (func)) func = TREE_TYPE (func); } @@ -2289,7 +2732,7 @@ ix86_function_ok_for_sibcall (tree decl, tree exp) tree type; /* We're looking at the CALL_EXPR, we need the type of the function. */ - type = TREE_OPERAND (exp, 0); /* pointer expression */ + type = CALL_EXPR_FN (exp); /* pointer expression */ type = TREE_TYPE (type); /* pointer type */ type = TREE_TYPE (type); /* function type */ @@ -2363,6 +2806,15 @@ ix86_handle_cconv_attribute (tree *node, tree name, *no_add_attrs = true; } + if (!TARGET_64BIT + && lookup_attribute (ix86_force_align_arg_pointer_string, + TYPE_ATTRIBUTES (*node)) + && compare_tree_int (cst, REGPARM_MAX-1)) + { + error ("%s functions limited to %d register parameters", + ix86_force_align_arg_pointer_string, REGPARM_MAX-1); + } + return NULL_TREE; } @@ -2502,6 +2954,18 @@ ix86_function_regparm (tree type, tree decl) && decl_function_context (decl) && !DECL_NO_STATIC_CHAIN (decl)) local_regparm = 2; + /* If the function realigns its stackpointer, the + prologue will clobber %ecx. If we've already + generated code for the callee, the callee + DECL_STRUCT_FUNCTION is gone, so we fall back to + scanning the attributes for the self-realigning + property. */ + if ((DECL_STRUCT_FUNCTION (decl) + && DECL_STRUCT_FUNCTION (decl)->machine->force_align_arg_pointer) + || (!DECL_STRUCT_FUNCTION (decl) + && lookup_attribute (ix86_force_align_arg_pointer_string, + TYPE_ATTRIBUTES (TREE_TYPE (decl))))) + local_regparm = 2; /* Each global register variable increases register preassure, so the more global reg vars there are, the smaller regparm optimization use, unless requested by the user explicitly. */ @@ -2519,10 +2983,10 @@ ix86_function_regparm (tree type, tree decl) return regparm; } -/* Return 1 or 2, if we can pass up to 8 SFmode (1) and DFmode (2) arguments - in SSE registers for a function with the indicated TYPE and DECL. 
- DECL may be NULL when calling function indirectly - or considering a libcall. Otherwise return 0. */ +/* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and + DFmode (2) arguments in SSE registers for a function with the + indicated TYPE and DECL. DECL may be NULL when calling function + indirectly or considering a libcall. Otherwise return 0. */ static int ix86_function_sseregparm (tree type, tree decl) @@ -2547,9 +3011,9 @@ ix86_function_sseregparm (tree type, tree decl) return 2; } - /* For local functions, pass SFmode (and DFmode for SSE2) arguments - in SSE registers even for 32-bit mode and not just 3, but up to - 8 SSE arguments in registers. */ + /* For local functions, pass up to SSE_REGPARM_MAX SFmode + (and DFmode for SSE2) arguments in SSE registers, + even for 32-bit targets. */ if (!TARGET_64BIT && decl && TARGET_SSE_MATH && flag_unit_at_a_time && !profile_flag) { @@ -2637,15 +3101,29 @@ ix86_function_arg_regno_p (int regno) { int i; if (!TARGET_64BIT) - return (regno < REGPARM_MAX - || (TARGET_MMX && MMX_REGNO_P (regno) - && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX)) - || (TARGET_SSE && SSE_REGNO_P (regno) - && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))); - - if (TARGET_SSE && SSE_REGNO_P (regno) - && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)) - return true; + { + if (TARGET_MACHO) + return (regno < REGPARM_MAX + || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno])); + else + return (regno < REGPARM_MAX + || (TARGET_MMX && MMX_REGNO_P (regno) + && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX)) + || (TARGET_SSE && SSE_REGNO_P (regno) + && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))); + } + + if (TARGET_MACHO) + { + if (SSE_REGNO_P (regno) && TARGET_SSE) + return true; + } + else + { + if (TARGET_SSE && SSE_REGNO_P (regno) + && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)) + return true; + } /* RAX is used as hidden argument to va_arg functions. */ if (!regno) return true; @@ -2769,7 +3247,7 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ When we have only some of our vector isa extensions enabled, then there are some modes for which vector_mode_supported_p is false. For these modes, the generic vector support in gcc will choose some non-vector mode - in order to implement the type. By computing the natural mode, we'll + in order to implement the type. By computing the natural mode, we'll select the proper ABI location for the operand and not depend on whatever the middle-end decides to do with these vector types. */ @@ -2928,32 +3406,6 @@ classify_argument (enum machine_mode mode, tree type, switch (TREE_CODE (type)) { case RECORD_TYPE: - /* For classes first merge in the field of the subclasses. */ - if (TYPE_BINFO (type)) - { - tree binfo, base_binfo; - int basenum; - - for (binfo = TYPE_BINFO (type), basenum = 0; - BINFO_BASE_ITERATE (binfo, basenum, base_binfo); basenum++) - { - int num; - int offset = tree_low_cst (BINFO_OFFSET (base_binfo), 0) * 8; - tree type = BINFO_TYPE (base_binfo); - - num = classify_argument (TYPE_MODE (type), - type, subclasses, - (offset + bit_offset) % 256); - if (!num) - return 0; - for (i = 0; i < num; i++) - { - int pos = (offset + (bit_offset % 64)) / 8 / 8; - classes[i + pos] = - merge_classes (subclasses[i], classes[i + pos]); - } - } - } /* And now merge the fields of structure. 
*/ for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field)) { @@ -2961,6 +3413,9 @@ classify_argument (enum machine_mode mode, tree type, { int num; + if (TREE_TYPE (field) == error_mark_node) + continue; + /* Bitfields are always classified as integer. Handle them early, since later code would consider them to be misaligned integers. */ @@ -3008,25 +3463,25 @@ classify_argument (enum machine_mode mode, tree type, subclasses[0] = X86_64_SSE_CLASS; if (subclasses[0] == X86_64_INTEGERSI_CLASS && bytes != 4) subclasses[0] = X86_64_INTEGER_CLASS; - + for (i = 0; i < words; i++) classes[i] = subclasses[i % num]; - + break; } case UNION_TYPE: case QUAL_UNION_TYPE: /* Unions are similar to RECORD_TYPE but offset is always 0. */ - - /* Unions are not derived. */ - gcc_assert (!TYPE_BINFO (type) - || !BINFO_N_BASE_BINFOS (TYPE_BINFO (type))); for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field)) { if (TREE_CODE (field) == FIELD_DECL) { int num; + + if (TREE_TYPE (field) == error_mark_node) + continue; + num = classify_argument (TYPE_MODE (TREE_TYPE (field)), TREE_TYPE (field), subclasses, bit_offset); @@ -3165,12 +3620,12 @@ classify_argument (enum machine_mode mode, tree type, return 0; default: gcc_assert (VECTOR_MODE_P (mode)); - + if (bytes > 16) return 0; - + gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT); - + if (bit_offset + GET_MODE_BITSIZE (mode) <= 32) classes[0] = X86_64_INTEGERSI_CLASS; else @@ -3229,6 +3684,11 @@ construct_container (enum machine_mode mode, enum machine_mode orig_mode, tree type, int in_return, int nintregs, int nsseregs, const int *intreg, int sse_regno) { + /* The following variables hold the static issued_error state. */ + static bool issued_sse_arg_error; + static bool issued_sse_ret_error; + static bool issued_x87_ret_error; + enum machine_mode tmpmode; int bytes = (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode); @@ -3267,18 +3727,38 @@ construct_container (enum machine_mode mode, enum machine_mode orig_mode, some less clueful developer tries to use floating-point anyway. */ if (needed_sseregs && !TARGET_SSE) { - static bool issued_error; - if (!issued_error) + if (in_return) { - issued_error = true; - if (in_return) - error ("SSE register return with SSE disabled"); - else - error ("SSE register argument with SSE disabled"); + if (!issued_sse_ret_error) + { + error ("SSE register return with SSE disabled"); + issued_sse_ret_error = true; + } + } + else if (!issued_sse_arg_error) + { + error ("SSE register argument with SSE disabled"); + issued_sse_arg_error = true; } return NULL; } + /* Likewise, error if the ABI requires us to return values in the + x87 registers and the user specified -mno-80387. */ + if (!TARGET_80387 && in_return) + for (i = 0; i < n; i++) + if (class[i] == X86_64_X87_CLASS + || class[i] == X86_64_X87UP_CLASS + || class[i] == X86_64_COMPLEX_X87_CLASS) + { + if (!issued_x87_ret_error) + { + error ("x87 register return with x87 disabled"); + issued_x87_ret_error = true; + } + return NULL; + } + /* First construct simple cases. Avoid SCmode, since we want to use single register to pass this type. 
*/ if (n == 1 && mode != SCmode) @@ -3679,19 +4159,8 @@ contains_128bit_aligned_vector_p (tree type) case QUAL_UNION_TYPE: { tree field; - - if (TYPE_BINFO (type)) - { - tree binfo, base_binfo; - int i; - - for (binfo = TYPE_BINFO (type), i = 0; - BINFO_BASE_ITERATE (binfo, i, base_binfo); i++) - if (contains_128bit_aligned_vector_p - (BINFO_TYPE (base_binfo))) - return true; - } - /* And now merge the fields of structure. */ + + /* Walk all the structure fields. */ for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field)) { if (TREE_CODE (field) == FIELD_DECL @@ -3706,7 +4175,7 @@ contains_128bit_aligned_vector_p (tree type) if (contains_128bit_aligned_vector_p (TREE_TYPE (type))) return true; break; - + default: gcc_unreachable (); } @@ -3758,16 +4227,31 @@ ix86_function_arg_boundary (enum machine_mode mode, tree type) bool ix86_function_value_regno_p (int regno) { - if (regno == 0 - || (regno == FIRST_FLOAT_REG && TARGET_FLOAT_RETURNS_IN_80387) - || (regno == FIRST_SSE_REG && TARGET_SSE)) - return true; + if (TARGET_MACHO) + { + if (!TARGET_64BIT) + { + return ((regno) == 0 + || ((regno) == FIRST_FLOAT_REG && TARGET_FLOAT_RETURNS_IN_80387) + || ((regno) == FIRST_SSE_REG && TARGET_SSE)); + } + return ((regno) == 0 || (regno) == FIRST_FLOAT_REG + || ((regno) == FIRST_SSE_REG && TARGET_SSE) + || ((regno) == FIRST_FLOAT_REG && TARGET_FLOAT_RETURNS_IN_80387)); + } + else + { + if (regno == 0 + || (regno == FIRST_FLOAT_REG && TARGET_FLOAT_RETURNS_IN_80387) + || (regno == FIRST_SSE_REG && TARGET_SSE)) + return true; - if (!TARGET_64BIT - && (regno == FIRST_MMX_REG && TARGET_MMX)) - return true; + if (!TARGET_64BIT + && (regno == FIRST_MMX_REG && TARGET_MMX)) + return true; - return false; + return false; + } } /* Define how to find the value returned by a function. @@ -3933,14 +4417,16 @@ ix86_value_regno (enum machine_mode mode, tree func, tree fntype) gcc_assert (!TARGET_64BIT); /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where - we prevent this case when mmx is not available. */ - if ((VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)) - return FIRST_MMX_REG; + we normally prevent this case when mmx is not available. However + some ABIs may require the result to be returned like DImode. */ + if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8) + return TARGET_MMX ? FIRST_MMX_REG : 0; /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where - we prevent this case when sse is not available. */ + we prevent this case when sse is not available. However some ABIs + may require the result to be returned like integer TImode. */ if (mode == TImode || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16)) - return FIRST_SSE_REG; + return TARGET_SSE ? FIRST_SSE_REG : 0; /* Decimal floating point values can go in %eax, unlike other float modes. */ if (DECIMAL_FLOAT_MODE_P (mode)) @@ -4122,6 +4608,7 @@ ix86_va_start (tree valist, rtx nextarg) HOST_WIDE_INT words, n_gpr, n_fpr; tree f_gpr, f_fpr, f_ovf, f_sav; tree gpr, fpr, ovf, sav, t; + tree type; /* Only 64bit target needs something special. 
*/ if (!TARGET_64BIT) @@ -4152,26 +4639,29 @@ ix86_va_start (tree valist, rtx nextarg) if (cfun->va_list_gpr_size) { - t = build2 (MODIFY_EXPR, TREE_TYPE (gpr), gpr, - build_int_cst (NULL_TREE, n_gpr * 8)); + type = TREE_TYPE (gpr); + t = build2 (GIMPLE_MODIFY_STMT, type, gpr, + build_int_cst (type, n_gpr * 8)); TREE_SIDE_EFFECTS (t) = 1; expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); } if (cfun->va_list_fpr_size) { - t = build2 (MODIFY_EXPR, TREE_TYPE (fpr), fpr, - build_int_cst (NULL_TREE, n_fpr * 16 + 8*REGPARM_MAX)); + type = TREE_TYPE (fpr); + t = build2 (GIMPLE_MODIFY_STMT, type, fpr, + build_int_cst (type, n_fpr * 16 + 8*REGPARM_MAX)); TREE_SIDE_EFFECTS (t) = 1; expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); } /* Find the overflow area. */ - t = make_tree (TREE_TYPE (ovf), virtual_incoming_args_rtx); + type = TREE_TYPE (ovf); + t = make_tree (type, virtual_incoming_args_rtx); if (words != 0) - t = build2 (PLUS_EXPR, TREE_TYPE (ovf), t, - build_int_cst (NULL_TREE, words * UNITS_PER_WORD)); - t = build2 (MODIFY_EXPR, TREE_TYPE (ovf), ovf, t); + t = build2 (PLUS_EXPR, type, t, + build_int_cst (type, words * UNITS_PER_WORD)); + t = build2 (GIMPLE_MODIFY_STMT, type, ovf, t); TREE_SIDE_EFFECTS (t) = 1; expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); @@ -4179,8 +4669,9 @@ ix86_va_start (tree valist, rtx nextarg) { /* Find the register save area. Prologue of the function save it right above stack frame. */ - t = make_tree (TREE_TYPE (sav), frame_pointer_rtx); - t = build2 (MODIFY_EXPR, TREE_TYPE (sav), sav, t); + type = TREE_TYPE (sav); + t = make_tree (type, frame_pointer_rtx); + t = build2 (GIMPLE_MODIFY_STMT, type, sav, t); TREE_SIDE_EFFECTS (t) = 1; expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); } @@ -4317,7 +4808,7 @@ ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p) /* int_addr = gpr + sav; */ t = fold_convert (ptr_type_node, gpr); t = build2 (PLUS_EXPR, ptr_type_node, sav, t); - t = build2 (MODIFY_EXPR, void_type_node, int_addr, t); + t = build2 (GIMPLE_MODIFY_STMT, void_type_node, int_addr, t); gimplify_and_add (t, pre_p); } if (needed_sseregs) @@ -4325,7 +4816,7 @@ ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p) /* sse_addr = fpr + sav; */ t = fold_convert (ptr_type_node, fpr); t = build2 (PLUS_EXPR, ptr_type_node, sav, t); - t = build2 (MODIFY_EXPR, void_type_node, sse_addr, t); + t = build2 (GIMPLE_MODIFY_STMT, void_type_node, sse_addr, t); gimplify_and_add (t, pre_p); } if (need_temp) @@ -4335,7 +4826,7 @@ ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p) /* addr = &temp; */ t = build1 (ADDR_EXPR, build_pointer_type (type), temp); - t = build2 (MODIFY_EXPR, void_type_node, addr, t); + t = build2 (GIMPLE_MODIFY_STMT, void_type_node, addr, t); gimplify_and_add (t, pre_p); for (i = 0; i < XVECLEN (container, 0); i++) @@ -4369,7 +4860,7 @@ ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p) size_int (INTVAL (XEXP (slot, 1))))); dest = build_va_arg_indirect_ref (dest_addr); - t = build2 (MODIFY_EXPR, void_type_node, dest, src); + t = build2 (GIMPLE_MODIFY_STMT, void_type_node, dest, src); gimplify_and_add (t, pre_p); } } @@ -4378,14 +4869,14 @@ ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p) { t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr, build_int_cst (TREE_TYPE (gpr), needed_intregs * 8)); - t = build2 (MODIFY_EXPR, TREE_TYPE (gpr), gpr, t); + t = build2 (GIMPLE_MODIFY_STMT, TREE_TYPE (gpr), gpr, t); gimplify_and_add (t, pre_p); } if 
(needed_sseregs) { t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr, build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16)); - t = build2 (MODIFY_EXPR, TREE_TYPE (fpr), fpr, t); + t = build2 (GIMPLE_MODIFY_STMT, TREE_TYPE (fpr), fpr, t); gimplify_and_add (t, pre_p); } @@ -4412,12 +4903,12 @@ ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p) } gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue); - t2 = build2 (MODIFY_EXPR, void_type_node, addr, t); + t2 = build2 (GIMPLE_MODIFY_STMT, void_type_node, addr, t); gimplify_and_add (t2, pre_p); t = build2 (PLUS_EXPR, TREE_TYPE (t), t, build_int_cst (TREE_TYPE (t), rsize * UNITS_PER_WORD)); - t = build2 (MODIFY_EXPR, TREE_TYPE (ovf), ovf, t); + t = build2 (GIMPLE_MODIFY_STMT, TREE_TYPE (ovf), ovf, t); gimplify_and_add (t, pre_p); if (container) @@ -4449,7 +4940,7 @@ ix86_check_movabs (rtx insn, int opnum) mem = XEXP (set, opnum); while (GET_CODE (mem) == SUBREG) mem = SUBREG_REG (mem); - gcc_assert (GET_CODE (mem) == MEM); + gcc_assert (MEM_P (mem)); return (volatile_ok || !MEM_VOLATILE_P (mem)); } @@ -4485,6 +4976,8 @@ init_ext_80387_constants (void) int standard_80387_constant_p (rtx x) { + REAL_VALUE_TYPE r; + if (GET_CODE (x) != CONST_DOUBLE || !FLOAT_MODE_P (GET_MODE (x))) return -1; @@ -4493,23 +4986,30 @@ standard_80387_constant_p (rtx x) if (x == CONST1_RTX (GET_MODE (x))) return 2; + REAL_VALUE_FROM_CONST_DOUBLE (r, x); + /* For XFmode constants, try to find a special 80387 instruction when optimizing for size or on those CPUs that benefit from them. */ if (GET_MODE (x) == XFmode && (optimize_size || x86_ext_80387_constants & TUNEMASK)) { - REAL_VALUE_TYPE r; int i; if (! ext_80387_constants_init) init_ext_80387_constants (); - REAL_VALUE_FROM_CONST_DOUBLE (r, x); for (i = 0; i < 5; i++) if (real_identical (&r, &ext_80387_constants_table[i])) return i + 3; } + /* Load of the constant -0.0 or -1.0 will be split as + fldz;fchs or fld1;fchs sequence. */ + if (real_isnegzero (&r)) + return 8; + if (real_identical (&r, &dconstm1)) + return 9; + return 0; } @@ -4535,6 +5035,9 @@ standard_80387_constant_opcode (rtx x) return "fldl2t"; case 7: return "fldpi"; + case 8: + case 9: + return "#"; default: gcc_unreachable (); } @@ -4570,24 +5073,70 @@ standard_80387_constant_rtx (int idx) XFmode); } +/* Return 1 if mode is a valid mode for sse. */ +static int +standard_sse_mode_p (enum machine_mode mode) +{ + switch (mode) + { + case V16QImode: + case V8HImode: + case V4SImode: + case V2DImode: + case V4SFmode: + case V2DFmode: + return 1; + + default: + return 0; + } +} + /* Return 1 if X is FP constant we can load to SSE register w/o using memory. */ int standard_sse_constant_p (rtx x) { - if (x == const0_rtx) + enum machine_mode mode = GET_MODE (x); + + if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x))) return 1; - return (x == CONST0_RTX (GET_MODE (x))); + if (vector_all_ones_operand (x, mode) + && standard_sse_mode_p (mode)) + return TARGET_SSE2 ? 2 : -1; + + return 0; } -/* Returns 1 if OP contains a symbol reference */ +/* Return the opcode of the special instruction to be used to load + the constant X. 
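+   That means "xorps"/"xorpd"/"pxor" to zero a register for the all-zero
+   vector (case 1 below), and "pcmpeqd" of a register with itself for the
+   SSE2 all-ones vector (case 2).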
*/ -int -symbolic_reference_mentioned_p (rtx op) +const char * +standard_sse_constant_opcode (rtx insn, rtx x) { - const char *fmt; - int i; - + switch (standard_sse_constant_p (x)) + { + case 1: + if (get_attr_mode (insn) == MODE_V4SF) + return "xorps\t%0, %0"; + else if (get_attr_mode (insn) == MODE_V2DF) + return "xorpd\t%0, %0"; + else + return "pxor\t%0, %0"; + case 2: + return "pcmpeqd\t%0, %0"; + } + gcc_unreachable (); +} + +/* Returns 1 if OP contains a symbol reference */ + +int +symbolic_reference_mentioned_p (rtx op) +{ + const char *fmt; + int i; + if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF) return 1; @@ -4687,6 +5236,8 @@ static int pic_labels_used; static void get_pc_thunk_name (char name[32], unsigned int regno) { + gcc_assert (!TARGET_64BIT); + if (USE_HIDDEN_LINKONCE) sprintf (name, "__i686.get_pc_thunk.%s", reg_names[regno]); else @@ -4926,7 +5477,7 @@ ix86_initial_elimination_offset (int from, int to) if (from == ARG_POINTER_REGNUM) return frame.stack_pointer_offset; - + gcc_assert (from == FRAME_POINTER_REGNUM); return frame.stack_pointer_offset - frame.frame_pointer_offset; } @@ -5080,18 +5631,22 @@ ix86_compute_frame_layout (struct ix86_frame *frame) frame->to_allocate -= frame->red_zone_size; frame->stack_pointer_offset -= frame->red_zone_size; #if 0 - fprintf (stderr, "nregs: %i\n", frame->nregs); - fprintf (stderr, "size: %i\n", size); - fprintf (stderr, "alignment1: %i\n", stack_alignment_needed); - fprintf (stderr, "padding1: %i\n", frame->padding1); - fprintf (stderr, "va_arg: %i\n", frame->va_arg_size); - fprintf (stderr, "padding2: %i\n", frame->padding2); - fprintf (stderr, "to_allocate: %i\n", frame->to_allocate); - fprintf (stderr, "red_zone_size: %i\n", frame->red_zone_size); - fprintf (stderr, "frame_pointer_offset: %i\n", frame->frame_pointer_offset); - fprintf (stderr, "hard_frame_pointer_offset: %i\n", - frame->hard_frame_pointer_offset); - fprintf (stderr, "stack_pointer_offset: %i\n", frame->stack_pointer_offset); + fprintf (stderr, "\n"); + fprintf (stderr, "nregs: %ld\n", (long)frame->nregs); + fprintf (stderr, "size: %ld\n", (long)size); + fprintf (stderr, "alignment1: %ld\n", (long)stack_alignment_needed); + fprintf (stderr, "padding1: %ld\n", (long)frame->padding1); + fprintf (stderr, "va_arg: %ld\n", (long)frame->va_arg_size); + fprintf (stderr, "padding2: %ld\n", (long)frame->padding2); + fprintf (stderr, "to_allocate: %ld\n", (long)frame->to_allocate); + fprintf (stderr, "red_zone_size: %ld\n", (long)frame->red_zone_size); + fprintf (stderr, "frame_pointer_offset: %ld\n", (long)frame->frame_pointer_offset); + fprintf (stderr, "hard_frame_pointer_offset: %ld\n", + (long)frame->hard_frame_pointer_offset); + fprintf (stderr, "stack_pointer_offset: %ld\n", (long)frame->stack_pointer_offset); + fprintf (stderr, "current_function_is_leaf: %ld\n", (long)current_function_is_leaf); + fprintf (stderr, "current_function_calls_alloca: %ld\n", (long)current_function_calls_alloca); + fprintf (stderr, "x86_current_function_calls_tls_descriptor: %ld\n", (long)ix86_current_function_calls_tls_descriptor); #endif } @@ -5153,7 +5708,7 @@ pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset, int style) shouldn't be used together with huge frame sizes in one function because of the frame_size check in sibcall.c. 
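    R11 is a natural choice for the temporary here: in the 64-bit ABI it
    is call-clobbered and carries neither argument nor return data.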
*/ gcc_assert (style); - r11 = gen_rtx_REG (DImode, FIRST_REX_INT_REG + 3 /* R11 */); + r11 = gen_rtx_REG (DImode, R11_REG); insn = emit_insn (gen_rtx_SET (DImode, r11, offset)); if (style < 0) RTX_FRAME_RELATED_P (insn) = 1; @@ -5169,11 +5724,28 @@ pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset, int style) static rtx ix86_internal_arg_pointer (void) { - if (FORCE_PREFERRED_STACK_BOUNDARY_IN_MAIN - && DECL_NAME (current_function_decl) - && MAIN_NAME_P (DECL_NAME (current_function_decl)) - && DECL_FILE_SCOPE_P (current_function_decl)) - { + bool has_force_align_arg_pointer = + (0 != lookup_attribute (ix86_force_align_arg_pointer_string, + TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl)))); + if ((FORCE_PREFERRED_STACK_BOUNDARY_IN_MAIN + && DECL_NAME (current_function_decl) + && MAIN_NAME_P (DECL_NAME (current_function_decl)) + && DECL_FILE_SCOPE_P (current_function_decl)) + || ix86_force_align_arg_pointer + || has_force_align_arg_pointer) + { + /* Nested functions can't realign the stack due to a register + conflict. */ + if (DECL_CONTEXT (current_function_decl) + && TREE_CODE (DECL_CONTEXT (current_function_decl)) == FUNCTION_DECL) + { + if (ix86_force_align_arg_pointer) + warning (0, "-mstackrealign ignored for nested functions"); + if (has_force_align_arg_pointer) + error ("%s not supported for nested functions", + ix86_force_align_arg_pointer_string); + return virtual_incoming_args_rtx; + } cfun->machine->force_align_arg_pointer = gen_rtx_REG (Pmode, 2); return copy_to_reg (cfun->machine->force_align_arg_pointer); } @@ -5249,7 +5821,7 @@ ix86_expand_prologue (void) /* And here we cheat like madmen with the unwind info. We force the cfa register back to sp+4, which is exactly what it was at the start of the function. Re-pushing the return address results in - the return at the same spot relative to the cfa, and thus is + the return at the same spot relative to the cfa, and thus is correct wrt the unwind info. */ x = cfun->machine->force_align_arg_pointer; x = gen_frame_mem (Pmode, plus_constant (x, -4)); @@ -5393,7 +5965,7 @@ ix86_emit_restore_regs_using_mov (rtx pointer, HOST_WIDE_INT offset, { rtx r11; - r11 = gen_rtx_REG (DImode, FIRST_REX_INT_REG + 3 /* R11 */); + r11 = gen_rtx_REG (DImode, R11_REG); emit_move_insn (r11, GEN_INT (offset)); emit_insn (gen_adddi3 (r11, r11, pointer)); base_address = gen_rtx_MEM (Pmode, r11); @@ -5585,6 +6157,23 @@ ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED, { if (pic_offset_table_rtx) REGNO (pic_offset_table_rtx) = REAL_PIC_OFFSET_TABLE_REGNUM; +#if TARGET_MACHO + /* Mach-O doesn't support labels at the end of objects, so if + it looks like we might want one, insert a NOP. */ + { + rtx insn = get_last_insn (); + while (insn + && NOTE_P (insn) + && NOTE_LINE_NUMBER (insn) != NOTE_INSN_DELETED_LABEL) + insn = PREV_INSN (insn); + if (insn + && (LABEL_P (insn) + || (NOTE_P (insn) + && NOTE_LINE_NUMBER (insn) == NOTE_INSN_DELETED_LABEL))) + fputs ("\tnop\n", file); + } +#endif + } /* Extract the parts of an RTL expression that is a valid memory address @@ -5602,7 +6191,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) int retval = 1; enum ix86_address_seg seg = SEG_DEFAULT; - if (GET_CODE (addr) == REG || GET_CODE (addr) == SUBREG) + if (REG_P (addr) || GET_CODE (addr) == SUBREG) base = addr; else if (GET_CODE (addr) == PLUS) { @@ -5679,7 +6268,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) /* We're called for lea too, which implements ashift on occasion. 
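	 An address like (ashift (reg) (const_int 2)) is simply a scaled
	 index with no base or displacement.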
*/ index = XEXP (addr, 0); tmp = XEXP (addr, 1); - if (GET_CODE (tmp) != CONST_INT) + if (!CONST_INT_P (tmp)) return 0; scale = INTVAL (tmp); if ((unsigned HOST_WIDE_INT) scale > 3) @@ -5693,7 +6282,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) /* Extract the integral value of scale. */ if (scale_rtx) { - if (GET_CODE (scale_rtx) != CONST_INT) + if (!CONST_INT_P (scale_rtx)) return 0; scale = INTVAL (scale_rtx); } @@ -5822,7 +6411,7 @@ ix86_find_base_term (rtx x) return x; term = XEXP (x, 0); if (GET_CODE (term) == PLUS - && (GET_CODE (XEXP (term, 1)) == CONST_INT + && (CONST_INT_P (XEXP (term, 1)) || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE)) term = XEXP (term, 0); if (GET_CODE (term) != UNSPEC @@ -5882,7 +6471,7 @@ legitimate_constant_p (rtx x) if (GET_CODE (x) == PLUS) { - if (GET_CODE (XEXP (x, 1)) != CONST_INT) + if (!CONST_INT_P (XEXP (x, 1))) return false; x = XEXP (x, 0); } @@ -5922,6 +6511,18 @@ legitimate_constant_p (rtx x) return false; break; + case CONST_DOUBLE: + if (GET_MODE (x) == TImode + && x != CONST0_RTX (TImode) + && !TARGET_64BIT) + return false; + break; + + case CONST_VECTOR: + if (x == CONST0_RTX (GET_MODE (x))) + return true; + return false; + default: break; } @@ -5937,6 +6538,17 @@ legitimate_constant_p (rtx x) static bool ix86_cannot_force_const_mem (rtx x) { + /* We can always put integral constants and vectors in memory. */ + switch (GET_CODE (x)) + { + case CONST_INT: + case CONST_DOUBLE: + case CONST_VECTOR: + return false; + + default: + break; + } return !legitimate_constant_p (x); } @@ -5962,7 +6574,7 @@ legitimate_pic_operand_p (rtx x) case CONST: inner = XEXP (x, 0); if (GET_CODE (inner) == PLUS - && GET_CODE (XEXP (inner, 1)) == CONST_INT) + && CONST_INT_P (XEXP (inner, 1))) inner = XEXP (inner, 0); /* Only some unspecs are valid as "constants". 
*/ @@ -6013,10 +6625,10 @@ legitimate_pic_address_disp_p (rtx disp) break; op0 = XEXP (XEXP (disp, 0), 0); op1 = XEXP (XEXP (disp, 0), 1); - if (GET_CODE (op1) != CONST_INT + if (!CONST_INT_P (op1) || INTVAL (op1) >= 16*1024*1024 || INTVAL (op1) < -16*1024*1024) - break; + break; if (GET_CODE (op0) == LABEL_REF) return true; if (GET_CODE (op0) != SYMBOL_REF) @@ -6057,7 +6669,7 @@ legitimate_pic_address_disp_p (rtx disp) saw_plus = false; if (GET_CODE (disp) == PLUS) { - if (GET_CODE (XEXP (disp, 1)) != CONST_INT) + if (!CONST_INT_P (XEXP (disp, 1))) return 0; disp = XEXP (disp, 0); saw_plus = true; @@ -6151,7 +6763,7 @@ legitimate_address_p (enum machine_mode mode, rtx addr, int strict) { rtx reg; reason_rtx = base; - + if (REG_P (base)) reg = base; else if (GET_CODE (base) == SUBREG @@ -6251,7 +6863,7 @@ legitimate_address_p (enum machine_mode mode, rtx addr, int strict) goto is_legitimate_pic; reason = "64bit address unspec"; goto report_error; - + case UNSPEC_GOTPCREL: gcc_assert (flag_pic); goto is_legitimate_pic; @@ -6268,12 +6880,16 @@ legitimate_address_p (enum machine_mode mode, rtx addr, int strict) goto report_error; } - else if (flag_pic && (SYMBOLIC_CONST (disp) + else if (SYMBOLIC_CONST (disp) + && (flag_pic + || (TARGET_MACHO #if TARGET_MACHO - && !machopic_operand_p (disp) + && MACHOPIC_INDIRECT + && !machopic_operand_p (disp) #endif - )) + ))) { + is_legitimate_pic: if (TARGET_64BIT && (index || base)) { @@ -6281,7 +6897,7 @@ legitimate_address_p (enum machine_mode mode, rtx addr, int strict) if (GET_CODE (disp) != CONST || GET_CODE (XEXP (disp, 0)) != PLUS || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC - || GET_CODE (XEXP (XEXP (disp, 0), 1)) != CONST_INT + || !CONST_INT_P (XEXP (XEXP (disp, 0), 1)) || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF)) { @@ -6318,7 +6934,7 @@ legitimate_address_p (enum machine_mode mode, rtx addr, int strict) correct fix for crash to disable this test. */ } else if (GET_CODE (disp) != LABEL_REF - && GET_CODE (disp) != CONST_INT + && !CONST_INT_P (disp) && (GET_CODE (disp) != CONST || !legitimate_constant_p (disp)) && (GET_CODE (disp) != SYMBOL_REF @@ -6386,10 +7002,13 @@ legitimize_pic_address (rtx orig, rtx reg) rtx base; #if TARGET_MACHO - if (reg == 0) - reg = gen_reg_rtx (Pmode); - /* Use the generic Mach-O PIC machinery. */ - return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg); + if (TARGET_MACHO && !TARGET_64BIT) + { + if (reg == 0) + reg = gen_reg_rtx (Pmode); + /* Use the generic Mach-O PIC machinery. */ + return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg); + } #endif if (TARGET_64BIT && legitimate_pic_address_disp_p (addr)) @@ -6453,7 +7072,7 @@ legitimize_pic_address (rtx orig, rtx reg) new = reg; } } - else if (GET_CODE (addr) == SYMBOL_REF) + else if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0) { if (TARGET_64BIT) { @@ -6491,7 +7110,7 @@ legitimize_pic_address (rtx orig, rtx reg) } else { - if (GET_CODE (addr) == CONST_INT + if (CONST_INT_P (addr) && !x86_64_immediate_operand (addr, VOIDmode)) { if (reg) @@ -6522,7 +7141,7 @@ legitimize_pic_address (rtx orig, rtx reg) /* Check first to see if this is a constant offset from a @GOTOFF symbol reference. */ if (local_symbolic_operand (op0, Pmode) - && GET_CODE (op1) == CONST_INT) + && CONST_INT_P (op1)) { if (!TARGET_64BIT) { @@ -6557,7 +7176,7 @@ legitimize_pic_address (rtx orig, rtx reg) new = legitimize_pic_address (XEXP (addr, 1), base == reg ? 
NULL_RTX : reg); - if (GET_CODE (new) == CONST_INT) + if (CONST_INT_P (new)) new = plus_constant (base, INTVAL (new)); else { @@ -6658,15 +7277,22 @@ legitimize_tls_address (rtx x, enum tls_model model, int for_mov) { rtx x = ix86_tls_module_base (); - base = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, base)); - - set_unique_reg_note (get_last_insn (), REG_EQUIV, x); + set_unique_reg_note (get_last_insn (), REG_EQUIV, + gen_rtx_MINUS (Pmode, x, tp)); } off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF); off = gen_rtx_CONST (Pmode, off); dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off)); + + if (TARGET_GNU2_TLS) + { + dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp)); + + set_unique_reg_note (get_last_insn (), REG_EQUIV, x); + } + break; case TLS_MODEL_INITIAL_EXEC: @@ -6792,7 +7418,7 @@ legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode) /* Canonicalize shifts by 0, 1, 2, 3 into multiply */ if (GET_CODE (x) == ASHIFT - && GET_CODE (XEXP (x, 1)) == CONST_INT + && CONST_INT_P (XEXP (x, 1)) && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4) { changed = 1; @@ -6806,7 +7432,7 @@ legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode) /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */ if (GET_CODE (XEXP (x, 0)) == ASHIFT - && GET_CODE (XEXP (XEXP (x, 0), 1)) == CONST_INT + && CONST_INT_P (XEXP (XEXP (x, 0), 1)) && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4) { changed = 1; @@ -6817,7 +7443,7 @@ legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode) } if (GET_CODE (XEXP (x, 1)) == ASHIFT - && GET_CODE (XEXP (XEXP (x, 1), 1)) == CONST_INT + && CONST_INT_P (XEXP (XEXP (x, 1), 1)) && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4) { changed = 1; @@ -6860,12 +7486,12 @@ legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode) rtx constant; rtx other = NULL_RTX; - if (GET_CODE (XEXP (x, 1)) == CONST_INT) + if (CONST_INT_P (XEXP (x, 1))) { constant = XEXP (x, 1); other = XEXP (XEXP (XEXP (x, 0), 1), 1); } - else if (GET_CODE (XEXP (XEXP (XEXP (x, 0), 1), 1)) == CONST_INT) + else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1))) { constant = XEXP (XEXP (XEXP (x, 0), 1), 1); other = XEXP (x, 1); @@ -6899,8 +7525,8 @@ legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode) } if (changed - && GET_CODE (XEXP (x, 1)) == REG - && GET_CODE (XEXP (x, 0)) == REG) + && REG_P (XEXP (x, 1)) + && REG_P (XEXP (x, 0))) return x; if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1))) @@ -6912,7 +7538,7 @@ legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode) if (changed && legitimate_address_p (mode, x, FALSE)) return x; - if (GET_CODE (XEXP (x, 0)) == REG) + if (REG_P (XEXP (x, 0))) { rtx temp = gen_reg_rtx (Pmode); rtx val = force_operand (XEXP (x, 1), temp); @@ -6923,7 +7549,7 @@ legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode) return x; } - else if (GET_CODE (XEXP (x, 1)) == REG) + else if (REG_P (XEXP (x, 1))) { rtx temp = gen_reg_rtx (Pmode); rtx val = force_operand (XEXP (x, 0), temp); @@ -6998,15 +7624,15 @@ output_pic_addr_const (FILE *file, rtx x, int code) case PLUS: /* Some assemblers need integer constants to appear first. 
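	 That is, "2+foo" is printed rather than "foo+2".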
*/ - if (GET_CODE (XEXP (x, 0)) == CONST_INT) + if (CONST_INT_P (XEXP (x, 0))) { output_pic_addr_const (file, XEXP (x, 0), code); putc ('+', file); output_pic_addr_const (file, XEXP (x, 1), code); } - else + else { - gcc_assert (GET_CODE (XEXP (x, 1)) == CONST_INT); + gcc_assert (CONST_INT_P (XEXP (x, 1))); output_pic_addr_const (file, XEXP (x, 1), code); putc ('+', file); output_pic_addr_const (file, XEXP (x, 0), code); @@ -7096,7 +7722,7 @@ i386_output_dwarf_dtprel (FILE *file, int size, rtx x) /* In the name of slightly smaller debug output, and to cater to general assembler lossage, recognize PIC+GOTOFF and turn it back - into a direct symbol reference. + into a direct symbol reference. On Darwin, this is necessary to avoid a crash, because Darwin has a different PIC label for each routine but the DWARF debugging @@ -7115,7 +7741,7 @@ ix86_delegitimize_address (rtx orig_x) /* This is the result, or NULL. */ rtx result = NULL_RTX; - if (GET_CODE (x) == MEM) + if (MEM_P (x)) x = XEXP (x, 0); if (TARGET_64BIT) @@ -7123,7 +7749,7 @@ ix86_delegitimize_address (rtx orig_x) if (GET_CODE (x) != CONST || GET_CODE (XEXP (x, 0)) != UNSPEC || XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL - || GET_CODE (orig_x) != MEM) + || !MEM_P (orig_x)) return orig_x; return XVECEXP (XEXP (x, 0), 0, 0); } @@ -7132,7 +7758,7 @@ ix86_delegitimize_address (rtx orig_x) || GET_CODE (XEXP (x, 1)) != CONST) return orig_x; - if (GET_CODE (XEXP (x, 0)) == REG + if (REG_P (XEXP (x, 0)) && REGNO (XEXP (x, 0)) == PIC_OFFSET_TABLE_REGNUM) /* %ebx + GOT/GOTOFF */ ; @@ -7140,15 +7766,15 @@ ix86_delegitimize_address (rtx orig_x) { /* %ebx + %reg * scale + GOT/GOTOFF */ reg_addend = XEXP (x, 0); - if (GET_CODE (XEXP (reg_addend, 0)) == REG + if (REG_P (XEXP (reg_addend, 0)) && REGNO (XEXP (reg_addend, 0)) == PIC_OFFSET_TABLE_REGNUM) reg_addend = XEXP (reg_addend, 1); - else if (GET_CODE (XEXP (reg_addend, 1)) == REG + else if (REG_P (XEXP (reg_addend, 1)) && REGNO (XEXP (reg_addend, 1)) == PIC_OFFSET_TABLE_REGNUM) reg_addend = XEXP (reg_addend, 0); else return orig_x; - if (GET_CODE (reg_addend) != REG + if (!REG_P (reg_addend) && GET_CODE (reg_addend) != MULT && GET_CODE (reg_addend) != ASHIFT) return orig_x; @@ -7158,24 +7784,24 @@ ix86_delegitimize_address (rtx orig_x) x = XEXP (XEXP (x, 1), 0); if (GET_CODE (x) == PLUS - && GET_CODE (XEXP (x, 1)) == CONST_INT) + && CONST_INT_P (XEXP (x, 1))) { const_addend = XEXP (x, 1); x = XEXP (x, 0); } if (GET_CODE (x) == UNSPEC - && ((XINT (x, 1) == UNSPEC_GOT && GET_CODE (orig_x) == MEM) - || (XINT (x, 1) == UNSPEC_GOTOFF && GET_CODE (orig_x) != MEM))) + && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x)) + || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x)))) result = XVECEXP (x, 0, 0); if (TARGET_MACHO && darwin_local_data_pic (x) - && GET_CODE (orig_x) != MEM) + && !MEM_P (orig_x)) result = XEXP (x, 0); if (! result) return orig_x; - + if (const_addend) result = gen_rtx_PLUS (Pmode, result, const_addend); if (reg_addend) @@ -7295,7 +7921,8 @@ print_reg (rtx x, int code, FILE *file) gcc_assert (REGNO (x) != ARG_POINTER_REGNUM && REGNO (x) != FRAME_POINTER_REGNUM && REGNO (x) != FLAGS_REG - && REGNO (x) != FPSR_REG); + && REGNO (x) != FPSR_REG + && REGNO (x) != FPCR_REG); if (ASSEMBLER_DIALECT == ASM_ATT || USER_LABEL_PREFIX[0] == 0) putc ('%', file); @@ -7467,7 +8094,7 @@ print_operand (FILE *file, rtx x, int code) case ASM_INTEL: /* Intel syntax. For absolute addresses, registers should not be surrounded by braces. 
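	     Everything else is wrapped in "[]" below, so a bare symbol is
	     printed as "[sym]" while a register stays unbracketed.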
*/ - if (GET_CODE (x) != REG) + if (!REG_P (x)) { putc ('[', file); PRINT_OPERAND (file, x, 0); @@ -7527,6 +8154,10 @@ print_operand (FILE *file, rtx x, int code) /* This is the size of op from size of operand. */ switch (GET_MODE_SIZE (GET_MODE (x))) { + case 1: + putc ('b', file); + return; + case 2: #ifdef HAVE_GAS_FILDS_FISTS putc ('s', file); @@ -7577,7 +8208,7 @@ print_operand (FILE *file, rtx x, int code) break; case 's': - if (GET_CODE (x) == CONST_INT || ! SHIFT_DOUBLE_OMITS_COUNT) + if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT) { PRINT_OPERAND (file, x, 0); putc (',', file); @@ -7715,10 +8346,10 @@ print_operand (FILE *file, rtx x, int code) } } - if (GET_CODE (x) == REG) + if (REG_P (x)) print_reg (x, code, file); - else if (GET_CODE (x) == MEM) + else if (MEM_P (x)) { /* No `byte ptr' prefix for call instructions. */ if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P') @@ -7751,7 +8382,7 @@ print_operand (FILE *file, rtx x, int code) x = XEXP (x, 0); /* Avoid (%rip) for call operands. */ if (CONSTANT_ADDRESS_P (x) && code == 'P' - && GET_CODE (x) != CONST_INT) + && !CONST_INT_P (x)) output_addr_const (file, x); else if (this_is_asm_operands && ! address_operand (x, VOIDmode)) output_operand_lossage ("invalid constraints for operand"); @@ -7803,7 +8434,7 @@ print_operand (FILE *file, rtx x, int code) if (code != 'P') { - if (GET_CODE (x) == CONST_INT || GET_CODE (x) == CONST_DOUBLE) + if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE) { if (ASSEMBLER_DIALECT == ASM_ATT) putc ('$', file); @@ -7817,7 +8448,7 @@ print_operand (FILE *file, rtx x, int code) fputs ("OFFSET FLAT:", file); } } - if (GET_CODE (x) == CONST_INT) + if (CONST_INT_P (x)) fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); else if (flag_pic) output_pic_addr_const (file, x, code); @@ -7861,7 +8492,7 @@ print_operand_address (FILE *file, rtx addr) { /* Displacement only requires special attention. */ - if (GET_CODE (disp) == CONST_INT) + if (CONST_INT_P (disp)) { if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT) { @@ -7881,7 +8512,7 @@ print_operand_address (FILE *file, rtx addr) { if (GET_CODE (disp) == CONST && GET_CODE (XEXP (disp, 0)) == PLUS - && GET_CODE (XEXP (XEXP (disp, 0), 1)) == CONST_INT) + && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) disp = XEXP (XEXP (disp, 0), 0); if (GET_CODE (disp) == LABEL_REF || (GET_CODE (disp) == SYMBOL_REF @@ -7924,7 +8555,7 @@ print_operand_address (FILE *file, rtx addr) /* Pull out the offset of a symbol; print any symbol itself. */ if (GET_CODE (disp) == CONST && GET_CODE (XEXP (disp, 0)) == PLUS - && GET_CODE (XEXP (XEXP (disp, 0), 1)) == CONST_INT) + && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) { offset = XEXP (XEXP (disp, 0), 1); disp = gen_rtx_CONST (VOIDmode, @@ -7935,7 +8566,7 @@ print_operand_address (FILE *file, rtx addr) output_pic_addr_const (file, disp, 0); else if (GET_CODE (disp) == LABEL_REF) output_asm_label (disp); - else if (GET_CODE (disp) == CONST_INT) + else if (CONST_INT_P (disp)) offset = disp; else output_addr_const (file, disp); @@ -8034,7 +8665,7 @@ split_di (rtx operands[], int num, rtx lo_half[], rtx hi_half[]) /* simplify_subreg refuse to split volatile memory addresses, but we still have to handle it. 
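   In C terms, the split below takes the SImode half at byte offset 0 as
   the low part and the half at offset 4 as the high part.  A minimal
   scalar sketch, assuming the little-endian layout x86 uses (split_u64
   is a hypothetical name used only for illustration):

     #include <stdint.h>

     static void
     split_u64 (uint64_t op, uint32_t *lo_half, uint32_t *hi_half)
     {
       *lo_half = (uint32_t) op;          // adjust_address (op, SImode, 0)
       *hi_half = (uint32_t) (op >> 32);  // adjust_address (op, SImode, 4)
     }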
*/ - if (GET_CODE (op) == MEM) + if (MEM_P (op)) { lo_half[num] = adjust_address (op, SImode, 0); hi_half[num] = adjust_address (op, SImode, 4); @@ -8065,7 +8696,7 @@ split_ti (rtx operands[], int num, rtx lo_half[], rtx hi_half[]) /* simplify_subreg refuse to split volatile memory addresses, but we still have to handle it. */ - if (GET_CODE (op) == MEM) + if (MEM_P (op)) { lo_half[num] = adjust_address (op, DImode, 0); hi_half[num] = adjust_address (op, DImode, 8); @@ -8109,10 +8740,10 @@ output_387_binary_op (rtx insn, rtx *operands) if (STACK_REG_P (operands[0]) && ((REG_P (operands[1]) && REGNO (operands[0]) == REGNO (operands[1]) - && (STACK_REG_P (operands[2]) || GET_CODE (operands[2]) == MEM)) + && (STACK_REG_P (operands[2]) || MEM_P (operands[2]))) || (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]) - && (STACK_REG_P (operands[1]) || GET_CODE (operands[1]) == MEM))) + && (STACK_REG_P (operands[1]) || MEM_P (operands[1])))) && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2]))) ; /* ok */ else @@ -8185,7 +8816,7 @@ output_387_binary_op (rtx insn, rtx *operands) /* know operands[0] == operands[1]. */ - if (GET_CODE (operands[2]) == MEM) + if (MEM_P (operands[2])) { p = "%z2\t%2"; break; @@ -8215,13 +8846,13 @@ output_387_binary_op (rtx insn, rtx *operands) case MINUS: case DIV: - if (GET_CODE (operands[1]) == MEM) + if (MEM_P (operands[1])) { p = "r%z1\t%1"; break; } - if (GET_CODE (operands[2]) == MEM) + if (MEM_P (operands[2])) { p = "%z2\t%2"; break; @@ -8367,7 +8998,7 @@ emit_i387_cw_initialization (int mode) rtx reg = gen_reg_rtx (HImode); emit_insn (gen_x86_fnstcw_1 (stored_mode)); - emit_move_insn (reg, stored_mode); + emit_move_insn (reg, copy_rtx (stored_mode)); if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL || optimize_size) { @@ -8424,7 +9055,7 @@ emit_i387_cw_initialization (int mode) emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8))); slot = SLOT_CW_CEIL; break; - + case I387_CW_MASK_PM: /* mask precision exception for nearbyint() */ emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020))); @@ -8460,7 +9091,7 @@ output_fix_trunc (rtx insn, rtx *operands, int fisttp) output_asm_insn ("fld\t%y1", operands); gcc_assert (STACK_TOP_P (operands[1])); - gcc_assert (GET_CODE (operands[0]) == MEM); + gcc_assert (MEM_P (operands[0])); if (fisttp) output_asm_insn ("fisttp%z0\t%0", operands); @@ -8479,6 +9110,32 @@ output_fix_trunc (rtx insn, rtx *operands, int fisttp) return ""; } +/* Output code for x87 ffreep insn. The OPNO argument, which may only + have the values zero or one, indicates the ffreep insn's operand + from the OPERANDS array. */ + +static const char * +output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno) +{ + if (TARGET_USE_FFREEP) +#if HAVE_AS_IX86_FFREEP + return opno ? "ffreep\t%y1" : "ffreep\t%y0"; +#else + { + static char retval[] = ".word\t0xc_df"; + int regno = REGNO (operands[opno]); + + gcc_assert (FP_REGNO_P (regno)); + + retval[9] = '0' + (regno - FIRST_STACK_REG); + return retval; + } +#endif + + return opno ? "fstp\t%y1" : "fstp\t%y0"; +} + + /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi should be used. UNORDERED_P is true when fucom should be used. */ @@ -8523,7 +9180,7 @@ output_fp_compare (rtx insn, rtx *operands, int eflags_p, int unordered_p) if (stack_top_dies) { output_asm_insn ("ftst\n\tfnstsw\t%0", operands); - return TARGET_USE_FFREEP ? 
"ffreep\t%y1" : "fstp\t%y1"; + return output_387_ffreep (operands, 1); } else return "ftst\n\tfnstsw\t%0"; @@ -8546,7 +9203,7 @@ output_fp_compare (rtx insn, rtx *operands, int eflags_p, int unordered_p) output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands); else output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands); - return TARGET_USE_FFREEP ? "ffreep\t%y0" : "fstp\t%y0"; + return output_387_ffreep (operands, 0); } else { @@ -8716,34 +9373,39 @@ ix86_expand_move (enum machine_mode mode, rtx operands[]) if (flag_pic && mode == Pmode && symbolic_operand (op1, Pmode)) { + if (TARGET_MACHO && !TARGET_64BIT) + { #if TARGET_MACHO - if (MACHOPIC_PURE) - { - rtx temp = ((reload_in_progress - || ((op0 && GET_CODE (op0) == REG) - && mode == Pmode)) - ? op0 : gen_reg_rtx (Pmode)); - op1 = machopic_indirect_data_reference (op1, temp); - op1 = machopic_legitimize_pic_address (op1, mode, - temp == op1 ? 0 : temp); - } - else if (MACHOPIC_INDIRECT) - op1 = machopic_indirect_data_reference (op1, 0); - if (op0 == op1) - return; -#else - if (GET_CODE (op0) == MEM) - op1 = force_reg (Pmode, op1); - else - op1 = legitimize_address (op1, op1, Pmode); -#endif /* TARGET_MACHO */ + if (MACHOPIC_PURE) + { + rtx temp = ((reload_in_progress + || ((op0 && REG_P (op0)) + && mode == Pmode)) + ? op0 : gen_reg_rtx (Pmode)); + op1 = machopic_indirect_data_reference (op1, temp); + op1 = machopic_legitimize_pic_address (op1, mode, + temp == op1 ? 0 : temp); + } + else if (MACHOPIC_INDIRECT) + op1 = machopic_indirect_data_reference (op1, 0); + if (op0 == op1) + return; +#endif + } + else + { + if (MEM_P (op0)) + op1 = force_reg (Pmode, op1); + else + op1 = legitimize_address (op1, op1, Pmode); + } } else { - if (GET_CODE (op0) == MEM + if (MEM_P (op0) && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode) || !push_operand (op0, mode)) - && GET_CODE (op1) == MEM) + && MEM_P (op1)) op1 = force_reg (mode, op1); if (push_operand (op0, mode) @@ -8795,7 +9457,8 @@ ix86_expand_vector_move (enum machine_mode mode, rtx operands[]) to handle some of them more efficiently. */ if ((reload_in_progress | reload_completed) == 0 && register_operand (op0, mode) - && CONSTANT_P (op1) && op1 != CONST0_RTX (mode)) + && CONSTANT_P (op1) + && standard_sse_constant_p (op1) <= 0) op1 = validize_mem (force_const_mem (mode, op1)); /* Make operand1 a register if it isn't already. */ @@ -8810,7 +9473,7 @@ ix86_expand_vector_move (enum machine_mode mode, rtx operands[]) emit_insn (gen_rtx_SET (VOIDmode, op0, op1)); } -/* Implement the movmisalign patterns for SSE. Non-SSE modes go +/* Implement the movmisalign patterns for SSE. Non-SSE modes go straight to ix86_expand_vector_move. */ void @@ -8844,8 +9507,16 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) } if (TARGET_SSE2 && mode == V2DFmode) - { - rtx zero; + { + rtx zero; + + if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL) + { + op0 = gen_lowpart (V2DFmode, op0); + op1 = gen_lowpart (V2DFmode, op1); + emit_insn (gen_sse2_movupd (op0, op1)); + return; + } /* When SSE registers are split into halves, we can avoid writing to the top half twice. 
*/ @@ -8873,7 +9544,15 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) emit_insn (gen_sse2_loadhpd (op0, op0, m)); } else - { + { + if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL) + { + op0 = gen_lowpart (V4SFmode, op0); + op1 = gen_lowpart (V4SFmode, op1); + emit_insn (gen_sse_movups (op0, op1)); + return; + } + if (TARGET_SSE_PARTIAL_REG_DEPENDENCY) emit_move_insn (op0, CONST0_RTX (mode)); else @@ -8949,6 +9628,43 @@ ix86_expand_push (enum machine_mode mode, rtx x) emit_move_insn (tmp, x); } +/* Helper function of ix86_fixup_binary_operands to canonicalize + operand order. Returns true if the operands should be swapped. */ + +static bool +ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode, + rtx operands[]) +{ + rtx dst = operands[0]; + rtx src1 = operands[1]; + rtx src2 = operands[2]; + + /* If the operation is not commutative, we can't do anything. */ + if (GET_RTX_CLASS (code) != RTX_COMM_ARITH) + return false; + + /* Highest priority is that src1 should match dst. */ + if (rtx_equal_p (dst, src1)) + return false; + if (rtx_equal_p (dst, src2)) + return true; + + /* Next highest priority is that immediate constants come second. */ + if (immediate_operand (src2, mode)) + return false; + if (immediate_operand (src1, mode)) + return true; + + /* Lowest priority is that memory references should come second. */ + if (MEM_P (src2)) + return false; + if (MEM_P (src1)) + return true; + + return false; +} + + /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the destination to use for the operation. If different from the true destination in operands[0], a copy operation will be required. */ @@ -8957,55 +9673,46 @@ rtx ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode, rtx operands[]) { - int matching_memory; - rtx src1, src2, dst; - - dst = operands[0]; - src1 = operands[1]; - src2 = operands[2]; + rtx dst = operands[0]; + rtx src1 = operands[1]; + rtx src2 = operands[2]; - /* Recognize = for commutative operators */ - if (GET_RTX_CLASS (code) == RTX_COMM_ARITH - && (rtx_equal_p (dst, src2) - || immediate_operand (src1, mode))) + /* Canonicalize operand order. */ + if (ix86_swap_binary_operands_p (code, mode, operands)) { rtx temp = src1; src1 = src2; src2 = temp; } - /* If the destination is memory, and we do not have matching source - operands, do things in registers. */ - matching_memory = 0; - if (GET_CODE (dst) == MEM) - { - if (rtx_equal_p (dst, src1)) - matching_memory = 1; - else if (GET_RTX_CLASS (code) == RTX_COMM_ARITH - && rtx_equal_p (dst, src2)) - matching_memory = 2; - else - dst = gen_reg_rtx (mode); - } - /* Both source operands cannot be in memory. */ - if (GET_CODE (src1) == MEM && GET_CODE (src2) == MEM) + if (MEM_P (src1) && MEM_P (src2)) { - if (matching_memory != 2) - src2 = force_reg (mode, src2); + /* Optimization: Only read from memory once. */ + if (rtx_equal_p (src1, src2)) + { + src2 = force_reg (mode, src2); + src1 = src2; + } else - src1 = force_reg (mode, src1); + src2 = force_reg (mode, src2); } - /* If the operation is not commutable, source 1 cannot be a constant - or non-matching memory. */ - if ((CONSTANT_P (src1) - || (!matching_memory && GET_CODE (src1) == MEM)) - && GET_RTX_CLASS (code) != RTX_COMM_ARITH) + /* If the destination is memory, and we do not have matching source + operands, do things in registers. */ + if (MEM_P (dst) && !rtx_equal_p (dst, src1)) + dst = gen_reg_rtx (mode); + + /* Source 1 cannot be a constant. 
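   (After canonicalization, a constant left in src1 typically means a
   non-commutative operation such as subtraction, and x86 has no
   reverse-subtract form taking an immediate minuend, so the constant
   must be loaded.)  The overall policy of ix86_swap_binary_operands_p
   above can be modeled in C; this helper is purely illustrative:

     static int
     swap_operands_p (int commutative, int dst_eq_src1, int dst_eq_src2,
                      int src1_imm, int src2_imm, int src1_mem, int src2_mem)
     {
       if (!commutative)
         return 0;          // operand order is fixed
       if (dst_eq_src1)
         return 0;          // highest priority: src1 already matches dst
       if (dst_eq_src2)
         return 1;
       if (src2_imm)
         return 0;          // next: immediates belong in src2
       if (src1_imm)
         return 1;
       if (src2_mem)
         return 0;          // last: memory operands belong in src2
       if (src1_mem)
         return 1;
       return 0;
     }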
*/ + if (CONSTANT_P (src1)) + src1 = force_reg (mode, src1); + + /* Source 1 cannot be a non-matching memory. */ + if (MEM_P (src1) && !rtx_equal_p (dst, src1)) src1 = force_reg (mode, src1); - src1 = operands[1] = src1; - src2 = operands[2] = src2; + operands[1] = src1; + operands[2] = src2; return dst; } @@ -9059,28 +9766,37 @@ ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode, appropriate constraints. */ int -ix86_binary_operator_ok (enum rtx_code code, - enum machine_mode mode ATTRIBUTE_UNUSED, +ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode, rtx operands[3]) { + rtx dst = operands[0]; + rtx src1 = operands[1]; + rtx src2 = operands[2]; + /* Both source operands cannot be in memory. */ - if (GET_CODE (operands[1]) == MEM && GET_CODE (operands[2]) == MEM) - return 0; - /* If the operation is not commutable, source 1 cannot be a constant. */ - if (CONSTANT_P (operands[1]) && GET_RTX_CLASS (code) != RTX_COMM_ARITH) + if (MEM_P (src1) && MEM_P (src2)) return 0; + + /* Canonicalize operand order for commutative operators. */ + if (ix86_swap_binary_operands_p (code, mode, operands)) + { + rtx temp = src1; + src1 = src2; + src2 = temp; + } + /* If the destination is memory, we must have a matching source operand. */ - if (GET_CODE (operands[0]) == MEM - && ! (rtx_equal_p (operands[0], operands[1]) - || (GET_RTX_CLASS (code) == RTX_COMM_ARITH - && rtx_equal_p (operands[0], operands[2])))) + if (MEM_P (dst) && !rtx_equal_p (dst, src1)) + return 0; + + /* Source 1 cannot be a constant. */ + if (CONSTANT_P (src1)) return 0; - /* If the operation is not commutable and the source 1 is memory, we must - have a matching destination. */ - if (GET_CODE (operands[1]) == MEM - && GET_RTX_CLASS (code) != RTX_COMM_ARITH - && ! rtx_equal_p (operands[0], operands[1])) + + /* Source 1 cannot be a non-matching memory. */ + if (MEM_P (src1) && !rtx_equal_p (dst, src1)) return 0; + return 1; } @@ -9143,150 +9859,368 @@ ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED, rtx operands[2] ATTRIBUTE_UNUSED) { /* If one of operands is memory, source and destination must match. */ - if ((GET_CODE (operands[0]) == MEM - || GET_CODE (operands[1]) == MEM) + if ((MEM_P (operands[0]) + || MEM_P (operands[1])) && ! rtx_equal_p (operands[0], operands[1])) return FALSE; return TRUE; } -/* A subroutine of ix86_expand_fp_absneg_operator and copysign expanders. - Create a mask for the sign bit in MODE for an SSE register. If VECT is - true, then replicate the mask for all elements of the vector register. - If INVERT is true, then create a mask excluding the sign bit. */ +/* Post-reload splitter for converting an SF or DFmode value in an + SSE register into an unsigned SImode. */ -rtx -ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert) +void +ix86_split_convert_uns_si_sse (rtx operands[]) { - enum machine_mode vec_mode; - HOST_WIDE_INT hi, lo; - int shift = 63; - rtvec v; - rtx mask; - - /* Find the sign bit, sign extended to 2*HWI. */ - if (mode == SFmode) - lo = 0x80000000, hi = lo < 0; - else if (HOST_BITS_PER_WIDE_INT >= 64) - lo = (HOST_WIDE_INT)1 << shift, hi = -1; - else - lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT); - - if (invert) - lo = ~lo, hi = ~hi; + enum machine_mode vecmode; + rtx value, large, zero_or_two31, input, two31, x; - /* Force this value into the low part of a fp vector constant. */ - mask = immed_double_const (lo, hi, mode == SFmode ? 
SImode : DImode); - mask = gen_lowpart (mode, mask); + large = operands[1]; + zero_or_two31 = operands[2]; + input = operands[3]; + two31 = operands[4]; + vecmode = GET_MODE (large); + value = gen_rtx_REG (vecmode, REGNO (operands[0])); - if (mode == SFmode) + /* Load up the value into the low element. We must ensure that the other + elements are valid floats -- zero is the easiest such value. */ + if (MEM_P (input)) { - if (vect) - v = gen_rtvec (4, mask, mask, mask, mask); + if (vecmode == V4SFmode) + emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input)); else - v = gen_rtvec (4, mask, CONST0_RTX (SFmode), - CONST0_RTX (SFmode), CONST0_RTX (SFmode)); - vec_mode = V4SFmode; + emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input)); } else { - if (vect) - v = gen_rtvec (2, mask, mask); + input = gen_rtx_REG (vecmode, REGNO (input)); + emit_move_insn (value, CONST0_RTX (vecmode)); + if (vecmode == V4SFmode) + emit_insn (gen_sse_movss (value, value, input)); else - v = gen_rtvec (2, mask, CONST0_RTX (DFmode)); - vec_mode = V2DFmode; + emit_insn (gen_sse2_movsd (value, value, input)); } - return force_reg (vec_mode, gen_rtx_CONST_VECTOR (vec_mode, v)); + emit_move_insn (large, two31); + emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31); + + x = gen_rtx_fmt_ee (LE, vecmode, large, value); + emit_insn (gen_rtx_SET (VOIDmode, large, x)); + + x = gen_rtx_AND (vecmode, zero_or_two31, large); + emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x)); + + x = gen_rtx_MINUS (vecmode, value, zero_or_two31); + emit_insn (gen_rtx_SET (VOIDmode, value, x)); + + large = gen_rtx_REG (V4SImode, REGNO (large)); + emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31))); + + x = gen_rtx_REG (V4SImode, REGNO (value)); + if (vecmode == V4SFmode) + emit_insn (gen_sse2_cvttps2dq (x, value)); + else + emit_insn (gen_sse2_cvttpd2dq (x, value)); + value = x; + + emit_insn (gen_xorv4si3 (value, value, large)); } -/* Generate code for floating point ABS or NEG. */ +/* Convert an unsigned DImode value into a DFmode, using only SSE. + Expects the 64-bit DImode to be supplied in a pair of integral + registers. Requires SSE2; will use SSE3 if available. For x86_32, + -mfpmath=sse, !optimize_size only. */ void -ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode, - rtx operands[]) +ix86_expand_convert_uns_didf_sse (rtx target, rtx input) { - rtx mask, set, use, clob, dst, src; - bool matching_memory; - bool use_sse = false; - bool vector_mode = VECTOR_MODE_P (mode); - enum machine_mode elt_mode = mode; + REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt; + rtx int_xmm, fp_xmm; + rtx biases, exponents; + rtx x; - if (vector_mode) + int_xmm = gen_reg_rtx (V4SImode); + if (TARGET_INTER_UNIT_MOVES) + emit_insn (gen_movdi_to_sse (int_xmm, input)); + else if (TARGET_SSE_SPLIT_REGS) { - elt_mode = GET_MODE_INNER (mode); - use_sse = true; + emit_insn (gen_rtx_CLOBBER (VOIDmode, int_xmm)); + emit_move_insn (gen_lowpart (DImode, int_xmm), input); } - else if (TARGET_SSE_MATH) - use_sse = SSE_FLOAT_MODE_P (mode); - - /* NEG and ABS performed with SSE use bitwise mask operations. - Create the appropriate mask now. */ - if (use_sse) - mask = ix86_build_signbit_mask (elt_mode, vector_mode, code == ABS); else { - /* When not using SSE, we don't use the mask, but prefer to keep the - same general form of the insn pattern to reduce duplication when - it comes time to split. 
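   The arithmetic behind ix86_expand_convert_uns_didf_sse above can be
   checked in scalar C: gluing exponent word 0x43300000 on top of the low
   word yields exactly 0x1.0p52 + lo, gluing 0x45300000 on the high word
   yields 0x1.0p84 + hi * 0x1.0p32, and after the bias subtractions a
   single addition performs the only rounding.  A sketch, assuming IEEE
   doubles and C99 hex-float literals (u64_to_double is a hypothetical
   name):

     #include <stdint.h>
     #include <string.h>

     static double
     u64_to_double (uint64_t x)
     {
       uint64_t lo_bits = ((uint64_t) 0x43300000 << 32) | (uint32_t) x;
       uint64_t hi_bits = ((uint64_t) 0x45300000 << 32) | (uint32_t) (x >> 32);
       double lo, hi;

       memcpy (&lo, &lo_bits, sizeof lo);   // exact: 2^52 + low word
       memcpy (&hi, &hi_bits, sizeof hi);   // exact: 2^84 + high word * 2^32
       return (hi - 0x1.0p84) + (lo - 0x1.0p52);   // single rounding
     }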
*/ - mask = const0_rtx; + x = gen_reg_rtx (V2DImode); + ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0); + emit_move_insn (int_xmm, gen_lowpart (V4SImode, x)); } - dst = operands[0]; - src = operands[1]; + x = gen_rtx_CONST_VECTOR (V4SImode, + gen_rtvec (4, GEN_INT (0x43300000UL), + GEN_INT (0x45300000UL), + const0_rtx, const0_rtx)); + exponents = validize_mem (force_const_mem (V4SImode, x)); - /* If the destination is memory, and we don't have matching source - operands, do things in registers. */ - matching_memory = false; - if (MEM_P (dst)) - { - if (rtx_equal_p (dst, src)) - matching_memory = true; - else - dst = gen_reg_rtx (mode); - } - if (MEM_P (src) && !matching_memory) - src = force_reg (mode, src); + /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */ + emit_insn (gen_sse2_punpckldq (int_xmm, int_xmm, exponents)); - if (vector_mode) - { - set = gen_rtx_fmt_ee (code == NEG ? XOR : AND, mode, src, mask); - set = gen_rtx_SET (VOIDmode, dst, set); - emit_insn (set); - } + /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm) + yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)). + Similarly (0x45300000UL ## fp_value_hi_xmm) yields + (0x1.0p84 + double(fp_value_hi_xmm)). + Note these exponents differ by 32. */ + + fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm)); + + /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values + in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */ + real_ldexp (&bias_lo_rvt, &dconst1, 52); + real_ldexp (&bias_hi_rvt, &dconst1, 84); + biases = const_double_from_real_value (bias_lo_rvt, DFmode); + x = const_double_from_real_value (bias_hi_rvt, DFmode); + biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x)); + biases = validize_mem (force_const_mem (V2DFmode, biases)); + emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases)); + + /* Add the upper and lower DFmode values together. */ + if (TARGET_SSE3) + emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm)); else { - set = gen_rtx_fmt_e (code, mode, src); - set = gen_rtx_SET (VOIDmode, dst, set); - use = gen_rtx_USE (VOIDmode, mask); - clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); - emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (3, set, use, clob))); + x = copy_to_mode_reg (V2DFmode, fp_xmm); + emit_insn (gen_sse2_unpckhpd (fp_xmm, fp_xmm, fp_xmm)); + emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x)); } - if (dst != operands[0]) - emit_move_insn (operands[0], dst); + ix86_expand_vector_extract (false, target, fp_xmm, 0); } -/* Expand a copysign operation. Special case operand 0 being a constant. */ +/* Convert an unsigned SImode value into a DFmode. Only currently used + for SSE, but applicable anywhere. */ void -ix86_expand_copysign (rtx operands[]) +ix86_expand_convert_uns_sidf_sse (rtx target, rtx input) { - enum machine_mode mode, vmode; - rtx dest, op0, op1, mask, nmask; + REAL_VALUE_TYPE TWO31r; + rtx x, fp; - dest = operands[0]; - op0 = operands[1]; - op1 = operands[2]; + x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1), + NULL, 1, OPTAB_DIRECT); - mode = GET_MODE (dest); - vmode = mode == SFmode ? 
V4SFmode : V2DFmode; + fp = gen_reg_rtx (DFmode); + emit_insn (gen_floatsidf2 (fp, x)); - if (GET_CODE (op0) == CONST_DOUBLE) - { - rtvec v; + real_ldexp (&TWO31r, &dconst1, 31); + x = const_double_from_real_value (TWO31r, DFmode); - if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0))) + x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT); + if (x != target) + emit_move_insn (target, x); +} + +/* Convert a signed DImode value into a DFmode. Only used for SSE in + 32-bit mode; otherwise we have a direct convert instruction. */ + +void +ix86_expand_convert_sign_didf_sse (rtx target, rtx input) +{ + REAL_VALUE_TYPE TWO32r; + rtx fp_lo, fp_hi, x; + + fp_lo = gen_reg_rtx (DFmode); + fp_hi = gen_reg_rtx (DFmode); + + emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input))); + + real_ldexp (&TWO32r, &dconst1, 32); + x = const_double_from_real_value (TWO32r, DFmode); + fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT); + + ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input)); + + x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target, + 0, OPTAB_DIRECT); + if (x != target) + emit_move_insn (target, x); +} + +/* Convert an unsigned SImode value into a SFmode, using only SSE. + For x86_32, -mfpmath=sse, !optimize_size only. */ +void +ix86_expand_convert_uns_sisf_sse (rtx target, rtx input) +{ + REAL_VALUE_TYPE ONE16r; + rtx fp_hi, fp_lo, int_hi, int_lo, x; + + real_ldexp (&ONE16r, &dconst1, 16); + x = const_double_from_real_value (ONE16r, SFmode); + int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff), + NULL, 0, OPTAB_DIRECT); + int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16), + NULL, 0, OPTAB_DIRECT); + fp_hi = gen_reg_rtx (SFmode); + fp_lo = gen_reg_rtx (SFmode); + emit_insn (gen_floatsisf2 (fp_hi, int_hi)); + emit_insn (gen_floatsisf2 (fp_lo, int_lo)); + fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi, + 0, OPTAB_DIRECT); + fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target, + 0, OPTAB_DIRECT); + if (!rtx_equal_p (target, fp_hi)) + emit_move_insn (target, fp_hi); +} + +/* A subroutine of ix86_build_signbit_mask_vector. If VECT is true, + then replicate the value for all elements of the vector + register. */ + +rtx +ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value) +{ + rtvec v; + switch (mode) + { + case SFmode: + if (vect) + v = gen_rtvec (4, value, value, value, value); + else + v = gen_rtvec (4, value, CONST0_RTX (SFmode), + CONST0_RTX (SFmode), CONST0_RTX (SFmode)); + return gen_rtx_CONST_VECTOR (V4SFmode, v); + + case DFmode: + if (vect) + v = gen_rtvec (2, value, value); + else + v = gen_rtvec (2, value, CONST0_RTX (DFmode)); + return gen_rtx_CONST_VECTOR (V2DFmode, v); + + default: + gcc_unreachable (); + } +} + +/* A subroutine of ix86_expand_fp_absneg_operator and copysign expanders. + Create a mask for the sign bit in MODE for an SSE register. If VECT is + true, then replicate the mask for all elements of the vector register. + If INVERT is true, then create a mask excluding the sign bit. */ + +rtx +ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert) +{ + enum machine_mode vec_mode; + HOST_WIDE_INT hi, lo; + int shift = 63; + rtx v; + rtx mask; + + /* Find the sign bit, sign extended to 2*HWI. 
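   These masks are what let SSE implement NEG and ABS as pure bit
   operations.  The same computation in scalar C, assuming IEEE single
   precision (function names are hypothetical):

     #include <stdint.h>
     #include <string.h>

     static float
     negsf_via_mask (float x)       // NEG: XOR with the sign bit
     {
       uint32_t bits;
       memcpy (&bits, &x, sizeof bits);
       bits ^= 0x80000000u;         // the mask with invert == false
       memcpy (&x, &bits, sizeof x);
       return x;
     }

     static float
     abssf_via_mask (float x)       // ABS: AND with the inverted mask
     {
       uint32_t bits;
       memcpy (&bits, &x, sizeof bits);
       bits &= ~0x80000000u;        // the mask with invert == true
       memcpy (&x, &bits, sizeof x);
       return x;
     }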
*/ + if (mode == SFmode) + lo = 0x80000000, hi = lo < 0; + else if (HOST_BITS_PER_WIDE_INT >= 64) + lo = (HOST_WIDE_INT)1 << shift, hi = -1; + else + lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT); + + if (invert) + lo = ~lo, hi = ~hi; + + /* Force this value into the low part of a fp vector constant. */ + mask = immed_double_const (lo, hi, mode == SFmode ? SImode : DImode); + mask = gen_lowpart (mode, mask); + + v = ix86_build_const_vector (mode, vect, mask); + vec_mode = (mode == SFmode) ? V4SFmode : V2DFmode; + return force_reg (vec_mode, v); +} + +/* Generate code for floating point ABS or NEG. */ + +void +ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode, + rtx operands[]) +{ + rtx mask, set, use, clob, dst, src; + bool matching_memory; + bool use_sse = false; + bool vector_mode = VECTOR_MODE_P (mode); + enum machine_mode elt_mode = mode; + + if (vector_mode) + { + elt_mode = GET_MODE_INNER (mode); + use_sse = true; + } + else if (TARGET_SSE_MATH) + use_sse = SSE_FLOAT_MODE_P (mode); + + /* NEG and ABS performed with SSE use bitwise mask operations. + Create the appropriate mask now. */ + if (use_sse) + mask = ix86_build_signbit_mask (elt_mode, vector_mode, code == ABS); + else + mask = NULL_RTX; + + dst = operands[0]; + src = operands[1]; + + /* If the destination is memory, and we don't have matching source + operands or we're using the x87, do things in registers. */ + matching_memory = false; + if (MEM_P (dst)) + { + if (use_sse && rtx_equal_p (dst, src)) + matching_memory = true; + else + dst = gen_reg_rtx (mode); + } + if (MEM_P (src) && !matching_memory) + src = force_reg (mode, src); + + if (vector_mode) + { + set = gen_rtx_fmt_ee (code == NEG ? XOR : AND, mode, src, mask); + set = gen_rtx_SET (VOIDmode, dst, set); + emit_insn (set); + } + else + { + set = gen_rtx_fmt_e (code, mode, src); + set = gen_rtx_SET (VOIDmode, dst, set); + if (mask) + { + use = gen_rtx_USE (VOIDmode, mask); + clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG)); + emit_insn (gen_rtx_PARALLEL (VOIDmode, + gen_rtvec (3, set, use, clob))); + } + else + emit_insn (set); + } + + if (dst != operands[0]) + emit_move_insn (operands[0], dst); +} + +/* Expand a copysign operation. Special case operand 0 being a constant. */ + +void +ix86_expand_copysign (rtx operands[]) +{ + enum machine_mode mode, vmode; + rtx dest, op0, op1, mask, nmask; + + dest = operands[0]; + op0 = operands[1]; + op1 = operands[2]; + + mode = GET_MODE (dest); + vmode = mode == SFmode ? V4SFmode : V2DFmode; + + if (GET_CODE (op0) == CONST_DOUBLE) + { + rtvec v; + + if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0))) op0 = simplify_unary_operation (ABS, mode, op0, mode); if (op0 == CONST0_RTX (mode)) @@ -9653,16 +10587,16 @@ ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1) into a register. */ if (standard_80387_constant_p (op0) == 0 - || (GET_CODE (op0) == MEM + || (MEM_P (op0) && ! (standard_80387_constant_p (op1) == 0 - || GET_CODE (op1) == MEM))) + || MEM_P (op1)))) { rtx tmp; tmp = op0, op0 = op1, op1 = tmp; code = swap_condition (code); } - if (GET_CODE (op0) != REG) + if (!REG_P (op0)) op0 = force_reg (op_mode, op0); if (CONSTANT_P (op1)) @@ -9683,12 +10617,12 @@ ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1) /* Try to rearrange the comparison to make it cheaper. 
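   Swapping is safe because it is always paired with swap_condition: for
   any values, (a OP b) equals (b OP' a) when OP' is the swapped
   condition, NaN operands included.  Trivially, in C:

     static int lt (double a, double b)         { return a < b; }
     static int lt_swapped (double a, double b) { return b > a; }  // same result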
*/ if (ix86_fp_comparison_cost (code) > ix86_fp_comparison_cost (swap_condition (code)) - && (GET_CODE (op1) == REG || !no_new_pseudos)) + && (REG_P (op1) || !no_new_pseudos)) { rtx tmp; tmp = op0, op0 = op1, op1 = tmp; code = swap_condition (code); - if (GET_CODE (op0) != REG) + if (!REG_P (op0)) op0 = force_reg (op_mode, op0); } @@ -10112,6 +11046,12 @@ ix86_expand_branch (enum rtx_code code, rtx label) { rtx tmp; + /* If we have emitted a compare insn, go straight to simple. + ix86_expand_compare won't emit anything if ix86_compare_emitted + is non NULL. */ + if (ix86_compare_emitted) + goto simple; + switch (GET_MODE (ix86_compare_op0)) { case QImode: @@ -10236,7 +11176,7 @@ ix86_expand_branch (enum rtx_code code, rtx label) op1 is a constant and the low word is zero, then we can just examine the high word. */ - if (GET_CODE (hi[1]) == CONST_INT && lo[1] == const0_rtx) + if (CONST_INT_P (hi[1]) && lo[1] == const0_rtx) switch (code) { case LT: case LTU: case GE: case GEU: @@ -10444,7 +11384,7 @@ ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop) enum machine_mode mode = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1); - /* Do not handle DImode compares that go trought special path. Also we can't + /* Do not handle DImode compares that go through special path. Also we can't deal with FP compares yet. This is possible to add. */ if (mode == (TARGET_64BIT ? TImode : DImode)) return false; @@ -10510,7 +11450,7 @@ ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop) /* Convert a>b into b=b-1. */ case GTU: case LEU: - if (GET_CODE (op1) == CONST_INT) + if (CONST_INT_P (op1)) { op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0)); /* Bail out on overflow. We still can swap operands but that @@ -10587,8 +11527,8 @@ ix86_expand_int_movcc (rtx operands[]) if ((mode != HImode || TARGET_FAST_PREFIX) && (mode != (TARGET_64BIT ? TImode : DImode)) - && GET_CODE (operands[2]) == CONST_INT - && GET_CODE (operands[3]) == CONST_INT) + && CONST_INT_P (operands[2]) + && CONST_INT_P (operands[3])) { rtx out = operands[0]; HOST_WIDE_INT ct = INTVAL (operands[2]); @@ -10763,7 +11703,7 @@ ix86_expand_int_movcc (rtx operands[]) compare_code = UNKNOWN; if (GET_MODE_CLASS (GET_MODE (ix86_compare_op0)) == MODE_INT - && GET_CODE (ix86_compare_op1) == CONST_INT) + && CONST_INT_P (ix86_compare_op1)) { if (ix86_compare_op1 == const0_rtx && (code == LT || code == GE)) @@ -10975,7 +11915,7 @@ ix86_expand_int_movcc (rtx operands[]) /* If one of the two operands is an interesting constant, load a constant with the above and mask it in with a logical operation. */ - if (GET_CODE (operands[2]) == CONST_INT) + if (CONST_INT_P (operands[2])) { var = operands[3]; if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx) @@ -10985,7 +11925,7 @@ ix86_expand_int_movcc (rtx operands[]) else return 0; /* FAIL */ } - else if (GET_CODE (operands[3]) == CONST_INT) + else if (CONST_INT_P (operands[3])) { var = operands[2]; if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx) @@ -11416,6 +12356,8 @@ ix86_expand_int_vcond (rtx operands[]) tricks to turn this into a signed comparison against 0. */ if (code == GTU) { + cop0 = force_reg (mode, cop0); + switch (mode) { case V4SImode: @@ -11470,6 +12412,52 @@ ix86_expand_int_vcond (rtx operands[]) return true; } +/* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is + true if we should do zero extension, else sign extension. HIGH_P is + true if we want the N/2 high elements, else the low elements. 
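   In SSE2 terms the interleave supplies the high half of each widened
   element: zero for zero extension, or a byte-wise sign mask (the result
   of comparing 0 > x) for sign extension.  A sketch of the V16QI
   low-half case with intrinsics (hypothetical names):

     #include <emmintrin.h>

     static __m128i
     unpack_lo_u8_to_u16 (__m128i v)      // zero extension
     {
       return _mm_unpacklo_epi8 (v, _mm_setzero_si128 ());
     }

     static __m128i
     unpack_lo_s8_to_s16 (__m128i v)      // sign extension
     {
       __m128i se = _mm_cmpgt_epi8 (_mm_setzero_si128 (), v);  // 0xff where v < 0
       return _mm_unpacklo_epi8 (v, se);
     }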
*/ + +void +ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p) +{ + enum machine_mode imode = GET_MODE (operands[1]); + rtx (*unpack)(rtx, rtx, rtx); + rtx se, dest; + + switch (imode) + { + case V16QImode: + if (high_p) + unpack = gen_vec_interleave_highv16qi; + else + unpack = gen_vec_interleave_lowv16qi; + break; + case V8HImode: + if (high_p) + unpack = gen_vec_interleave_highv8hi; + else + unpack = gen_vec_interleave_lowv8hi; + break; + case V4SImode: + if (high_p) + unpack = gen_vec_interleave_highv4si; + else + unpack = gen_vec_interleave_lowv4si; + break; + default: + gcc_unreachable (); + } + + dest = gen_lowpart (imode, operands[0]); + + if (unsigned_p) + se = force_reg (imode, CONST0_RTX (imode)); + else + se = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode), + operands[1], pc_rtx, pc_rtx); + + emit_insn (unpack (dest, operands[1], se)); +} + /* Expand conditional increment or decrement using adb/sbb instructions. The default case using setcc followed by the conditional move can be done by generic code. */ @@ -11569,25 +12557,25 @@ ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode) else size = (GET_MODE_SIZE (mode) + 4) / 8; - gcc_assert (GET_CODE (operand) != REG || !MMX_REGNO_P (REGNO (operand))); + gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand))); gcc_assert (size >= 2 && size <= 3); /* Optimize constant pool reference to immediates. This is used by fp moves, that force all constants to memory to allow combining. */ - if (GET_CODE (operand) == MEM && MEM_READONLY_P (operand)) + if (MEM_P (operand) && MEM_READONLY_P (operand)) { rtx tmp = maybe_get_pool_constant (operand); if (tmp) operand = tmp; } - if (GET_CODE (operand) == MEM && !offsettable_memref_p (operand)) + if (MEM_P (operand) && !offsettable_memref_p (operand)) { /* The only non-offsetable memories we handle are pushes. */ int ok = push_operand (operand, VOIDmode); - + gcc_assert (ok); - + operand = copy_rtx (operand); PUT_MODE (operand, Pmode); parts[0] = parts[1] = parts[2] = operand; @@ -11730,7 +12718,7 @@ ix86_split_long_move (rtx operands[]) /* Optimize constant pool reference to immediates. This is used by fp moves, that force all constants to memory to allow combining. */ - if (GET_CODE (operands[1]) == MEM + if (MEM_P (operands[1]) && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0))) operands[1] = get_pool_constant (XEXP (operands[1], 0)); @@ -11750,14 +12738,14 @@ ix86_split_long_move (rtx operands[]) if (push_operand (operands[0], VOIDmode)) push = 1; else - gcc_assert (GET_CODE (operands[0]) != MEM + gcc_assert (!MEM_P (operands[0]) || offsettable_memref_p (operands[0])); nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0])); ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0])); /* When emitting push, take care for source operands on the stack. */ - if (push && GET_CODE (operands[1]) == MEM + if (push && MEM_P (operands[1]) && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1])) { if (nparts == 3) @@ -11769,7 +12757,7 @@ ix86_split_long_move (rtx operands[]) /* We need to do copy in the right order in case an address register of the source overlaps the destination. 
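   The hazard is the usual overlapping-copy one, only the overlap goes
   through an address register.  Loosely analogous in plain C, where
   writing the overlapping destination word first would corrupt a source
   word that is still needed (copy2 is a hypothetical helper modeling the
   two-part case):

     #include <stdint.h>

     static void
     copy2 (uint32_t *dst, const uint32_t *src)
     {
       if (dst == src + 1)   // dst[0] aliases src[1]: write it last
         {
           dst[1] = src[1];
           dst[0] = src[0];
         }
       else
         {
           dst[0] = src[0];
           dst[1] = src[1];
         }
     }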
*/ - if (REG_P (part[0][0]) && GET_CODE (part[1][0]) == MEM) + if (REG_P (part[0][0]) && MEM_P (part[1][0])) { if (reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))) collisions++; @@ -11845,7 +12833,7 @@ ix86_split_long_move (rtx operands[]) default: gcc_unreachable (); } - + if (GET_MODE (part[1][0]) == SImode) part[1][0] = part[1][1]; } @@ -11904,25 +12892,25 @@ ix86_split_long_move (rtx operands[]) /* If optimizing for size, attempt to locally unCSE nonzero constants. */ if (optimize_size) { - if (GET_CODE (operands[5]) == CONST_INT + if (CONST_INT_P (operands[5]) && operands[5] != const0_rtx && REG_P (operands[2])) { - if (GET_CODE (operands[6]) == CONST_INT + if (CONST_INT_P (operands[6]) && INTVAL (operands[6]) == INTVAL (operands[5])) operands[6] = operands[2]; if (nparts == 3 - && GET_CODE (operands[7]) == CONST_INT + && CONST_INT_P (operands[7]) && INTVAL (operands[7]) == INTVAL (operands[5])) operands[7] = operands[2]; } if (nparts == 3 - && GET_CODE (operands[6]) == CONST_INT + && CONST_INT_P (operands[6]) && operands[6] != const0_rtx && REG_P (operands[3]) - && GET_CODE (operands[7]) == CONST_INT + && CONST_INT_P (operands[7]) && INTVAL (operands[7]) == INTVAL (operands[6])) operands[7] = operands[3]; } @@ -11972,7 +12960,7 @@ ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode) int count; const int single_width = mode == DImode ? 32 : 64; - if (GET_CODE (operands[2]) == CONST_INT) + if (CONST_INT_P (operands[2])) { (mode == DImode ? split_di : split_ti) (operands, 2, low, high); count = INTVAL (operands[2]) & (single_width * 2 - 1); @@ -12010,7 +12998,7 @@ ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode) ix86_expand_clear (low[0]); ix86_expand_clear (high[0]); emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (single_width))); - + d = gen_lowpart (QImode, low[0]); d = gen_rtx_STRICT_LOW_PART (VOIDmode, d); s = gen_rtx_EQ (QImode, flags, const0_rtx); @@ -12099,7 +13087,7 @@ ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode) int count; const int single_width = mode == DImode ? 32 : 64; - if (GET_CODE (operands[2]) == CONST_INT) + if (CONST_INT_P (operands[2])) { (mode == DImode ? split_di : split_ti) (operands, 2, low, high); count = INTVAL (operands[2]) & (single_width * 2 - 1); @@ -12178,7 +13166,7 @@ ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode) int count; const int single_width = mode == DImode ? 32 : 64; - if (GET_CODE (operands[2]) == CONST_INT) + if (CONST_INT_P (operands[2])) { (mode == DImode ? split_di : split_ti) (operands, 2, low, high); count = INTVAL (operands[2]) & (single_width * 2 - 1); @@ -12234,10 +13222,22 @@ ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode) } } +/* Predict just emitted jump instruction to be taken with probability PROB. */ +static void +predict_jump (int prob) +{ + rtx insn = get_last_insn (); + gcc_assert (JUMP_P (insn)); + REG_NOTES (insn) + = gen_rtx_EXPR_LIST (REG_BR_PROB, + GEN_INT (prob), + REG_NOTES (insn)); +} + /* Helper function for the string operations below. Dest VARIABLE whether it is aligned to VALUE bytes. If true, jump to the label. 
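   VALUE is a single power-of-two bit, so the emitted sequence is one AND
   plus a conditional jump, which the callers then annotate through
   predict_jump (50% taken in epilogues, 90% in prologues below).  The
   test itself, as a C predicate (hypothetical name):

     #include <stdint.h>

     static int
     aligned_bit_clear_p (uintptr_t variable, uintptr_t value)  // value: 1, 2, 4, ...
     {
       return (variable & value) == 0;    // and; je <label>
     }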
*/ static rtx -ix86_expand_aligntest (rtx variable, int value) +ix86_expand_aligntest (rtx variable, int value, bool epilogue) { rtx label = gen_label_rtx (); rtx tmpcount = gen_reg_rtx (GET_MODE (variable)); @@ -12247,6 +13247,10 @@ ix86_expand_aligntest (rtx variable, int value) emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value))); emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable), 1, label); + if (epilogue) + predict_jump (REG_BR_PROB_BASE * 50 / 100); + else + predict_jump (REG_BR_PROB_BASE * 90 / 100); return label; } @@ -12274,581 +13278,1291 @@ ix86_zero_extend_to_Pmode (rtx exp) return r; } -/* Expand string move (memcpy) operation. Use i386 string operations when - profitable. expand_clrmem contains similar code. */ -int -ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp) -{ - rtx srcreg, destreg, countreg, srcexp, destexp; - enum machine_mode counter_mode; - HOST_WIDE_INT align = 0; - unsigned HOST_WIDE_INT count = 0; +/* Divide COUNTREG by SCALE. */ +static rtx +scale_counter (rtx countreg, int scale) +{ + rtx sc; + rtx piece_size_mask; + + if (scale == 1) + return countreg; + if (CONST_INT_P (countreg)) + return GEN_INT (INTVAL (countreg) / scale); + gcc_assert (REG_P (countreg)); + + piece_size_mask = GEN_INT (scale - 1); + sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg, + GEN_INT (exact_log2 (scale)), + NULL, 1, OPTAB_DIRECT); + return sc; +} - if (GET_CODE (align_exp) == CONST_INT) - align = INTVAL (align_exp); +/* When SRCPTR is non-NULL, output simple loop to move memory + pointer to SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times, + overall size is COUNT specified in bytes. When SRCPTR is NULL, output the + equivalent loop to set memory by VALUE (supposed to be in MODE). - /* Can't use any of this if the user has appropriated esi or edi. */ - if (global_regs[4] || global_regs[5]) - return 0; + The size is rounded down to whole number of chunk size moved at once. + SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */ + - /* This simple hack avoids all inlining code and simplifies code below. */ - if (!TARGET_ALIGN_STRINGOPS) - align = 64; +static void +expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, rtx value, + rtx count, enum machine_mode mode, int unroll, + int expected_size) +{ + rtx out_label, top_label, iter, tmp; + enum machine_mode iter_mode; + rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll); + rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1)); + rtx size; + rtx x_addr; + rtx y_addr; + int i; + + iter_mode = GET_MODE (count); + if (iter_mode == VOIDmode) + iter_mode = word_mode; - if (GET_CODE (count_exp) == CONST_INT) + top_label = gen_label_rtx (); + out_label = gen_label_rtx (); + iter = gen_reg_rtx (iter_mode); + + size = expand_simple_binop (iter_mode, AND, count, piece_size_mask, + NULL, 1, OPTAB_DIRECT); + /* Those two should combine. */ + if (piece_size == const1_rtx) { - count = INTVAL (count_exp); - if (!TARGET_INLINE_ALL_STRINGOPS && count > 64) - return 0; + emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode, + true, out_label); + predict_jump (REG_BR_PROB_BASE * 10 / 100); } + emit_move_insn (iter, const0_rtx); - /* Figure out proper mode for counter. For 32bits it is always SImode, - for 64bits use SImode when possible, otherwise DImode. - Set count to number of bytes copied when known at compile time. 
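   A scalar model of the mover above: the byte count is rounded down to a
   whole number of UNROLL-times-mode-size chunks (the mask computation
   assumes that product is a power of two), the loop body is unrolled,
   and the leftover bytes are left to the epilogue code.  Hypothetical
   names throughout:

     #include <stddef.h>
     #include <string.h>

     static size_t
     copy_unrolled (char *dst, const char *src, size_t count,
                    size_t mode_size, int unroll)
     {
       size_t piece = mode_size * unroll;   // GET_MODE_SIZE (mode) * unroll
       size_t size = count & ~(piece - 1);  // whole chunks only
       size_t iter;
       int i;

       for (iter = 0; iter < size; iter += piece)
         for (i = 0; i < unroll; i++)       // unrolled body
           memcpy (dst + iter + i * mode_size,
                   src + iter + i * mode_size, mode_size);
       return size;                         // both pointers advance by this much
     }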
*/ - if (!TARGET_64BIT - || GET_MODE (count_exp) == SImode - || x86_64_zext_immediate_operand (count_exp, VOIDmode)) - counter_mode = SImode; - else - counter_mode = DImode; + emit_label (top_label); - gcc_assert (counter_mode == SImode || counter_mode == DImode); + tmp = convert_modes (Pmode, iter_mode, iter, true); + x_addr = gen_rtx_PLUS (Pmode, destptr, tmp); + destmem = change_address (destmem, mode, x_addr); - destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0)); - if (destreg != XEXP (dst, 0)) - dst = replace_equiv_address_nv (dst, destreg); - srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0)); - if (srcreg != XEXP (src, 0)) - src = replace_equiv_address_nv (src, srcreg); - - /* When optimizing for size emit simple rep ; movsb instruction for - counts not divisible by 4, except when (movsl;)*(movsw;)?(movsb;)? - sequence is shorter than mov{b,l} $count, %{ecx,cl}; rep; movsb. - Sice of (movsl;)*(movsw;)?(movsb;)? sequence is - count / 4 + (count & 3), the other sequence is either 4 or 7 bytes, - but we don't know whether upper 24 (resp. 56) bits of %ecx will be - known to be zero or not. The rep; movsb sequence causes higher - register pressure though, so take that into account. */ - - if ((!optimize || optimize_size) - && (count == 0 - || ((count & 0x03) - && (!optimize_size - || count > 5 * 4 - || (count & 3) + count / 4 > 6)))) - { - emit_insn (gen_cld ()); - countreg = ix86_zero_extend_to_Pmode (count_exp); - destexp = gen_rtx_PLUS (Pmode, destreg, countreg); - srcexp = gen_rtx_PLUS (Pmode, srcreg, countreg); - emit_insn (gen_rep_mov (destreg, dst, srcreg, src, countreg, - destexp, srcexp)); - } - - /* For constant aligned (or small unaligned) copies use rep movsl - followed by code copying the rest. For PentiumPro ensure 8 byte - alignment to allow rep movsl acceleration. */ - - else if (count != 0 - && (align >= 8 - || (!TARGET_PENTIUMPRO && !TARGET_64BIT && align >= 4) - || optimize_size || count < (unsigned int) 64)) - { - unsigned HOST_WIDE_INT offset = 0; - int size = TARGET_64BIT && !optimize_size ? 8 : 4; - rtx srcmem, dstmem; - - emit_insn (gen_cld ()); - if (count & ~(size - 1)) - { - if ((TARGET_SINGLE_STRINGOP || optimize_size) && count < 5 * 4) - { - enum machine_mode movs_mode = size == 4 ? SImode : DImode; + if (srcmem) + { + y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp)); + srcmem = change_address (srcmem, mode, y_addr); - while (offset < (count & ~(size - 1))) + /* When unrolling for chips that reorder memory reads and writes, + we can save registers by using single temporary. + Also using 4 temporaries is overkill in 32bit mode. */ + if (!TARGET_64BIT && 0) + { + for (i = 0; i < unroll; i++) + { + if (i) { - srcmem = adjust_automodify_address_nv (src, movs_mode, - srcreg, offset); - dstmem = adjust_automodify_address_nv (dst, movs_mode, - destreg, offset); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); - offset += size; + destmem = + adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode)); + srcmem = + adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode)); } + emit_move_insn (destmem, srcmem); } - else - { - countreg = GEN_INT ((count >> (size == 4 ? 2 : 3)) - & (TARGET_64BIT ? -1 : 0x3fffffff)); - countreg = copy_to_mode_reg (counter_mode, countreg); - countreg = ix86_zero_extend_to_Pmode (countreg); - - destexp = gen_rtx_ASHIFT (Pmode, countreg, - GEN_INT (size == 4 ? 
2 : 3)); - srcexp = gen_rtx_PLUS (Pmode, destexp, srcreg); - destexp = gen_rtx_PLUS (Pmode, destexp, destreg); - - emit_insn (gen_rep_mov (destreg, dst, srcreg, src, - countreg, destexp, srcexp)); - offset = count & ~(size - 1); - } - } - if (size == 8 && (count & 0x04)) - { - srcmem = adjust_automodify_address_nv (src, SImode, srcreg, - offset); - dstmem = adjust_automodify_address_nv (dst, SImode, destreg, - offset); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); - offset += 4; - } - if (count & 0x02) - { - srcmem = adjust_automodify_address_nv (src, HImode, srcreg, - offset); - dstmem = adjust_automodify_address_nv (dst, HImode, destreg, - offset); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); - offset += 2; } - if (count & 0x01) + else { - srcmem = adjust_automodify_address_nv (src, QImode, srcreg, - offset); - dstmem = adjust_automodify_address_nv (dst, QImode, destreg, - offset); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); + rtx tmpreg[4]; + gcc_assert (unroll <= 4); + for (i = 0; i < unroll; i++) + { + tmpreg[i] = gen_reg_rtx (mode); + if (i) + { + srcmem = + adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode)); + } + emit_move_insn (tmpreg[i], srcmem); + } + for (i = 0; i < unroll; i++) + { + if (i) + { + destmem = + adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode)); + } + emit_move_insn (destmem, tmpreg[i]); + } } } - /* The generic code based on the glibc implementation: - - align destination to 4 bytes (8 byte alignment is used for PentiumPro - allowing accelerated copying there) - - copy the data using rep movsl - - copy the rest. */ else - { - rtx countreg2; - rtx label = NULL; - rtx srcmem, dstmem; - int desired_alignment = (TARGET_PENTIUMPRO - && (count == 0 || count >= (unsigned int) 260) - ? 8 : UNITS_PER_WORD); - /* Get rid of MEM_OFFSETs, they won't be accurate. */ - dst = change_address (dst, BLKmode, destreg); - src = change_address (src, BLKmode, srcreg); + for (i = 0; i < unroll; i++) + { + if (i) + destmem = + adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode)); + emit_move_insn (destmem, value); + } - /* In case we don't know anything about the alignment, default to - library version, since it is usually equally fast and result in - shorter code. + tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter, + true, OPTAB_LIB_WIDEN); + if (tmp != iter) + emit_move_insn (iter, tmp); - Also emit call when we know that the count is large and call overhead - will not be important. 
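   The tmpreg[] block above issues all of the loads before any of the
   stores; that costs one temporary per unrolled iteration but keeps
   every load independent of the preceding stores, which helps chips that
   do not reorder memory operations on their own.  For unroll == 2 the
   two shapes look like this in C (hypothetical names):

     #include <stdint.h>

     static void
     copy2_batched (uint32_t *d, const uint32_t *s)
     {
       uint32_t t0 = s[0], t1 = s[1];   // all loads first...
       d[0] = t0;                       // ...then all stores
       d[1] = t1;
     }

     static void
     copy2_interleaved (uint32_t *d, const uint32_t *s)
     {
       uint32_t t;
       t = s[0]; d[0] = t;              // one temporary, load/store pairs
       t = s[1]; d[1] = t;
     }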
*/ - if (!TARGET_INLINE_ALL_STRINGOPS - && (align < UNITS_PER_WORD || !TARGET_REP_MOVL_OPTIMAL)) - return 0; + emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode, + true, top_label); + if (expected_size != -1) + { + expected_size /= GET_MODE_SIZE (mode) * unroll; + if (expected_size == 0) + predict_jump (0); + else if (expected_size > REG_BR_PROB_BASE) + predict_jump (REG_BR_PROB_BASE - 1); + else + predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size); + } + else + predict_jump (REG_BR_PROB_BASE * 80 / 100); + iter = ix86_zero_extend_to_Pmode (iter); + tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr, + true, OPTAB_LIB_WIDEN); + if (tmp != destptr) + emit_move_insn (destptr, tmp); + if (srcptr) + { + tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr, + true, OPTAB_LIB_WIDEN); + if (tmp != srcptr) + emit_move_insn (srcptr, tmp); + } + emit_label (out_label); +} - if (TARGET_SINGLE_STRINGOP) - emit_insn (gen_cld ()); +/* Output "rep; mov" instruction. + Arguments have same meaning as for previous function */ +static void +expand_movmem_via_rep_mov (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, + rtx count, + enum machine_mode mode) +{ + rtx destexp; + rtx srcexp; + rtx countreg; + + /* If the size is known, it is shorter to use rep movs. */ + if (mode == QImode && CONST_INT_P (count) + && !(INTVAL (count) & 3)) + mode = SImode; + + if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode) + destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0); + if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode) + srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0); + countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode))); + if (mode != QImode) + { + destexp = gen_rtx_ASHIFT (Pmode, countreg, + GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); + destexp = gen_rtx_PLUS (Pmode, destexp, destptr); + srcexp = gen_rtx_ASHIFT (Pmode, countreg, + GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); + srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr); + } + else + { + destexp = gen_rtx_PLUS (Pmode, destptr, countreg); + srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg); + } + emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg, + destexp, srcexp)); +} - countreg2 = gen_reg_rtx (Pmode); - countreg = copy_to_mode_reg (counter_mode, count_exp); +/* Output "rep; stos" instruction. + Arguments have same meaning as for previous function */ +static void +expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value, + rtx count, + enum machine_mode mode) +{ + rtx destexp; + rtx countreg; - /* We don't use loops to align destination and to copy parts smaller - than 4 bytes, because gcc is able to optimize such code better (in - the case the destination or the count really is aligned, gcc is often - able to predict the branches) and also it is friendlier to the - hardware branch prediction. 
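   For a compile-time count the tail fits in at most one move of each
   power-of-two size below the chunk size, which is what the countval
   tests above emit.  The 64-bit shape of that epilogue, as a C sketch
   (hypothetical name):

     #include <stddef.h>
     #include <string.h>

     static void
     copy_tail (char *dst, const char *src, size_t count)  // tail after whole chunks
     {
       size_t off = 0;

       if (count & 8) { memcpy (dst + off, src + off, 8); off += 8; }
       if (count & 4) { memcpy (dst + off, src + off, 4); off += 4; }
       if (count & 2) { memcpy (dst + off, src + off, 2); off += 2; }
       if (count & 1) { memcpy (dst + off, src + off, 1); off += 1; }
     }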
+ if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode) + destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0); + value = force_reg (mode, gen_lowpart (mode, value)); + countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode))); + if (mode != QImode) + { + destexp = gen_rtx_ASHIFT (Pmode, countreg, + GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); + destexp = gen_rtx_PLUS (Pmode, destexp, destptr); + } + else + destexp = gen_rtx_PLUS (Pmode, destptr, countreg); + emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp)); +} - Using loops is beneficial for generic case, because we can - handle small counts using the loops. Many CPUs (such as Athlon) - have large REP prefix setup costs. +static void +emit_strmov (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, enum machine_mode mode, int offset) +{ + rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset); + rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset); + emit_insn (gen_strmov (destptr, dest, srcptr, src)); +} - This is quite costly. Maybe we can revisit this decision later or - add some customizability to this code. */ +/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */ +static void +expand_movmem_epilogue (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, rtx count, int max_size) +{ + rtx src, dest; + if (CONST_INT_P (count)) + { + HOST_WIDE_INT countval = INTVAL (count); + int offset = 0; - if (count == 0 && align < desired_alignment) + if ((countval & 0x16) && max_size > 16) { - label = gen_label_rtx (); - emit_cmp_and_jump_insns (countreg, GEN_INT (desired_alignment - 1), - LEU, 0, counter_mode, 1, label); + if (TARGET_64BIT) + { + emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset); + emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8); + } + else + gcc_unreachable (); + offset += 16; } - if (align <= 1) + if ((countval & 0x08) && max_size > 8) { - rtx label = ix86_expand_aligntest (destreg, 1); - srcmem = change_address (src, QImode, srcreg); - dstmem = change_address (dst, QImode, destreg); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); - ix86_adjust_counter (countreg, 1); + if (TARGET_64BIT) + emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset); + else + { + emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset); + emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 4); + } + offset += 8; + } + if ((countval & 0x04) && max_size > 4) + { + emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset); + offset += 4; + } + if ((countval & 0x02) && max_size > 2) + { + emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset); + offset += 2; + } + if ((countval & 0x01) && max_size > 1) + { + emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset); + offset += 1; + } + return; + } + if (max_size > 8) + { + count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1), + count, 1, OPTAB_DIRECT); + expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL, + count, QImode, 1, 4); + return; + } + + /* When there are stringops, we can cheaply increase dest and src pointers. + Otherwise we save code size by maintaining offset (zero is readily + available from preceding rep operation) and using x86 addressing modes. 
+ */ + if (TARGET_SINGLE_STRINGOP) + { + if (max_size > 4) + { + rtx label = ix86_expand_aligntest (count, 4, true); + src = change_address (srcmem, SImode, srcptr); + dest = change_address (destmem, SImode, destptr); + emit_insn (gen_strmov (destptr, dest, srcptr, src)); emit_label (label); LABEL_NUSES (label) = 1; } - if (align <= 2) + if (max_size > 2) { - rtx label = ix86_expand_aligntest (destreg, 2); - srcmem = change_address (src, HImode, srcreg); - dstmem = change_address (dst, HImode, destreg); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); - ix86_adjust_counter (countreg, 2); + rtx label = ix86_expand_aligntest (count, 2, true); + src = change_address (srcmem, HImode, srcptr); + dest = change_address (destmem, HImode, destptr); + emit_insn (gen_strmov (destptr, dest, srcptr, src)); emit_label (label); LABEL_NUSES (label) = 1; } - if (align <= 4 && desired_alignment > 4) + if (max_size > 1) { - rtx label = ix86_expand_aligntest (destreg, 4); - srcmem = change_address (src, SImode, srcreg); - dstmem = change_address (dst, SImode, destreg); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); - ix86_adjust_counter (countreg, 4); + rtx label = ix86_expand_aligntest (count, 1, true); + src = change_address (srcmem, QImode, srcptr); + dest = change_address (destmem, QImode, destptr); + emit_insn (gen_strmov (destptr, dest, srcptr, src)); emit_label (label); LABEL_NUSES (label) = 1; } + } + else + { + rtx offset = force_reg (Pmode, const0_rtx); + rtx tmp; - if (label && desired_alignment > 4 && !TARGET_64BIT) + if (max_size > 4) { + rtx label = ix86_expand_aligntest (count, 4, true); + src = change_address (srcmem, SImode, srcptr); + dest = change_address (destmem, SImode, destptr); + emit_move_insn (dest, src); + tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL, + true, OPTAB_LIB_WIDEN); + if (tmp != offset) + emit_move_insn (offset, tmp); emit_label (label); LABEL_NUSES (label) = 1; - label = NULL_RTX; } - if (!TARGET_SINGLE_STRINGOP) - emit_insn (gen_cld ()); - if (TARGET_64BIT) - { - emit_insn (gen_lshrdi3 (countreg2, ix86_zero_extend_to_Pmode (countreg), - GEN_INT (3))); - destexp = gen_rtx_ASHIFT (Pmode, countreg2, GEN_INT (3)); + if (max_size > 2) + { + rtx label = ix86_expand_aligntest (count, 2, true); + tmp = gen_rtx_PLUS (Pmode, srcptr, offset); + src = change_address (srcmem, HImode, tmp); + tmp = gen_rtx_PLUS (Pmode, destptr, offset); + dest = change_address (destmem, HImode, tmp); + emit_move_insn (dest, src); + tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp, + true, OPTAB_LIB_WIDEN); + if (tmp != offset) + emit_move_insn (offset, tmp); + emit_label (label); + LABEL_NUSES (label) = 1; } - else + if (max_size > 1) { - emit_insn (gen_lshrsi3 (countreg2, countreg, const2_rtx)); - destexp = gen_rtx_ASHIFT (Pmode, countreg2, const2_rtx); + rtx label = ix86_expand_aligntest (count, 1, true); + tmp = gen_rtx_PLUS (Pmode, srcptr, offset); + src = change_address (srcmem, QImode, tmp); + tmp = gen_rtx_PLUS (Pmode, destptr, offset); + dest = change_address (destmem, QImode, tmp); + emit_move_insn (dest, src); + emit_label (label); + LABEL_NUSES (label) = 1; } - srcexp = gen_rtx_PLUS (Pmode, destexp, srcreg); - destexp = gen_rtx_PLUS (Pmode, destexp, destreg); - emit_insn (gen_rep_mov (destreg, dst, srcreg, src, - countreg2, destexp, srcexp)); + } +} - if (label) +/* Output code to set at most count & (max_size - 1) bytes starting by DEST. 
-  if (label)

+/* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
+static void
+expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
+				 rtx count, int max_size)
+{
+  count =
+    expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
+			 count, 1, OPTAB_DIRECT);
+  expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
+				 gen_lowpart (QImode, value), count, QImode,
+				 1, max_size / 2);
+}
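/* Illustration: when COUNT is only known at run time, the bit tests cannot be
   folded, so expand_setmem_epilogue (defined below) emits a branch tree; each
   ix86_expand_aligntest call becomes "test one bit of COUNT, conditionally do
   one store".  A hedged C model, assuming 64-bit and max_size == 16:  */

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void
setmem_epilogue_model (unsigned char *dst, unsigned char value, size_t count)
{
  uint64_t pattern = value * 0x0101010101010101ULL;	/* promoted VAL */

  dst += count & ~(size_t) 15;	/* tail left over by the main loop */
  if (count & 8) { memcpy (dst, &pattern, 8); dst += 8; }
  if (count & 4) { memcpy (dst, &pattern, 4); dst += 4; }
  if (count & 2) { memcpy (dst, &pattern, 2); dst += 2; }
  if (count & 1) *dst = value;
}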
+/* Output code to set at most count & (max_size - 1) bytes starting at DEST.  */
+static void
+expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
+{
+  rtx dest;
+
+  if (CONST_INT_P (count))
+    {
+      HOST_WIDE_INT countval = INTVAL (count);
+      int offset = 0;
+
+      if ((countval & 0x10) && max_size > 16)
 	{
-	  emit_label (label);
-	  LABEL_NUSES (label) = 1;
+	  if (TARGET_64BIT)
+	    {
+	      dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
+	      emit_insn (gen_strset (destptr, dest, value));
+	      dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
+	      emit_insn (gen_strset (destptr, dest, value));
+	    }
+	  else
+	    gcc_unreachable ();
+	  offset += 16;
 	}
-  if (TARGET_64BIT && align > 4 && count != 0 && (count & 4))
+      if ((countval & 0x08) && max_size > 8)
 	{
-      srcmem = change_address (src, SImode, srcreg);
-      dstmem = change_address (dst, SImode, destreg);
-      emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem));
+	  if (TARGET_64BIT)
+	    {
+	      dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
+	      emit_insn (gen_strset (destptr, dest, value));
+	    }
+	  else
+	    {
+	      dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
+	      emit_insn (gen_strset (destptr, dest, value));
+	      dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
+	      emit_insn (gen_strset (destptr, dest, value));
+	    }
+	  offset += 8;
 	}
-  if ((align <= 4 || count == 0) && TARGET_64BIT)
+      if ((countval & 0x04) && max_size > 4)
 	{
-      rtx label = ix86_expand_aligntest (countreg, 4);
-      srcmem = change_address (src, SImode, srcreg);
-      dstmem = change_address (dst, SImode, destreg);
-      emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem));
-      emit_label (label);
-      LABEL_NUSES (label) = 1;
+	  dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
+	  emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
+	  offset += 4;
+	}
+      if ((countval & 0x02) && max_size > 2)
+	{
+	  dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
+	  emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
+	  offset += 2;
 	}
-  if (align > 2 && count != 0 && (count & 2))
+      if ((countval & 0x01) && max_size > 1)
 	{
-      srcmem = change_address (src, HImode, srcreg);
-      dstmem = change_address (dst, HImode, destreg);
-      emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem));
+	  dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
+	  emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
+	  offset += 1;
 	}
-  if (align <= 2 || count == 0)
+      return;
+    }
+  if (max_size > 32)
+    {
+      expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
+      return;
+    }
+  if (max_size > 16)
+    {
+      rtx label = ix86_expand_aligntest (count, 16, true);
+      if (TARGET_64BIT)
 	{
-      rtx label = ix86_expand_aligntest (countreg, 2);
-      srcmem = change_address (src, HImode, srcreg);
-      dstmem = change_address (dst, HImode, destreg);
-      emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem));
-      emit_label (label);
-      LABEL_NUSES (label) = 1;
+	  dest = change_address (destmem, DImode, destptr);
+	  emit_insn (gen_strset (destptr, dest, value));
+	  emit_insn (gen_strset (destptr, dest, value));
 	}
-  if (align > 1 && count != 0 && (count & 1))
+      else
 	{
-      srcmem = change_address (src, QImode, srcreg);
-      dstmem = change_address (dst, QImode, destreg);
-      emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem));
+	  dest = change_address (destmem, SImode, destptr);
+	  emit_insn (gen_strset (destptr, dest, value));
+	  emit_insn (gen_strset (destptr, dest, value));
+	  emit_insn (gen_strset (destptr, dest, value));
+	  emit_insn (gen_strset (destptr, dest, value));
 	}
-  if (align <= 1 || count == 0)
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+  if (max_size > 8)
+    {
+      rtx label = ix86_expand_aligntest (count, 8, true);
+      if (TARGET_64BIT)
 	{
-      rtx label = ix86_expand_aligntest (countreg, 1);
-      srcmem = change_address (src, QImode, srcreg);
-      dstmem = change_address (dst, QImode, destreg);
-      emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem));
-      emit_label (label);
-      LABEL_NUSES (label) = 1;
+	  dest = change_address (destmem, DImode, destptr);
+	  emit_insn (gen_strset (destptr, dest, value));
+	}
+      else
+	{
+	  dest = change_address (destmem, SImode, destptr);
+	  emit_insn (gen_strset (destptr, dest, value));
+	  emit_insn (gen_strset (destptr, dest, value));
 	}
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+  if (max_size > 4)
+    {
+      rtx label = ix86_expand_aligntest (count, 4, true);
+      dest = change_address (destmem, SImode, destptr);
+      emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+  if (max_size > 2)
+    {
+      rtx label = ix86_expand_aligntest (count, 2, true);
+      dest = change_address (destmem, HImode, destptr);
+      emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+  if (max_size > 1)
+    {
+      rtx label = ix86_expand_aligntest (count, 1, true);
+      dest = change_address (destmem, QImode, destptr);
+      emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+}

-  return 1;
+/* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN,
+   to DESIRED_ALIGNMENT.  */
+static void
+expand_movmem_prologue (rtx destmem, rtx srcmem,
+			rtx destptr, rtx srcptr, rtx count,
+			int align, int desired_alignment)
+{
+  if (align <= 1 && desired_alignment > 1)
+    {
+      rtx label = ix86_expand_aligntest (destptr, 1, false);
+      srcmem = change_address (srcmem, QImode, srcptr);
+      destmem = change_address (destmem, QImode, destptr);
+      emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
+      ix86_adjust_counter (count, 1);
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+  if (align <= 2 && desired_alignment > 2)
+    {
+      rtx label = ix86_expand_aligntest (destptr, 2, false);
+      srcmem = change_address (srcmem, HImode, srcptr);
+      destmem = change_address (destmem, HImode, destptr);
+      emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
+      ix86_adjust_counter (count, 2);
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+  if (align <= 4 && desired_alignment > 4)
+    {
+      rtx label = ix86_expand_aligntest (destptr, 4, false);
+      srcmem = change_address (srcmem, SImode, srcptr);
+      destmem = change_address (destmem, SImode, destptr);
+      emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
+      ix86_adjust_counter (count, 4);
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+  gcc_assert (desired_alignment <= 8);
+}
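/* Illustration: what the alignment prologue above does at run time, as a
   hedged C model (not part of the patch).  One conditional copy per address
   bit peels 1, 2 and then 4 bytes until DST reaches the desired 8-byte
   alignment; COUNT shrinks accordingly, which is why the expander wants it
   in a register.  */

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void
movmem_prologue_model (char **dst, const char **src, size_t *count)
{
  if ((uintptr_t) *dst & 1)
    { **dst = **src; *dst += 1; *src += 1; *count -= 1; }
  if ((uintptr_t) *dst & 2)
    { memcpy (*dst, *src, 2); *dst += 2; *src += 2; *count -= 2; }
  if ((uintptr_t) *dst & 4)
    { memcpy (*dst, *src, 4); *dst += 4; *src += 4; *count -= 4; }
  /* *dst is now 8-byte aligned, assuming the caller guaranteed
     *count >= 7 the way the prologue guard does.  */
}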
-/* Expand string clear operation (bzero).  Use i386 string operations when
-   profitable.  expand_movmem contains similar code.  */

+/* Set enough bytes of DEST to align DEST, known to be aligned by ALIGN,
+   to DESIRED_ALIGNMENT.  */
+static void
+expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
+			int align, int desired_alignment)
+{
+  if (align <= 1 && desired_alignment > 1)
+    {
+      rtx label = ix86_expand_aligntest (destptr, 1, false);
+      destmem = change_address (destmem, QImode, destptr);
+      emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
+      ix86_adjust_counter (count, 1);
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+  if (align <= 2 && desired_alignment > 2)
+    {
+      rtx label = ix86_expand_aligntest (destptr, 2, false);
+      destmem = change_address (destmem, HImode, destptr);
+      emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
+      ix86_adjust_counter (count, 2);
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+  if (align <= 4 && desired_alignment > 4)
+    {
+      rtx label = ix86_expand_aligntest (destptr, 4, false);
+      destmem = change_address (destmem, SImode, destptr);
+      emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
+      ix86_adjust_counter (count, 4);
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+  gcc_assert (desired_alignment <= 8);
+}
+
+/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
+static enum stringop_alg
+decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
+	    int *dynamic_check)
+{
+  const struct stringop_algs * algs;
+
+  *dynamic_check = -1;
+  if (memset)
+    algs = &ix86_cost->memset[TARGET_64BIT != 0];
+  else
+    algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
+  if (stringop_alg != no_stringop)
+    return stringop_alg;
+  /* rep; movq or rep; movl is the smallest variant.  */
+  else if (optimize_size)
+    {
+      if (!count || (count & 3))
+	return rep_prefix_1_byte;
+      else
+	return rep_prefix_4_byte;
+    }
+  /* Very tiny blocks are best handled via the loop; REP is expensive to
+     set up.  */
+  else if (expected_size != -1 && expected_size < 4)
+    return loop_1_byte;
+  else if (expected_size != -1)
+    {
+      unsigned int i;
+      enum stringop_alg alg = libcall;
+      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
+	{
+	  gcc_assert (algs->size[i].max);
+	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
+	    {
+	      if (algs->size[i].alg != libcall)
+		alg = algs->size[i].alg;
+	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking the
+		 last non-libcall inline algorithm.  */
+	      if (TARGET_INLINE_ALL_STRINGOPS)
+		{
+		  /* When the current size is best copied by a libcall,
+		     but we are still forced to inline, run the heuristic
+		     below that will pick code for medium sized blocks.  */
+		  if (alg != libcall)
+		    return alg;
+		  break;
+		}
+	      else
+		return algs->size[i].alg;
+	    }
+	}
+      gcc_assert (TARGET_INLINE_ALL_STRINGOPS);
+    }
+  /* When asked to inline the call anyway, try to pick a meaningful choice.
+     We look for the maximal size of a block that is faster to copy by hand
+     and take blocks of at most that size, guessing that the average size
+     will be roughly half of the block.
+
+     If this turns out to be bad, we might simply specify the preferred
+     choice in ix86_costs.  */
+  if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
+      && algs->unknown_size == libcall)
+    {
+      int max = -1;
+      enum stringop_alg alg;
+      int i;
+
+      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
+	if (algs->size[i].alg != libcall && algs->size[i].alg)
+	  max = algs->size[i].max;
+      if (max == -1)
+	max = 4096;
+      alg = decide_alg (count, max / 2, memset, dynamic_check);
+      gcc_assert (*dynamic_check == -1);
+      gcc_assert (alg != libcall);
+      if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
+	*dynamic_check = max;
+      return alg;
+    }
+  return algs->unknown_size;
+}
+
+/* Decide on alignment.  We know that the operand is already aligned to ALIGN
+   (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
+static int
+decide_alignment (int align,
+		  enum stringop_alg alg,
+		  int expected_size)
+{
+  int desired_align = 0;
+  switch (alg)
+    {
+    case no_stringop:
+      gcc_unreachable ();
+    case loop:
+    case unrolled_loop:
+      desired_align = GET_MODE_SIZE (Pmode);
+      break;
+    case rep_prefix_8_byte:
+      desired_align = 8;
+      break;
+    case rep_prefix_4_byte:
+      /* PentiumPro has special logic triggering for 8-byte aligned blocks,
+	 copying a whole cache line at once.  */
+      if (TARGET_PENTIUMPRO)
+	desired_align = 8;
+      else
+	desired_align = 4;
+      break;
+    case rep_prefix_1_byte:
+      /* PentiumPro has special logic triggering for 8-byte aligned blocks,
+	 copying a whole cache line at once.  */
+      if (TARGET_PENTIUMPRO)
+	desired_align = 8;
+      else
+	desired_align = 1;
+      break;
+    case loop_1_byte:
+      desired_align = 1;
+      break;
+    case libcall:
+      return 0;
+    }
+
+  if (optimize_size)
+    desired_align = 1;
+  if (desired_align < align)
+    desired_align = align;
+  if (expected_size != -1 && expected_size < 4)
+    desired_align = align;
+  return desired_align;
+}
+
+/* Return the smallest power of 2 greater than VAL.  */
+static int
+smallest_pow2_greater_than (int val)
+{
+  int ret = 1;
+  while (ret <= val)
+    ret <<= 1;
+  return ret;
+}
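/* Illustration: the four-step shape documented just below, as one compact,
   hedged C model (not part of the patch).  Chunk size and desired alignment
   are fixed at 8 here; the expander instead picks them per CPU via decide_alg
   and decide_alignment.  */

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void
movmem_shape_model (char *dst, const char *src, size_t count)
{
  if (count >= 8)				/* 1) prologue guard */
    {
      while ((uintptr_t) dst & 7)		/* 2) alignment prologue */
	{ *dst++ = *src++; count--; }
      for (; count >= 8; count -= 8)		/* 3) main body, 8-byte chunks */
	{ memcpy (dst, src, 8); dst += 8; src += 8; }
    }
  while (count--)				/* 4) epilogue for the tail */
    *dst++ = *src++;
}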
+/* Expand string move (memcpy) operation.  Use i386 string operations when
+   profitable.  ix86_expand_setmem contains similar code.  The code depends
+   upon architecture, block size and alignment, but always has the same
+   overall structure:
+
+   1) Prologue guard: Conditional that jumps up to epilogues for small
+      blocks that can be handled by the epilogue alone.  This is faster but
+      also needed for correctness, since the prologue assumes the block is
+      larger than the desired alignment.
+
+      An optional dynamic check for size and a libcall for large blocks is
+      emitted here too, with -minline-stringops-dynamically.
+
+   2) Prologue: copy the first few bytes in order to get the destination
+      aligned to DESIRED_ALIGN.  It is emitted only when ALIGN is less than
+      DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be copied.
+      We emit either a jump tree on power-of-two sized blocks, or a byte
+      loop.
+
+   3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
+      with the specified algorithm.
+
+   4) Epilogue: code copying the tail of the block that is too small to be
+      handled by the main body (or up to the size guarded by the prologue
+      guard).  */

 int
-ix86_expand_clrmem (rtx dst, rtx count_exp, rtx align_exp)
+ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
+		    rtx expected_align_exp, rtx expected_size_exp)
 {
-  rtx destreg, zeroreg, countreg, destexp;
-  enum machine_mode counter_mode;
-  HOST_WIDE_INT align = 0;
+  rtx destreg;
+  rtx srcreg;
+  rtx label = NULL;
+  rtx tmp;
+  rtx jump_around_label = NULL;
+  HOST_WIDE_INT align = 1;
   unsigned HOST_WIDE_INT count = 0;
+  HOST_WIDE_INT expected_size = -1;
+  int size_needed = 0, epilogue_size_needed;
+  int desired_align = 0;
+  enum stringop_alg alg;
+  int dynamic_check;

-  if (GET_CODE (align_exp) == CONST_INT)
+  if (CONST_INT_P (align_exp))
     align = INTVAL (align_exp);
+  /* i386 can do misaligned access at a reasonably increased cost.  */
+  if (CONST_INT_P (expected_align_exp)
+      && INTVAL (expected_align_exp) > align)
+    align = INTVAL (expected_align_exp);
+  if (CONST_INT_P (count_exp))
+    count = expected_size = INTVAL (count_exp);
+  if (CONST_INT_P (expected_size_exp) && count == 0)
+    expected_size = INTVAL (expected_size_exp);

-  /* Can't use any of this if the user has appropriated esi.  */
-  if (global_regs[4])
-    return 0;
+  /* Step 0: Decide on preferred algorithm, desired alignment and
+     size of chunks to be copied by main loop.  */
+
+  alg = decide_alg (count, expected_size, false, &dynamic_check);
+  desired_align = decide_alignment (align, alg, expected_size);

-  /* This simple hack avoids all inlining code and simplifies code below.  */
   if (!TARGET_ALIGN_STRINGOPS)
-    align = 32;
+    align = desired_align;

-  if (GET_CODE (count_exp) == CONST_INT)
+  if (alg == libcall)
+    return 0;
+  gcc_assert (alg != no_stringop);
+  if (!count)
+    count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
+  destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
+  srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
+  switch (alg)
     {
-      count = INTVAL (count_exp);
-      if (!TARGET_INLINE_ALL_STRINGOPS && count > 64)
-	return 0;
+    case libcall:
+    case no_stringop:
+      gcc_unreachable ();
+    case loop:
+      size_needed = GET_MODE_SIZE (Pmode);
+      break;
+    case unrolled_loop:
+      size_needed = GET_MODE_SIZE (Pmode) * (TARGET_64BIT ? 4 : 2);
+      break;
+    case rep_prefix_8_byte:
+      size_needed = 8;
+      break;
+    case rep_prefix_4_byte:
+      size_needed = 4;
+      break;
+    case rep_prefix_1_byte:
+    case loop_1_byte:
+      size_needed = 1;
+      break;
     }
-  /* Figure out proper mode for counter.  For 32bits it is always SImode,
-     for 64bits use SImode when possible, otherwise DImode.
-     Set count to number of bytes copied when known at compile time.  */
-  if (!TARGET_64BIT
-      || GET_MODE (count_exp) == SImode
-      || x86_64_zext_immediate_operand (count_exp, VOIDmode))
-    counter_mode = SImode;
-  else
-    counter_mode = DImode;

-  destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
-  if (destreg != XEXP (dst, 0))
-    dst = replace_equiv_address_nv (dst, destreg);
+  epilogue_size_needed = size_needed;

+  /* Step 1: Prologue guard.  */

-  /* When optimizing for size emit simple rep ; movsb instruction for
-     counts not divisible by 4.  The movl $N, %ecx; rep; stosb
-     sequence is 7 bytes long, so if optimizing for size and count is
-     small enough that some stosl, stosw and stosb instructions without
-     rep are shorter, fall back into the next if.  */
+  /* Alignment code needs count to be in register.
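     (A compile-time constant COUNT must be forced into a register here
     because the alignment prologue updates the residual count with
     ix86_adjust_counter; DImode is only needed when the constant does not
     fit in 32 bits.)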
+     */
+  if (CONST_INT_P (count_exp) && desired_align > align)
+    {
+      enum machine_mode mode = SImode;
+      if (TARGET_64BIT && (count & ~0xffffffff))
+	mode = DImode;
+      count_exp = force_reg (mode, count_exp);
+    }
+  gcc_assert (desired_align >= 1 && align >= 1);

-  if ((!optimize || optimize_size)
-      && (count == 0
-	  || ((count & 0x03)
-	      && (!optimize_size || (count & 0x03) + (count >> 2) > 7))))
+  /* Ensure that the alignment prologue won't copy past the end of the
+     block.  */
+  if ((size_needed > 1 || (desired_align > 1 && desired_align > align))
+      && !count)
     {
-      emit_insn (gen_cld ());
+      epilogue_size_needed = MAX (size_needed - 1, desired_align - align);

-      countreg = ix86_zero_extend_to_Pmode (count_exp);
-      zeroreg = copy_to_mode_reg (QImode, const0_rtx);
-      destexp = gen_rtx_PLUS (Pmode, destreg, countreg);
-      emit_insn (gen_rep_stos (destreg, countreg, dst, zeroreg, destexp));
+      /* The epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
+	 bytes.  Make sure it is a power of 2.  */
+      epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
+
+      label = gen_label_rtx ();
+      emit_cmp_and_jump_insns (count_exp,
+			       GEN_INT (epilogue_size_needed),
+			       LTU, 0, GET_MODE (count_exp), 1, label);
+      if (expected_size == -1 || expected_size < epilogue_size_needed)
+	predict_jump (REG_BR_PROB_BASE * 60 / 100);
+      else
+	predict_jump (REG_BR_PROB_BASE * 20 / 100);
     }
-  else if (count != 0
-	   && (align >= 8
-	       || (!TARGET_PENTIUMPRO && !TARGET_64BIT && align >= 4)
-	       || optimize_size || count < (unsigned int) 64))
+  /* Emit code to decide at run time whether a library call or inline code
+     should be used.  */
+  if (dynamic_check != -1)
     {
-      int size = TARGET_64BIT && !optimize_size ? 8 : 4;
-      unsigned HOST_WIDE_INT offset = 0;
+      rtx hot_label = gen_label_rtx ();
+      jump_around_label = gen_label_rtx ();
+      emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
+			       LEU, 0, GET_MODE (count_exp), 1, hot_label);
+      predict_jump (REG_BR_PROB_BASE * 90 / 100);
+      emit_block_move_via_libcall (dst, src, count_exp, false);
+      emit_jump (jump_around_label);
+      emit_label (hot_label);
+    }

-      emit_insn (gen_cld ());
+  /* Step 2: Alignment prologue.  */

-      zeroreg = copy_to_mode_reg (size == 4 ? SImode : DImode, const0_rtx);
-      if (count & ~(size - 1))
-	{
-	  unsigned HOST_WIDE_INT repcount;
-	  unsigned int max_nonrep;
+  if (desired_align > align)
+    {
+      /* Except for the first move in the epilogue, we no longer know
+	 the constant offset in aliasing info.  It doesn't seem worth
+	 the pain to maintain it for the first move, so throw away
+	 the info early.  */
+      src = change_address (src, BLKmode, srcreg);
+      dst = change_address (dst, BLKmode, destreg);
+      expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
+			      desired_align);
+    }
+  if (label && size_needed == 1)
+    {
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+      label = NULL;
+    }

-	  repcount = count >> (size == 4 ? 2 : 3);
-	  if (!TARGET_64BIT)
-	    repcount &= 0x3fffffff;
+  /* Step 3: Main loop.  */

-	  /* movl $N, %ecx; rep; stosl is 7 bytes, while N x stosl is N bytes.
-	     movl $N, %ecx; rep; stosq is 8 bytes, while N x stosq is 2xN
-	     bytes.  In both cases the latter seems to be faster for small
-	     values of N.  */
-	  max_nonrep = size == 4 ? 7 : 4;
-	  if (!optimize_size)
-	    switch (ix86_tune)
-	      {
-	      case PROCESSOR_PENTIUM4:
-	      case PROCESSOR_NOCONA:
-		max_nonrep = 3;
-		break;
-	      default:
-		break;
-	      }
+  switch (alg)
+    {
+    case libcall:
+    case no_stringop:
+      gcc_unreachable ();
+    case loop_1_byte:
+      expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
+				     count_exp, QImode, 1, expected_size);
+      break;
+    case loop:
+      expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
+				     count_exp, Pmode, 1, expected_size);
+      break;
+    case unrolled_loop:
+      /* Unroll only by a factor of 2 in 32-bit mode, since we don't have
+	 enough registers for 4 temporaries anyway.  */
+      expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
+				     count_exp, Pmode, TARGET_64BIT ? 4 : 2,
+				     expected_size);
+      break;
+    case rep_prefix_8_byte:
+      expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
+				 DImode);
+      break;
+    case rep_prefix_4_byte:
+      expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
+				 SImode);
+      break;
+    case rep_prefix_1_byte:
+      expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
+				 QImode);
+      break;
+    }
+  /* Properly adjust the offsets of src and dest memory for aliasing.  */
+  if (CONST_INT_P (count_exp))
+    {
+      src = adjust_automodify_address_nv (src, BLKmode, srcreg,
+					  (count / size_needed) * size_needed);
+      dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
+					  (count / size_needed) * size_needed);
+    }
+  else
+    {
+      src = change_address (src, BLKmode, srcreg);
+      dst = change_address (dst, BLKmode, destreg);
+    }

-	  if (repcount <= max_nonrep)
-	    while (repcount-- > 0)
-	      {
-		rtx mem = adjust_automodify_address_nv (dst,
-							GET_MODE (zeroreg),
-							destreg, offset);
-		emit_insn (gen_strset (destreg, mem, zeroreg));
-		offset += size;
-	      }
-	  else
-	    {
-	      countreg = copy_to_mode_reg (counter_mode, GEN_INT (repcount));
-	      countreg = ix86_zero_extend_to_Pmode (countreg);
-	      destexp = gen_rtx_ASHIFT (Pmode, countreg,
-					GEN_INT (size == 4 ? 2 : 3));
-	      destexp = gen_rtx_PLUS (Pmode, destexp, destreg);
-	      emit_insn (gen_rep_stos (destreg, countreg, dst, zeroreg,
-				       destexp));
-	      offset = count & ~(size - 1);
-	    }
-	}
-      if (size == 8 && (count & 0x04))
-	{
-	  rtx mem = adjust_automodify_address_nv (dst, SImode, destreg,
-						  offset);
-	  emit_insn (gen_strset (destreg, mem,
-				 gen_rtx_SUBREG (SImode, zeroreg, 0)));
-	  offset += 4;
-	}
-      if (count & 0x02)
+  /* Step 4: Epilogue to copy the remaining bytes.  */
+
+  if (label)
+    {
+      /* When the main loop is done, COUNT_EXP might hold the original count,
+	 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
+	 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
+	 bytes.  Compensate if needed.  */
+
+      if (size_needed < epilogue_size_needed)
 	{
-	  rtx mem = adjust_automodify_address_nv (dst, HImode, destreg,
-						  offset);
-	  emit_insn (gen_strset (destreg, mem,
-				 gen_rtx_SUBREG (HImode, zeroreg, 0)));
-	  offset += 2;
+	  tmp =
+	    expand_simple_binop (GET_MODE (count_exp), AND, count_exp,
+				 GEN_INT (size_needed - 1), count_exp, 1,
+				 OPTAB_DIRECT);
+	  if (tmp != count_exp)
+	    emit_move_insn (count_exp, tmp);
 	}
-      if (count & 0x01)
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+
+  if (count_exp != const0_rtx && epilogue_size_needed > 1)
+    expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
+			    epilogue_size_needed);
+  if (jump_around_label)
+    emit_label (jump_around_label);
+  return 1;
+}
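/* Illustration: the two value-promotion strategies the helper below chooses
   between, as hedged host-side C (not part of the patch).  Both return the
   same replicated pattern; the expander picks whichever is cheaper according
   to the ix86_cost tables (one multiply versus a shift/or ladder).  */

#include <stdint.h>

static uint64_t
promote_by_mult (uint8_t v)	/* CPUs with a fast multiplier */
{
  return (uint64_t) v * 0x0101010101010101ULL;
}

static uint64_t
promote_by_shifts (uint8_t v)	/* CPUs with a slow multiplier */
{
  uint64_t r = v;
  r |= r << 8;			/* 0x00XY -> 0xXYXY */
  r |= r << 16;			/* -> 0xXYXYXYXY */
  r |= r << 32;			/* -> full 64-bit pattern */
  return r;
}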
+/* Helper function for memset.  For QImode value 0xXY produce
+   0xXYXYXYXY of the width specified by MODE.  This is essentially
+   a * 0x10101010, but we can do slightly better than
+   synth_mult by unwinding the sequence by hand on CPUs with
+   slow multiply.  */
+static rtx
+promote_duplicated_reg (enum machine_mode mode, rtx val)
+{
+  enum machine_mode valmode = GET_MODE (val);
+  rtx tmp;
+  int nops = mode == DImode ? 3 : 2;
+
+  gcc_assert (mode == SImode || mode == DImode);
+  if (val == const0_rtx)
+    return copy_to_mode_reg (mode, const0_rtx);
+  if (CONST_INT_P (val))
+    {
+      HOST_WIDE_INT v = INTVAL (val) & 255;
+
+      v |= v << 8;
+      v |= v << 16;
+      if (mode == DImode)
+	v |= (v << 16) << 16;
+      return copy_to_mode_reg (mode, gen_int_mode (v, mode));
+    }
+
+  if (valmode == VOIDmode)
+    valmode = QImode;
+  if (valmode != QImode)
+    val = gen_lowpart (QImode, val);
+  if (mode == QImode)
+    return val;
+  if (!TARGET_PARTIAL_REG_STALL)
+    nops--;
+  if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
+      + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
+      <= (ix86_cost->shift_const + ix86_cost->add) * nops
+	 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
+    {
+      rtx reg = convert_modes (mode, QImode, val, true);
+      tmp = promote_duplicated_reg (mode, const1_rtx);
+      return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
+				  OPTAB_DIRECT);
+    }
+  else
+    {
+      rtx reg = convert_modes (mode, QImode, val, true);
+
+      if (!TARGET_PARTIAL_REG_STALL)
+	{
+	  if (mode == SImode)
+	    emit_insn (gen_movsi_insv_1 (reg, reg));
+	  else
+	    emit_insn (gen_movdi_insv_1_rex64 (reg, reg));
+	}
+      else
 	{
-	  rtx mem = adjust_automodify_address_nv (dst, QImode, destreg,
-						  offset);
-	  emit_insn (gen_strset (destreg, mem,
-				 gen_rtx_SUBREG (QImode, zeroreg, 0)));
-	}
+	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
+				     NULL, 1, OPTAB_DIRECT);
+	  reg =
+	    expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
+	}
+      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
+				 NULL, 1, OPTAB_DIRECT);
+      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
+      if (mode == SImode)
+	return reg;
+      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
+				 NULL, 1, OPTAB_DIRECT);
+      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
+      return reg;
     }
+}
+
+/* Duplicate value VAL using promote_duplicated_reg into the maximal size
+   that will be needed by the main loop copying SIZE_NEEDED chunks and by
+   the prologue getting alignment from ALIGN to DESIRED_ALIGN.  */
+static rtx
+promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
+{
+  rtx promoted_val;
+
+  if (TARGET_64BIT
+      && (size_needed > 4 || (desired_align > align && desired_align > 4)))
+    promoted_val = promote_duplicated_reg (DImode, val);
+  else if (size_needed > 2 || (desired_align > align && desired_align > 2))
+    promoted_val = promote_duplicated_reg (SImode, val);
+  else if (size_needed > 1 || (desired_align > align && desired_align > 1))
+    promoted_val = promote_duplicated_reg (HImode, val);
   else
+    promoted_val = val;
+
+  return promoted_val;
+}
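/* Illustration: a hedged restatement (not part of the patch) of the width
   choice above.  The promoted pattern must cover the widest store either the
   main loop (SIZE_NEEDED) or the alignment prologue (when DESIRED_ALIGN >
   ALIGN) will emit.  */

static int
promoted_width_model (int size_needed, int desired_align, int align,
		      int have_64bit)
{
  int prologue_need = desired_align > align ? desired_align : 0;
  int need = size_needed > prologue_need ? size_needed : prologue_need;

  if (have_64bit && need > 4)
    return 8;			/* DImode */
  if (need > 2)
    return 4;			/* SImode */
  if (need > 1)
    return 2;			/* HImode */
  return 1;			/* keep the QImode value */
}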
+/* Expand string set (memset/bzero) operation.  Use i386 string operations
+   when profitable.  See the expand_movmem comment for an explanation of
+   the individual steps performed.  */
+int
+ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
+		    rtx expected_align_exp, rtx expected_size_exp)
+{
+  rtx destreg;
+  rtx label = NULL;
+  rtx tmp;
+  rtx jump_around_label = NULL;
+  HOST_WIDE_INT align = 1;
+  unsigned HOST_WIDE_INT count = 0;
+  HOST_WIDE_INT expected_size = -1;
+  int size_needed = 0, epilogue_size_needed;
+  int desired_align = 0;
+  enum stringop_alg alg;
+  rtx promoted_val = NULL;
+  bool force_loopy_epilogue = false;
+  int dynamic_check;
+
+  if (CONST_INT_P (align_exp))
+    align = INTVAL (align_exp);
+  /* i386 can do misaligned access at a reasonably increased cost.  */
+  if (CONST_INT_P (expected_align_exp)
+      && INTVAL (expected_align_exp) > align)
+    align = INTVAL (expected_align_exp);
+  if (CONST_INT_P (count_exp))
+    count = expected_size = INTVAL (count_exp);
+  if (CONST_INT_P (expected_size_exp) && count == 0)
+    expected_size = INTVAL (expected_size_exp);
+
+  /* Step 0: Decide on preferred algorithm, desired alignment and
+     size of chunks to be copied by main loop.  */
+
+  alg = decide_alg (count, expected_size, true, &dynamic_check);
+  desired_align = decide_alignment (align, alg, expected_size);
+
+  if (!TARGET_ALIGN_STRINGOPS)
+    align = desired_align;
+
+  if (alg == libcall)
+    return 0;
+  gcc_assert (alg != no_stringop);
+  if (!count)
+    count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
+  destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
+  switch (alg)
     {
-      rtx countreg2;
-      rtx label = NULL;
-      /* Compute desired alignment of the string operation.  */
-      int desired_alignment = (TARGET_PENTIUMPRO
-			       && (count == 0 || count >= (unsigned int) 260)
-			       ? 8 : UNITS_PER_WORD);
-
-      /* In case we don't know anything about the alignment, default to
-	 library version, since it is usually equally fast and result in
-	 shorter code.
-
-	 Also emit call when we know that the count is large and call overhead
-	 will not be important.  */
-      if (!TARGET_INLINE_ALL_STRINGOPS
-	  && (align < UNITS_PER_WORD || !TARGET_REP_MOVL_OPTIMAL))
-	return 0;
+    case libcall:
+    case no_stringop:
+      gcc_unreachable ();
+    case loop:
+      size_needed = GET_MODE_SIZE (Pmode);
+      break;
+    case unrolled_loop:
+      size_needed = GET_MODE_SIZE (Pmode) * 4;
+      break;
+    case rep_prefix_8_byte:
+      size_needed = 8;
+      break;
+    case rep_prefix_4_byte:
+      size_needed = 4;
+      break;
+    case rep_prefix_1_byte:
+    case loop_1_byte:
+      size_needed = 1;
+      break;
+    }
+  epilogue_size_needed = size_needed;
+
+  /* Step 1: Prologue guard.  */
+
+  /* Alignment code needs count to be in register.  */
+  if (CONST_INT_P (count_exp) && desired_align > align)
+    {
+      enum machine_mode mode = SImode;
+      if (TARGET_64BIT && (count & ~0xffffffff))
+	mode = DImode;
+      count_exp = force_reg (mode, count_exp);
+    }
+  /* Do the cheap promotion to allow better CSE across the
+     main loop and epilogue (i.e. one load of the big constant in
+     front of all code).  */
+  if (CONST_INT_P (val_exp))
+    promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
+						   desired_align, align);
+  /* Ensure that the alignment prologue won't copy past the end of the
+     block.  */
+  if ((size_needed > 1 || (desired_align > 1 && desired_align > align))
+      && !count)
+    {
+      epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
+
+      /* The epilogue always sets COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
+	 bytes.  Make sure it is a power of 2.  */
+      epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
+
+      /* To improve performance of small blocks, we jump around the VAL
+	 promoting mode.  This means that if the promoted VAL is not constant,
+	 we might not use it in the epilogue and have to use the byte
+	 loop variant.  */
+      if (epilogue_size_needed > 2 && !promoted_val)
+	force_loopy_epilogue = true;
+      label = gen_label_rtx ();
+      emit_cmp_and_jump_insns (count_exp,
+			       GEN_INT (epilogue_size_needed),
+			       LTU, 0, GET_MODE (count_exp), 1, label);
+      if (expected_size == -1 || expected_size <= epilogue_size_needed)
+	predict_jump (REG_BR_PROB_BASE * 60 / 100);
+      else
+	predict_jump (REG_BR_PROB_BASE * 20 / 100);
+    }
+  if (dynamic_check != -1)
+    {
+      rtx hot_label = gen_label_rtx ();
+      jump_around_label = gen_label_rtx ();
+      emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
+			       LEU, 0, GET_MODE (count_exp), 1, hot_label);
+      predict_jump (REG_BR_PROB_BASE * 90 / 100);
+      set_storage_via_libcall (dst, count_exp, val_exp, false);
+      emit_jump (jump_around_label);
+      emit_label (hot_label);
+    }

-      if (TARGET_SINGLE_STRINGOP)
-	emit_insn (gen_cld ());
+  /* Step 2: Alignment prologue.  */

-      countreg2 = gen_reg_rtx (Pmode);
-      countreg = copy_to_mode_reg (counter_mode, count_exp);
-      zeroreg = copy_to_mode_reg (Pmode, const0_rtx);
-      /* Get rid of MEM_OFFSET, it won't be accurate.  */
+  /* Do the expensive promotion once we branched off the small blocks.  */
+  if (!promoted_val)
+    promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
+						   desired_align, align);
+  gcc_assert (desired_align >= 1 && align >= 1);
+
+  if (desired_align > align)
+    {
+      /* Except for the first move in the epilogue, we no longer know
+	 the constant offset in aliasing info.  It doesn't seem worth
+	 the pain to maintain it for the first move, so throw away
+	 the info early.  */
       dst = change_address (dst, BLKmode, destreg);
+      expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
+			      desired_align);
+    }
+  if (label && size_needed == 1)
+    {
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+      label = NULL;
+    }

-      if (count == 0 && align < desired_alignment)
-	{
-	  label = gen_label_rtx ();
-	  emit_cmp_and_jump_insns (countreg, GEN_INT (desired_alignment - 1),
-				   LEU, 0, counter_mode, 1, label);
-	}
-      if (align <= 1)
-	{
-	  rtx label = ix86_expand_aligntest (destreg, 1);
-	  emit_insn (gen_strset (destreg, dst,
-				 gen_rtx_SUBREG (QImode, zeroreg, 0)));
-	  ix86_adjust_counter (countreg, 1);
-	  emit_label (label);
-	  LABEL_NUSES (label) = 1;
-	}
-      if (align <= 2)
-	{
-	  rtx label = ix86_expand_aligntest (destreg, 2);
-	  emit_insn (gen_strset (destreg, dst,
-				 gen_rtx_SUBREG (HImode, zeroreg, 0)));
-	  ix86_adjust_counter (countreg, 2);
-	  emit_label (label);
-	  LABEL_NUSES (label) = 1;
-	}
-      if (align <= 4 && desired_alignment > 4)
-	{
-	  rtx label = ix86_expand_aligntest (destreg, 4);
-	  emit_insn (gen_strset (destreg, dst,
-				 (TARGET_64BIT
-				  ? gen_rtx_SUBREG (SImode, zeroreg, 0)
-				  : zeroreg)));
-	  ix86_adjust_counter (countreg, 4);
-	  emit_label (label);
-	  LABEL_NUSES (label) = 1;
-	}
+  /* Step 3: Main loop.  */

-      if (label && desired_alignment > 4 && !TARGET_64BIT)
-	{
-	  emit_label (label);
-	  LABEL_NUSES (label) = 1;
-	  label = NULL_RTX;
-	}
+  switch (alg)
+    {
+    case libcall:
+    case no_stringop:
+      gcc_unreachable ();
+    case loop_1_byte:
+      expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
+				     count_exp, QImode, 1, expected_size);
+      break;
+    case loop:
+      expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
+				     count_exp, Pmode, 1, expected_size);
+      break;
+    case unrolled_loop:
+      expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
+				     count_exp, Pmode, 4, expected_size);
+      break;
+    case rep_prefix_8_byte:
+      expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
+				  DImode);
+      break;
+    case rep_prefix_4_byte:
+      expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
+				  SImode);
+      break;
+    case rep_prefix_1_byte:
+      expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
+				  QImode);
+      break;
+    }
+  /* Properly adjust the offset of the destination memory for aliasing.  */
+  if (CONST_INT_P (count_exp))
+    dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
+					(count / size_needed) * size_needed);
+  else
+    dst = change_address (dst, BLKmode, destreg);

-      if (!TARGET_SINGLE_STRINGOP)
-	emit_insn (gen_cld ());
-      if (TARGET_64BIT)
-	{
-	  emit_insn (gen_lshrdi3 (countreg2, ix86_zero_extend_to_Pmode (countreg),
-				  GEN_INT (3)));
-	  destexp = gen_rtx_ASHIFT (Pmode, countreg2, GEN_INT (3));
-	}
-      else
-	{
-	  emit_insn (gen_lshrsi3 (countreg2, countreg, const2_rtx));
-	  destexp = gen_rtx_ASHIFT (Pmode, countreg2, const2_rtx);
-	}
-      destexp = gen_rtx_PLUS (Pmode, destexp, destreg);
-      emit_insn (gen_rep_stos (destreg, countreg2, dst, zeroreg, destexp));
+  /* Step 4: Epilogue to set the remaining bytes.  */

-      if (label)
-	{
-	  emit_label (label);
-	  LABEL_NUSES (label) = 1;
-	}
+  if (label)
+    {
+      /* When the main loop is done, COUNT_EXP might hold the original count,
+	 while we want to set only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
+	 Epilogue code will actually set COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
+	 bytes.  Compensate if needed.
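	 For example, if the main loop stores 4-byte chunks but the epilogue
	 was sized for up to 7 trailing bytes, masking COUNT_EXP with
	 SIZE_NEEDED - 1 == 3 keeps the epilogue from storing bytes the main
	 loop has already handled.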
*/ - if (TARGET_64BIT && align > 4 && count != 0 && (count & 4)) - emit_insn (gen_strset (destreg, dst, - gen_rtx_SUBREG (SImode, zeroreg, 0))); - if (TARGET_64BIT && (align <= 4 || count == 0)) - { - rtx label = ix86_expand_aligntest (countreg, 4); - emit_insn (gen_strset (destreg, dst, - gen_rtx_SUBREG (SImode, zeroreg, 0))); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (align > 2 && count != 0 && (count & 2)) - emit_insn (gen_strset (destreg, dst, - gen_rtx_SUBREG (HImode, zeroreg, 0))); - if (align <= 2 || count == 0) + if (size_needed < desired_align - align) { - rtx label = ix86_expand_aligntest (countreg, 2); - emit_insn (gen_strset (destreg, dst, - gen_rtx_SUBREG (HImode, zeroreg, 0))); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (align > 1 && count != 0 && (count & 1)) - emit_insn (gen_strset (destreg, dst, - gen_rtx_SUBREG (QImode, zeroreg, 0))); - if (align <= 1 || count == 0) - { - rtx label = ix86_expand_aligntest (countreg, 1); - emit_insn (gen_strset (destreg, dst, - gen_rtx_SUBREG (QImode, zeroreg, 0))); - emit_label (label); - LABEL_NUSES (label) = 1; + tmp = + expand_simple_binop (GET_MODE (count_exp), AND, count_exp, + GEN_INT (size_needed - 1), count_exp, 1, + OPTAB_DIRECT); + size_needed = desired_align - align + 1; + if (tmp != count_exp) + emit_move_insn (count_exp, tmp); } + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (count_exp != const0_rtx && epilogue_size_needed > 1) + { + if (force_loopy_epilogue) + expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp, + size_needed); + else + expand_setmem_epilogue (dst, destreg, promoted_val, count_exp, + size_needed); } + if (jump_around_label) + emit_label (jump_around_label); return 1; } @@ -12864,7 +14578,7 @@ ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align) if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1 && !TARGET_INLINE_ALL_STRINGOPS && !optimize_size - && (GET_CODE (align) != CONST_INT || INTVAL (align) < 4)) + && (!CONST_INT_P (align) || INTVAL (align) < 4)) return 0; addr = force_reg (Pmode, XEXP (src, 0)); @@ -12903,7 +14617,6 @@ ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align) emit_move_insn (scratch3, addr); eoschar = force_reg (QImode, eoschar); - emit_insn (gen_cld ()); src = replace_equiv_address_nv (src, scratch3); /* If .md starts supporting :P, this can be done in .md. */ @@ -12950,7 +14663,7 @@ ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx) rtx cmp; align = 0; - if (GET_CODE (align_rtx) == CONST_INT) + if (CONST_INT_P (align_rtx)) align = INTVAL (align_rtx); /* Loop to check 1..3 bytes for null to get an aligned pointer. */ @@ -13126,15 +14839,21 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, pop = NULL; gcc_assert (!TARGET_64BIT || !pop); + if (TARGET_MACHO && !TARGET_64BIT) + { #if TARGET_MACHO - if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) - fnaddr = machopic_indirect_call_target (fnaddr); -#else - /* Static functions and indirect calls don't need the pic register. */ - if (! TARGET_64BIT && flag_pic - && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF - && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0))) - use_reg (&use, pic_offset_table_rtx); + if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF) + fnaddr = machopic_indirect_call_target (fnaddr); +#endif + } + else + { + /* Static functions and indirect calls don't need the pic register. */ + if (! TARGET_64BIT && flag_pic + && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF + && ! 
SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0))) + use_reg (&use, pic_offset_table_rtx); + } if (TARGET_64BIT && INTVAL (callarg2) >= 0) { @@ -13142,7 +14861,6 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, emit_move_insn (al, callarg2); use_reg (&use, al); } -#endif /* TARGET_MACHO */ if (! call_insn_operand (XEXP (fnaddr, 0), Pmode)) { @@ -13154,7 +14872,7 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, { rtx addr; addr = copy_to_mode_reg (Pmode, XEXP (fnaddr, 0)); - fnaddr = gen_rtx_REG (Pmode, FIRST_REX_INT_REG + 3 /* R11 */); + fnaddr = gen_rtx_REG (Pmode, R11_REG); emit_move_insn (fnaddr, addr); fnaddr = gen_rtx_MEM (QImode, fnaddr); } @@ -13206,7 +14924,7 @@ assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n) for (s = ix86_stack_locals; s; s = s->next) if (s->mode == mode && s->n == n) - return s->rtl; + return copy_rtx (s->rtl); s = (struct stack_local_entry *) ggc_alloc (sizeof (struct stack_local_entry)); @@ -13390,7 +15108,7 @@ ix86_attr_length_address_default (rtx insn) extract_insn_cached (insn); for (i = recog_data.n_operands - 1; i >= 0; --i) - if (GET_CODE (recog_data.operand[i]) == MEM) + if (MEM_P (recog_data.operand[i])) { return memory_address_length (XEXP (recog_data.operand[i], 0)); break; @@ -13413,11 +15131,15 @@ ix86_issue_rate (void) case PROCESSOR_PENTIUM4: case PROCESSOR_ATHLON: case PROCESSOR_K8: + case PROCESSOR_AMDFAM10: case PROCESSOR_NOCONA: case PROCESSOR_GENERIC32: case PROCESSOR_GENERIC64: return 3; + case PROCESSOR_CORE2: + return 4; + default: return 1; } @@ -13427,7 +15149,7 @@ ix86_issue_rate (void) by DEP_INSN and nothing set by DEP_INSN. */ static int -ix86_flags_dependant (rtx insn, rtx dep_insn, enum attr_type insn_type) +ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type) { rtx set, set2; @@ -13454,7 +15176,7 @@ ix86_flags_dependant (rtx insn, rtx dep_insn, enum attr_type insn_type) else return 0; - if (GET_CODE (set) != REG || REGNO (set) != FLAGS_REG) + if (!REG_P (set) || REGNO (set) != FLAGS_REG) return 0; /* This test is true if the dependent insn reads the flags but @@ -13472,7 +15194,7 @@ ix86_flags_dependant (rtx insn, rtx dep_insn, enum attr_type insn_type) address with operands set by DEP_INSN. */ static int -ix86_agi_dependant (rtx insn, rtx dep_insn, enum attr_type insn_type) +ix86_agi_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type) { rtx addr; @@ -13483,9 +15205,9 @@ ix86_agi_dependant (rtx insn, rtx dep_insn, enum attr_type insn_type) if (GET_CODE (addr) == PARALLEL) addr = XVECEXP (addr, 0, 0); - + gcc_assert (GET_CODE (addr) == SET); - + addr = SET_SRC (addr); } else @@ -13493,7 +15215,7 @@ ix86_agi_dependant (rtx insn, rtx dep_insn, enum attr_type insn_type) int i; extract_insn_cached (insn); for (i = recog_data.n_operands - 1; i >= 0; --i) - if (GET_CODE (recog_data.operand[i]) == MEM) + if (MEM_P (recog_data.operand[i])) { addr = XEXP (recog_data.operand[i], 0); goto found; @@ -13530,17 +15252,17 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) { case PROCESSOR_PENTIUM: /* Address Generation Interlock adds a cycle of latency. */ - if (ix86_agi_dependant (insn, dep_insn, insn_type)) + if (ix86_agi_dependent (insn, dep_insn, insn_type)) cost += 1; /* ??? Compares pair with jump/setcc. */ - if (ix86_flags_dependant (insn, dep_insn, insn_type)) + if (ix86_flags_dependent (insn, dep_insn, insn_type)) cost = 0; /* Floating point stores require value to be ready one cycle earlier. 
*/ if (insn_type == TYPE_FMOV && get_attr_memory (insn) == MEMORY_STORE - && !ix86_agi_dependant (insn, dep_insn, insn_type)) + && !ix86_agi_dependent (insn, dep_insn, insn_type)) cost += 1; break; @@ -13556,14 +15278,14 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) && (set = single_set (dep_insn)) != NULL_RTX && (set2 = single_set (insn)) != NULL_RTX && rtx_equal_p (SET_DEST (set), SET_SRC (set2)) - && GET_CODE (SET_DEST (set2)) == MEM) + && MEM_P (SET_DEST (set2))) cost += 1; /* Show ability of reorder buffer to hide latency of load by executing in parallel with previous instruction in case previous instruction is not needed to compute the address. */ if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) - && !ix86_agi_dependant (insn, dep_insn, insn_type)) + && !ix86_agi_dependent (insn, dep_insn, insn_type)) { /* Claim moves to take one cycle, as core can issue one load at time and the next load can start cycle later. */ @@ -13592,7 +15314,7 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) in parallel with previous instruction in case previous instruction is not needed to compute the address. */ if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) - && !ix86_agi_dependant (insn, dep_insn, insn_type)) + && !ix86_agi_dependent (insn, dep_insn, insn_type)) { /* Claim moves to take one cycle, as core can issue one load at time and the next load can start cycle later. */ @@ -13608,6 +15330,7 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) case PROCESSOR_ATHLON: case PROCESSOR_K8: + case PROCESSOR_AMDFAM10: case PROCESSOR_GENERIC32: case PROCESSOR_GENERIC64: memory = get_attr_memory (insn); @@ -13616,7 +15339,7 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) in parallel with previous instruction in case previous instruction is not needed to compute the address. */ if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) - && !ix86_agi_dependant (insn, dep_insn, insn_type)) + && !ix86_agi_dependent (insn, dep_insn, insn_type)) { enum attr_unit unit = get_attr_unit (insn); int loadcost = 3; @@ -13694,7 +15417,7 @@ ix86_constant_alignment (tree exp, int align) int ix86_data_alignment (tree type, int align) { - int max_align = optimize_size ? BITS_PER_WORD : 256; + int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT); if (AGGREGATE_TYPE_P (type) && TYPE_SIZE (type) @@ -14285,6 +16008,49 @@ enum ix86_builtins IX86_BUILTIN_MONITOR, IX86_BUILTIN_MWAIT, + /* SSSE3. */ + IX86_BUILTIN_PHADDW, + IX86_BUILTIN_PHADDD, + IX86_BUILTIN_PHADDSW, + IX86_BUILTIN_PHSUBW, + IX86_BUILTIN_PHSUBD, + IX86_BUILTIN_PHSUBSW, + IX86_BUILTIN_PMADDUBSW, + IX86_BUILTIN_PMULHRSW, + IX86_BUILTIN_PSHUFB, + IX86_BUILTIN_PSIGNB, + IX86_BUILTIN_PSIGNW, + IX86_BUILTIN_PSIGND, + IX86_BUILTIN_PALIGNR, + IX86_BUILTIN_PABSB, + IX86_BUILTIN_PABSW, + IX86_BUILTIN_PABSD, + + IX86_BUILTIN_PHADDW128, + IX86_BUILTIN_PHADDD128, + IX86_BUILTIN_PHADDSW128, + IX86_BUILTIN_PHSUBW128, + IX86_BUILTIN_PHSUBD128, + IX86_BUILTIN_PHSUBSW128, + IX86_BUILTIN_PMADDUBSW128, + IX86_BUILTIN_PMULHRSW128, + IX86_BUILTIN_PSHUFB128, + IX86_BUILTIN_PSIGNB128, + IX86_BUILTIN_PSIGNW128, + IX86_BUILTIN_PSIGND128, + IX86_BUILTIN_PALIGNR128, + IX86_BUILTIN_PABSB128, + IX86_BUILTIN_PABSW128, + IX86_BUILTIN_PABSD128, + + /* AMDFAM10 - SSE4A New Instructions. 
   */
+  IX86_BUILTIN_MOVNTSD,
+  IX86_BUILTIN_MOVNTSS,
+  IX86_BUILTIN_EXTRQI,
+  IX86_BUILTIN_EXTRQ,
+  IX86_BUILTIN_INSERTQI,
+  IX86_BUILTIN_INSERTQ,
+
   IX86_BUILTIN_VEC_INIT_V2SI,
   IX86_BUILTIN_VEC_INIT_V4HI,
   IX86_BUILTIN_VEC_INIT_V8QI,
@@ -14298,38 +16064,44 @@ enum ix86_builtins
   IX86_BUILTIN_VEC_SET_V8HI,
   IX86_BUILTIN_VEC_SET_V4HI,

-  /* SSE2 ABI functions.  */
-  IX86_BUILTIN_SSE2_ACOS,
-  IX86_BUILTIN_SSE2_ACOSF,
-  IX86_BUILTIN_SSE2_ASIN,
-  IX86_BUILTIN_SSE2_ASINF,
-  IX86_BUILTIN_SSE2_ATAN,
-  IX86_BUILTIN_SSE2_ATANF,
-  IX86_BUILTIN_SSE2_ATAN2,
-  IX86_BUILTIN_SSE2_ATAN2F,
-  IX86_BUILTIN_SSE2_COS,
-  IX86_BUILTIN_SSE2_COSF,
-  IX86_BUILTIN_SSE2_EXP,
-  IX86_BUILTIN_SSE2_EXPF,
-  IX86_BUILTIN_SSE2_LOG10,
-  IX86_BUILTIN_SSE2_LOG10F,
-  IX86_BUILTIN_SSE2_LOG,
-  IX86_BUILTIN_SSE2_LOGF,
-  IX86_BUILTIN_SSE2_SIN,
-  IX86_BUILTIN_SSE2_SINF,
-  IX86_BUILTIN_SSE2_TAN,
-  IX86_BUILTIN_SSE2_TANF,
-
   IX86_BUILTIN_MAX
 };

-#define def_builtin(MASK, NAME, TYPE, CODE)				\
-do {									\
-  if ((MASK) & target_flags						\
-      && (!((MASK) & MASK_64BIT) || TARGET_64BIT))			\
-    lang_hooks.builtin_function ((NAME), (TYPE), (CODE), BUILT_IN_MD,	\
-				 NULL, NULL_TREE);			\
-} while (0)
+/* Table for the ix86 builtin decls.  */
+static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
+
+/* Add an ix86 target builtin function with CODE, NAME and TYPE.  Do so
+ * if the target_flags include one of MASK.  Stores the function decl
+ * in the ix86_builtins array.
+ * Returns the function decl or NULL_TREE, if the builtin was not added.  */
+
+static inline tree
+def_builtin (int mask, const char *name, tree type, enum ix86_builtins code)
+{
+  tree decl = NULL_TREE;
+
+  if (mask & target_flags
+      && (!(mask & MASK_64BIT) || TARGET_64BIT))
+    {
+      decl = add_builtin_function (name, type, code, BUILT_IN_MD,
+				   NULL, NULL_TREE);
+      ix86_builtins[(int) code] = decl;
+    }
+
+  return decl;
+}
+
+/* Like def_builtin, but also marks the function decl "const".  */
+
+static inline tree
+def_builtin_const (int mask, const char *name, tree type,
+		   enum ix86_builtins code)
+{
+  tree decl = def_builtin (mask, name, type, code);
+  if (decl)
+    TREE_READONLY (decl) = 1;
+  return decl;
+}

 /* Bits for builtin_description.flag.
*/ @@ -14582,7 +16354,7 @@ static const struct builtin_description bdesc_2arg[] = { MASK_MMX, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, 0, 0 }, { MASK_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, 0, 0 }, - { MASK_SSE2, CODE_FOR_sse2_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, 0, 0 }, + { MASK_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, 0, 0 }, { MASK_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, 0, 0 }, { MASK_SSE2, CODE_FOR_sse2_nandv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, 0, 0 }, @@ -14617,7 +16389,7 @@ static const struct builtin_description bdesc_2arg[] = { MASK_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, 0, 0 }, { MASK_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, 0, 0 }, - { MASK_SSE2, CODE_FOR_sse2_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, 0, 0 }, + { MASK_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, 0, 0 }, { MASK_SSE2, CODE_FOR_sse2_psadbw, 0, IX86_BUILTIN_PSADBW128, 0, 0 }, { MASK_SSE2, CODE_FOR_sse2_umulsidi3, 0, IX86_BUILTIN_PMULUDQ, 0, 0 }, @@ -14647,7 +16419,33 @@ static const struct builtin_description bdesc_2arg[] = { MASK_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, 0, 0 }, { MASK_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, 0, 0 }, { MASK_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, 0, 0 }, - { MASK_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, 0, 0 } + { MASK_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, 0, 0 }, + + /* SSSE3 */ + { MASK_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, 0, 0 }, + { MASK_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, 0, 0 }, + { MASK_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, 0, 0 }, + { MASK_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, 0, 0 }, + { MASK_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, 0, 0 }, + { MASK_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, 0, 0 }, + { MASK_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, 0, 0 }, + { MASK_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, 0, 0 }, + { MASK_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, 0, 0 }, + { MASK_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, 0, 0 }, + { MASK_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, 0, 0 }, + { MASK_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, 0, 0 }, + { MASK_SSSE3, CODE_FOR_ssse3_pmaddubswv8hi3, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, 0, 0 }, + { MASK_SSSE3, CODE_FOR_ssse3_pmaddubswv4hi3, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, 0, 0 }, + { MASK_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, 0, 0 }, + { MASK_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, 0, 0 }, + { MASK_SSSE3, 
CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, 0, 0 }, + { MASK_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, 0, 0 }, + { MASK_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, 0, 0 }, + { MASK_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, 0, 0 }, + { MASK_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, 0, 0 }, + { MASK_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, 0, 0 }, + { MASK_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, 0, 0 }, + { MASK_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, 0, 0 } }; static const struct builtin_description bdesc_1arg[] = @@ -14692,8 +16490,16 @@ static const struct builtin_description bdesc_1arg[] = { MASK_SSE2, CODE_FOR_sse2_cvttps2dq, 0, IX86_BUILTIN_CVTTPS2DQ, 0, 0 }, /* SSE3 */ - { MASK_SSE3, CODE_FOR_sse3_movshdup, 0, IX86_BUILTIN_MOVSHDUP, 0, 0 }, - { MASK_SSE3, CODE_FOR_sse3_movsldup, 0, IX86_BUILTIN_MOVSLDUP, 0, 0 }, + { MASK_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, 0, 0 }, + { MASK_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, 0, 0 }, + + /* SSSE3 */ + { MASK_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, 0, 0 }, + { MASK_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, 0, 0 }, + { MASK_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, 0, 0 }, + { MASK_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, 0, 0 }, + { MASK_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, 0, 0 }, + { MASK_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, 0, 0 }, }; static void @@ -14701,8 +16507,6 @@ ix86_init_builtins (void) { if (TARGET_MMX) ix86_init_mmx_sse_builtins (); - if (TARGET_SSE2) - ix86_init_sse_abi_builtins (); } /* Set up all the MMX/SSE builtins. This is not called if TARGET_MMX @@ -14714,7 +16518,7 @@ ix86_init_mmx_sse_builtins (void) const struct builtin_description * d; size_t i; - tree V16QI_type_node = build_vector_type_for_mode (intQI_type_node, V16QImode); + tree V16QI_type_node = build_vector_type_for_mode (char_type_node, V16QImode); tree V2SI_type_node = build_vector_type_for_mode (intSI_type_node, V2SImode); tree V2SF_type_node = build_vector_type_for_mode (float_type_node, V2SFmode); tree V2DI_type_node @@ -14723,7 +16527,7 @@ ix86_init_mmx_sse_builtins (void) tree V4SF_type_node = build_vector_type_for_mode (float_type_node, V4SFmode); tree V4SI_type_node = build_vector_type_for_mode (intSI_type_node, V4SImode); tree V4HI_type_node = build_vector_type_for_mode (intHI_type_node, V4HImode); - tree V8QI_type_node = build_vector_type_for_mode (intQI_type_node, V8QImode); + tree V8QI_type_node = build_vector_type_for_mode (char_type_node, V8QImode); tree V8HI_type_node = build_vector_type_for_mode (intHI_type_node, V8HImode); tree pchar_type_node = build_pointer_type (char_type_node); @@ -14830,6 +16634,16 @@ ix86_init_mmx_sse_builtins (void) /* Normal vector unops. 
*/ tree v4sf_ftype_v4sf = build_function_type_list (V4SF_type_node, V4SF_type_node, NULL_TREE); + tree v16qi_ftype_v16qi + = build_function_type_list (V16QI_type_node, V16QI_type_node, NULL_TREE); + tree v8hi_ftype_v8hi + = build_function_type_list (V8HI_type_node, V8HI_type_node, NULL_TREE); + tree v4si_ftype_v4si + = build_function_type_list (V4SI_type_node, V4SI_type_node, NULL_TREE); + tree v8qi_ftype_v8qi + = build_function_type_list (V8QI_type_node, V8QI_type_node, NULL_TREE); + tree v4hi_ftype_v4hi + = build_function_type_list (V4HI_type_node, V4HI_type_node, NULL_TREE); /* Normal vector binops. */ tree v4sf_ftype_v4sf_v4sf @@ -14849,6 +16663,12 @@ ix86_init_mmx_sse_builtins (void) long_long_unsigned_type_node, long_long_unsigned_type_node, NULL_TREE); + tree di_ftype_di_di_int + = build_function_type_list (long_long_unsigned_type_node, + long_long_unsigned_type_node, + long_long_unsigned_type_node, + integer_type_node, NULL_TREE); + tree v2si_ftype_v2sf = build_function_type_list (V2SI_type_node, V2SF_type_node, NULL_TREE); tree v2sf_ftype_v2si @@ -14950,6 +16770,9 @@ ix86_init_mmx_sse_builtins (void) tree v2di_ftype_v2di_int = build_function_type_list (V2DI_type_node, V2DI_type_node, integer_type_node, NULL_TREE); + tree v2di_ftype_v2di_v2di_int + = build_function_type_list (V2DI_type_node, V2DI_type_node, + V2DI_type_node, integer_type_node, NULL_TREE); tree v4si_ftype_v4si_int = build_function_type_list (V4SI_type_node, V4SI_type_node, integer_type_node, NULL_TREE); @@ -14985,6 +16808,18 @@ ix86_init_mmx_sse_builtins (void) = build_function_type_list (void_type_node, pchar_type_node, V16QI_type_node, NULL_TREE); + tree v2di_ftype_v2di_unsigned_unsigned + = build_function_type_list (V2DI_type_node, V2DI_type_node, + unsigned_type_node, unsigned_type_node, + NULL_TREE); + tree v2di_ftype_v2di_v2di_unsigned_unsigned + = build_function_type_list (V2DI_type_node, V2DI_type_node, V2DI_type_node, + unsigned_type_node, unsigned_type_node, + NULL_TREE); + tree v2di_ftype_v2di_v16qi + = build_function_type_list (V2DI_type_node, V2DI_type_node, V16QI_type_node, + NULL_TREE); + tree float80_type; tree float128_type; tree ftype; @@ -15072,6 +16907,50 @@ ix86_init_mmx_sse_builtins (void) def_builtin (d->mask, d->name, type, d->code); } + /* Add all builtins that are more or less simple operations on 1 operand. */ + for (i = 0, d = bdesc_1arg; i < ARRAY_SIZE (bdesc_1arg); i++, d++) + { + enum machine_mode mode; + tree type; + + if (d->name == 0) + continue; + mode = insn_data[d->icode].operand[1].mode; + + switch (mode) + { + case V16QImode: + type = v16qi_ftype_v16qi; + break; + case V8HImode: + type = v8hi_ftype_v8hi; + break; + case V4SImode: + type = v4si_ftype_v4si; + break; + case V2DFmode: + type = v2df_ftype_v2df; + break; + case V4SFmode: + type = v4sf_ftype_v4sf; + break; + case V8QImode: + type = v8qi_ftype_v8qi; + break; + case V4HImode: + type = v4hi_ftype_v4hi; + break; + case V2SImode: + type = v2si_ftype_v2si; + break; + + default: + abort (); + } + + def_builtin (d->mask, d->name, type, d->code); + } + /* Add the remaining MMX insns with somewhat more complicated types. 
*/ def_builtin (MASK_MMX, "__builtin_ia32_emms", void_ftype_void, IX86_BUILTIN_EMMS); def_builtin (MASK_MMX, "__builtin_ia32_psllw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSLLW); @@ -15101,15 +16980,15 @@ ix86_init_mmx_sse_builtins (void) def_builtin (MASK_SSE, "__builtin_ia32_ldmxcsr", void_ftype_unsigned, IX86_BUILTIN_LDMXCSR); def_builtin (MASK_SSE, "__builtin_ia32_stmxcsr", unsigned_ftype_void, IX86_BUILTIN_STMXCSR); - def_builtin (MASK_SSE, "__builtin_ia32_cvtpi2ps", v4sf_ftype_v4sf_v2si, IX86_BUILTIN_CVTPI2PS); - def_builtin (MASK_SSE, "__builtin_ia32_cvtps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTPS2PI); - def_builtin (MASK_SSE, "__builtin_ia32_cvtsi2ss", v4sf_ftype_v4sf_int, IX86_BUILTIN_CVTSI2SS); - def_builtin (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvtsi642ss", v4sf_ftype_v4sf_int64, IX86_BUILTIN_CVTSI642SS); - def_builtin (MASK_SSE, "__builtin_ia32_cvtss2si", int_ftype_v4sf, IX86_BUILTIN_CVTSS2SI); - def_builtin (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvtss2si64", int64_ftype_v4sf, IX86_BUILTIN_CVTSS2SI64); - def_builtin (MASK_SSE, "__builtin_ia32_cvttps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTTPS2PI); - def_builtin (MASK_SSE, "__builtin_ia32_cvttss2si", int_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI); - def_builtin (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvttss2si64", int64_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI64); + def_builtin_const (MASK_SSE, "__builtin_ia32_cvtpi2ps", v4sf_ftype_v4sf_v2si, IX86_BUILTIN_CVTPI2PS); + def_builtin_const (MASK_SSE, "__builtin_ia32_cvtps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTPS2PI); + def_builtin_const (MASK_SSE, "__builtin_ia32_cvtsi2ss", v4sf_ftype_v4sf_int, IX86_BUILTIN_CVTSI2SS); + def_builtin_const (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvtsi642ss", v4sf_ftype_v4sf_int64, IX86_BUILTIN_CVTSI642SS); + def_builtin_const (MASK_SSE, "__builtin_ia32_cvtss2si", int_ftype_v4sf, IX86_BUILTIN_CVTSS2SI); + def_builtin_const (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvtss2si64", int64_ftype_v4sf, IX86_BUILTIN_CVTSS2SI64); + def_builtin_const (MASK_SSE, "__builtin_ia32_cvttps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTTPS2PI); + def_builtin_const (MASK_SSE, "__builtin_ia32_cvttss2si", int_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI); + def_builtin_const (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvttss2si64", int64_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI64); def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_maskmovq", void_ftype_v8qi_v8qi_pchar, IX86_BUILTIN_MASKMOVQ); @@ -15134,8 +17013,8 @@ ix86_init_mmx_sse_builtins (void) def_builtin (MASK_SSE, "__builtin_ia32_rcpss", v4sf_ftype_v4sf, IX86_BUILTIN_RCPSS); def_builtin (MASK_SSE, "__builtin_ia32_rsqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTPS); def_builtin (MASK_SSE, "__builtin_ia32_rsqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTSS); - def_builtin (MASK_SSE, "__builtin_ia32_sqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTPS); - def_builtin (MASK_SSE, "__builtin_ia32_sqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTSS); + def_builtin_const (MASK_SSE, "__builtin_ia32_sqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTPS); + def_builtin_const (MASK_SSE, "__builtin_ia32_sqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTSS); def_builtin (MASK_SSE, "__builtin_ia32_shufps", v4sf_ftype_v4sf_v4sf_int, IX86_BUILTIN_SHUFPS); @@ -15189,35 +17068,35 @@ ix86_init_mmx_sse_builtins (void) def_builtin (MASK_SSE2, "__builtin_ia32_pshufhw", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSHUFHW); def_builtin (MASK_SSE2, "__builtin_ia32_psadbw128", v2di_ftype_v16qi_v16qi, IX86_BUILTIN_PSADBW128); - def_builtin (MASK_SSE2, "__builtin_ia32_sqrtpd", v2df_ftype_v2df, IX86_BUILTIN_SQRTPD); - def_builtin 
(MASK_SSE2, "__builtin_ia32_sqrtsd", v2df_ftype_v2df, IX86_BUILTIN_SQRTSD); + def_builtin_const (MASK_SSE2, "__builtin_ia32_sqrtpd", v2df_ftype_v2df, IX86_BUILTIN_SQRTPD); + def_builtin_const (MASK_SSE2, "__builtin_ia32_sqrtsd", v2df_ftype_v2df, IX86_BUILTIN_SQRTSD); def_builtin (MASK_SSE2, "__builtin_ia32_shufpd", v2df_ftype_v2df_v2df_int, IX86_BUILTIN_SHUFPD); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtdq2pd", v2df_ftype_v4si, IX86_BUILTIN_CVTDQ2PD); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtdq2ps", v4sf_ftype_v4si, IX86_BUILTIN_CVTDQ2PS); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtdq2pd", v2df_ftype_v4si, IX86_BUILTIN_CVTDQ2PD); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtdq2ps", v4sf_ftype_v4si, IX86_BUILTIN_CVTDQ2PS); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtpd2dq", v4si_ftype_v2df, IX86_BUILTIN_CVTPD2DQ); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtpd2pi", v2si_ftype_v2df, IX86_BUILTIN_CVTPD2PI); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtpd2ps", v4sf_ftype_v2df, IX86_BUILTIN_CVTPD2PS); - def_builtin (MASK_SSE2, "__builtin_ia32_cvttpd2dq", v4si_ftype_v2df, IX86_BUILTIN_CVTTPD2DQ); - def_builtin (MASK_SSE2, "__builtin_ia32_cvttpd2pi", v2si_ftype_v2df, IX86_BUILTIN_CVTTPD2PI); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpd2dq", v4si_ftype_v2df, IX86_BUILTIN_CVTPD2DQ); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpd2pi", v2si_ftype_v2df, IX86_BUILTIN_CVTPD2PI); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpd2ps", v4sf_ftype_v2df, IX86_BUILTIN_CVTPD2PS); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttpd2dq", v4si_ftype_v2df, IX86_BUILTIN_CVTTPD2DQ); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttpd2pi", v2si_ftype_v2df, IX86_BUILTIN_CVTTPD2PI); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtpi2pd", v2df_ftype_v2si, IX86_BUILTIN_CVTPI2PD); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpi2pd", v2df_ftype_v2si, IX86_BUILTIN_CVTPI2PD); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtsd2si", int_ftype_v2df, IX86_BUILTIN_CVTSD2SI); - def_builtin (MASK_SSE2, "__builtin_ia32_cvttsd2si", int_ftype_v2df, IX86_BUILTIN_CVTTSD2SI); - def_builtin (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvtsd2si64", int64_ftype_v2df, IX86_BUILTIN_CVTSD2SI64); - def_builtin (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvttsd2si64", int64_ftype_v2df, IX86_BUILTIN_CVTTSD2SI64); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtsd2si", int_ftype_v2df, IX86_BUILTIN_CVTSD2SI); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttsd2si", int_ftype_v2df, IX86_BUILTIN_CVTTSD2SI); + def_builtin_const (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvtsd2si64", int64_ftype_v2df, IX86_BUILTIN_CVTSD2SI64); + def_builtin_const (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvttsd2si64", int64_ftype_v2df, IX86_BUILTIN_CVTTSD2SI64); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtps2dq", v4si_ftype_v4sf, IX86_BUILTIN_CVTPS2DQ); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtps2pd", v2df_ftype_v4sf, IX86_BUILTIN_CVTPS2PD); - def_builtin (MASK_SSE2, "__builtin_ia32_cvttps2dq", v4si_ftype_v4sf, IX86_BUILTIN_CVTTPS2DQ); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtps2dq", v4si_ftype_v4sf, IX86_BUILTIN_CVTPS2DQ); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtps2pd", v2df_ftype_v4sf, IX86_BUILTIN_CVTPS2PD); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttps2dq", v4si_ftype_v4sf, IX86_BUILTIN_CVTTPS2DQ); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtsi2sd", v2df_ftype_v2df_int, IX86_BUILTIN_CVTSI2SD); - def_builtin (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvtsi642sd", v2df_ftype_v2df_int64, 
IX86_BUILTIN_CVTSI642SD); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtsd2ss", v4sf_ftype_v4sf_v2df, IX86_BUILTIN_CVTSD2SS); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtss2sd", v2df_ftype_v2df_v4sf, IX86_BUILTIN_CVTSS2SD); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtsi2sd", v2df_ftype_v2df_int, IX86_BUILTIN_CVTSI2SD); + def_builtin_const (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvtsi642sd", v2df_ftype_v2df_int64, IX86_BUILTIN_CVTSI642SD); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtsd2ss", v4sf_ftype_v4sf_v2df, IX86_BUILTIN_CVTSD2SS); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtss2sd", v2df_ftype_v2df_v4sf, IX86_BUILTIN_CVTSS2SD); def_builtin (MASK_SSE2, "__builtin_ia32_clflush", void_ftype_pcvoid, IX86_BUILTIN_CLFLUSH); def_builtin (MASK_SSE2, "__builtin_ia32_lfence", void_ftype_void, IX86_BUILTIN_LFENCE); @@ -15262,15 +17141,29 @@ ix86_init_mmx_sse_builtins (void) def_builtin (MASK_SSE3, "__builtin_ia32_mwait", void_ftype_unsigned_unsigned, IX86_BUILTIN_MWAIT); - def_builtin (MASK_SSE3, "__builtin_ia32_movshdup", - v4sf_ftype_v4sf, - IX86_BUILTIN_MOVSHDUP); - def_builtin (MASK_SSE3, "__builtin_ia32_movsldup", - v4sf_ftype_v4sf, - IX86_BUILTIN_MOVSLDUP); def_builtin (MASK_SSE3, "__builtin_ia32_lddqu", v16qi_ftype_pcchar, IX86_BUILTIN_LDDQU); + /* SSSE3. */ + def_builtin (MASK_SSSE3, "__builtin_ia32_palignr128", + v2di_ftype_v2di_v2di_int, IX86_BUILTIN_PALIGNR128); + def_builtin (MASK_SSSE3, "__builtin_ia32_palignr", di_ftype_di_di_int, + IX86_BUILTIN_PALIGNR); + + /* AMDFAM10 SSE4A New built-ins */ + def_builtin (MASK_SSE4A, "__builtin_ia32_movntsd", + void_ftype_pdouble_v2df, IX86_BUILTIN_MOVNTSD); + def_builtin (MASK_SSE4A, "__builtin_ia32_movntss", + void_ftype_pfloat_v4sf, IX86_BUILTIN_MOVNTSS); + def_builtin (MASK_SSE4A, "__builtin_ia32_extrqi", + v2di_ftype_v2di_unsigned_unsigned, IX86_BUILTIN_EXTRQI); + def_builtin (MASK_SSE4A, "__builtin_ia32_extrq", + v2di_ftype_v2di_v16qi, IX86_BUILTIN_EXTRQ); + def_builtin (MASK_SSE4A, "__builtin_ia32_insertqi", + v2di_ftype_v2di_v2di_unsigned_unsigned, IX86_BUILTIN_INSERTQI); + def_builtin (MASK_SSE4A, "__builtin_ia32_insertq", + v2di_ftype_v2di_v2di, IX86_BUILTIN_INSERTQ); + /* Access to the vec_init patterns. */ ftype = build_function_type_list (V2SI_type_node, integer_type_node, integer_type_node, NULL_TREE); @@ -15335,59 +17228,13 @@ ix86_init_mmx_sse_builtins (void) integer_type_node, NULL_TREE); def_builtin (MASK_SSE, "__builtin_ia32_vec_set_v8hi", ftype, IX86_BUILTIN_VEC_SET_V8HI); - + ftype = build_function_type_list (V4HI_type_node, V4HI_type_node, intHI_type_node, integer_type_node, NULL_TREE); def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_vec_set_v4hi", ftype, IX86_BUILTIN_VEC_SET_V4HI); } -#undef def_builtin - -/* Set up all the SSE ABI builtins that we may use to override - the normal builtins. */ -static void -ix86_init_sse_abi_builtins (void) -{ - tree flt, flt2; - - /* Bail out in case the template definitions are not available. */ - if (! built_in_decls [BUILT_IN_SIN] - || ! built_in_decls [BUILT_IN_SINF] - || ! built_in_decls [BUILT_IN_ATAN2] - || ! built_in_decls [BUILT_IN_ATAN2F]) - return; - - /* Build the function types as variants of the existing ones. 
*/ - flt = build_variant_type_copy (TREE_TYPE (built_in_decls [BUILT_IN_SINF])); - TYPE_ATTRIBUTES (flt) - = tree_cons (get_identifier ("sseregparm"), - NULL_TREE, TYPE_ATTRIBUTES (flt)); - flt2 = build_variant_type_copy (TREE_TYPE (built_in_decls [BUILT_IN_ATAN2F])); - TYPE_ATTRIBUTES (flt2) - = tree_cons (get_identifier ("sseregparm"), - NULL_TREE, TYPE_ATTRIBUTES (flt2)); - -#define def_builtin(capname, name, type) \ - ix86_builtin_function_variants [BUILT_IN_ ## capname] \ - = lang_hooks.builtin_function ("__builtin_sse2_" # name, type, \ - IX86_BUILTIN_SSE2_ ## capname, \ - BUILT_IN_NORMAL, \ - "__libm_sse2_" # name, NULL_TREE) - - def_builtin (ACOSF, acosf, flt); - def_builtin (ASINF, asinf, flt); - def_builtin (ATANF, atanf, flt); - def_builtin (ATAN2F, atan2f, flt2); - def_builtin (COSF, cosf, flt); - def_builtin (EXPF, expf, flt); - def_builtin (LOG10F, log10f, flt); - def_builtin (LOGF, logf, flt); - def_builtin (SINF, sinf, flt); - def_builtin (TANF, tanf, flt); - -#undef def_builtin -} /* Errors in the source file can cause expand_expr to return const0_rtx where we expect a vector. To avoid crashing, use one of the vector @@ -15403,11 +17250,11 @@ safe_vector_operand (rtx x, enum machine_mode mode) /* Subroutine of ix86_expand_builtin to take care of binop insns. */ static rtx -ix86_expand_binop_builtin (enum insn_code icode, tree arglist, rtx target) +ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target) { rtx pat, xops[3]; - tree arg0 = TREE_VALUE (arglist); - tree arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); rtx op0 = expand_normal (arg0); rtx op1 = expand_normal (arg1); enum machine_mode tmode = insn_data[icode].operand[0].mode; @@ -15471,11 +17318,11 @@ ix86_expand_binop_builtin (enum insn_code icode, tree arglist, rtx target) /* Subroutine of ix86_expand_builtin to take care of stores. */ static rtx -ix86_expand_store_builtin (enum insn_code icode, tree arglist) +ix86_expand_store_builtin (enum insn_code icode, tree exp) { rtx pat; - tree arg0 = TREE_VALUE (arglist); - tree arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); rtx op0 = expand_normal (arg0); rtx op1 = expand_normal (arg1); enum machine_mode mode0 = insn_data[icode].operand[0].mode; @@ -15496,11 +17343,11 @@ ix86_expand_store_builtin (enum insn_code icode, tree arglist) /* Subroutine of ix86_expand_builtin to take care of unop insns. */ static rtx -ix86_expand_unop_builtin (enum insn_code icode, tree arglist, +ix86_expand_unop_builtin (enum insn_code icode, tree exp, rtx target, int do_load) { rtx pat; - tree arg0 = TREE_VALUE (arglist); + tree arg0 = CALL_EXPR_ARG (exp, 0); rtx op0 = expand_normal (arg0); enum machine_mode tmode = insn_data[icode].operand[0].mode; enum machine_mode mode0 = insn_data[icode].operand[1].mode; @@ -15532,10 +17379,10 @@ ix86_expand_unop_builtin (enum insn_code icode, tree arglist, sqrtss, rsqrtss, rcpss. 
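
The mechanical change running through all of these expanders, TREE_VALUE/TREE_CHAIN walks becoming CALL_EXPR_ARG (exp, N), tracks the new CALL_EXPR representation: arguments live in a flat operand vector of the call node rather than in a chained TREE_LIST. In miniature, with deliberately simplified stand-in types (nothing below is GCC's real API):

    #include <stddef.h>

    /* Old style: the argument list as a singly linked cons list.  */
    struct cons { void *value; struct cons *chain; };

    static void *
    nth_old (struct cons *arglist, size_t n)
    {
      while (n--)
        arglist = arglist->chain;   /* TREE_CHAIN */
      return arglist->value;        /* TREE_VALUE */
    }

    /* New style: the arguments as a flat array inside the call node,
       which is what CALL_EXPR_ARG (exp, N) indexes into directly.  */
    struct call { size_t nargs; void *args[8]; };

    static void *
    nth_new (const struct call *exp, size_t n)
    {
      return exp->args[n];          /* CALL_EXPR_ARG */
    }
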
*/ static rtx -ix86_expand_unop1_builtin (enum insn_code icode, tree arglist, rtx target) +ix86_expand_unop1_builtin (enum insn_code icode, tree exp, rtx target) { rtx pat; - tree arg0 = TREE_VALUE (arglist); + tree arg0 = CALL_EXPR_ARG (exp, 0); rtx op1, op0 = expand_normal (arg0); enum machine_mode tmode = insn_data[icode].operand[0].mode; enum machine_mode mode0 = insn_data[icode].operand[1].mode; @@ -15566,12 +17413,12 @@ ix86_expand_unop1_builtin (enum insn_code icode, tree arglist, rtx target) /* Subroutine of ix86_expand_builtin to take care of comparison insns. */ static rtx -ix86_expand_sse_compare (const struct builtin_description *d, tree arglist, +ix86_expand_sse_compare (const struct builtin_description *d, tree exp, rtx target) { rtx pat; - tree arg0 = TREE_VALUE (arglist); - tree arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); rtx op0 = expand_normal (arg0); rtx op1 = expand_normal (arg1); rtx op2; @@ -15618,12 +17465,12 @@ ix86_expand_sse_compare (const struct builtin_description *d, tree arglist, /* Subroutine of ix86_expand_builtin to take care of comi insns. */ static rtx -ix86_expand_sse_comi (const struct builtin_description *d, tree arglist, +ix86_expand_sse_comi (const struct builtin_description *d, tree exp, rtx target) { rtx pat; - tree arg0 = TREE_VALUE (arglist); - tree arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); rtx op0 = expand_normal (arg0); rtx op1 = expand_normal (arg1); rtx op2; @@ -15694,11 +17541,11 @@ get_element_number (tree vec_type, tree arg) instructions from inside the compiler, we can't allow the use of MMX registers unless the user explicitly asks for it. So we do *not* define vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead - we have builtins invoked by mmintrin.h that gives us license to emit + we have builtins invoked by mmintrin.h that gives us license to emit these sorts of instructions. */ static rtx -ix86_expand_vec_init_builtin (tree type, tree arglist, rtx target) +ix86_expand_vec_init_builtin (tree type, tree exp, rtx target) { enum machine_mode tmode = TYPE_MODE (type); enum machine_mode inner_mode = GET_MODE_INNER (tmode); @@ -15706,15 +17553,14 @@ ix86_expand_vec_init_builtin (tree type, tree arglist, rtx target) rtvec v = rtvec_alloc (n_elt); gcc_assert (VECTOR_MODE_P (tmode)); + gcc_assert (call_expr_nargs (exp) == n_elt); - for (i = 0; i < n_elt; ++i, arglist = TREE_CHAIN (arglist)) + for (i = 0; i < n_elt; ++i) { - rtx x = expand_normal (TREE_VALUE (arglist)); + rtx x = expand_normal (CALL_EXPR_ARG (exp, i)); RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x); } - gcc_assert (arglist == NULL); - if (!target || !register_operand (target, tmode)) target = gen_reg_rtx (tmode); @@ -15727,15 +17573,15 @@ ix86_expand_vec_init_builtin (tree type, tree arglist, rtx target) had a language-level syntax for referencing vector elements. */ static rtx -ix86_expand_vec_ext_builtin (tree arglist, rtx target) +ix86_expand_vec_ext_builtin (tree exp, rtx target) { enum machine_mode tmode, mode0; tree arg0, arg1; int elt; rtx op0; - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); op0 = expand_normal (arg0); elt = get_element_number (TREE_TYPE (arg0), arg1); @@ -15759,16 +17605,16 @@ ix86_expand_vec_ext_builtin (tree arglist, rtx target) a language-level syntax for referencing vector elements. 
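
The vec_init/vec_ext/vec_set expanders above are what the header-level element accessors are built from; for instance, the V8HImode vec_ext builtin is what _mm_extract_epi16 expands to in this era of the headers. A small usage sketch (assumes -msse2):

    #include <emmintrin.h>      /* SSE2 */

    /* Read 16-bit lane 3; with a constant index this becomes PEXTRW.  */
    static int
    third_word (__m128i v)
    {
      return _mm_extract_epi16 (v, 3);
    }
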
*/ static rtx -ix86_expand_vec_set_builtin (tree arglist) +ix86_expand_vec_set_builtin (tree exp) { enum machine_mode tmode, mode1; tree arg0, arg1, arg2; int elt; rtx op0, op1; - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); - arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist))); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); tmode = TYPE_MODE (TREE_TYPE (arg0)); mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); @@ -15803,11 +17649,10 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, const struct builtin_description *d; size_t i; enum insn_code icode; - tree fndecl = TREE_OPERAND (TREE_OPERAND (exp, 0), 0); - tree arglist = TREE_OPERAND (exp, 1); - tree arg0, arg1, arg2; - rtx op0, op1, op2, pat; - enum machine_mode tmode, mode0, mode1, mode2; + tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); + tree arg0, arg1, arg2, arg3; + rtx op0, op1, op2, op3, pat; + enum machine_mode tmode, mode0, mode1, mode2, mode3, mode4; unsigned int fcode = DECL_FUNCTION_CODE (fndecl); switch (fcode) @@ -15826,9 +17671,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, ? CODE_FOR_mmx_maskmovq : CODE_FOR_sse2_maskmovdqu); /* Note the arg order is different from the operand order. */ - arg1 = TREE_VALUE (arglist); - arg2 = TREE_VALUE (TREE_CHAIN (arglist)); - arg0 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist))); + arg1 = CALL_EXPR_ARG (exp, 0); + arg2 = CALL_EXPR_ARG (exp, 1); + arg0 = CALL_EXPR_ARG (exp, 2); op0 = expand_normal (arg0); op1 = expand_normal (arg1); op2 = expand_normal (arg2); @@ -15852,17 +17697,17 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, return 0; case IX86_BUILTIN_SQRTSS: - return ix86_expand_unop1_builtin (CODE_FOR_sse_vmsqrtv4sf2, arglist, target); + return ix86_expand_unop1_builtin (CODE_FOR_sse_vmsqrtv4sf2, exp, target); case IX86_BUILTIN_RSQRTSS: - return ix86_expand_unop1_builtin (CODE_FOR_sse_vmrsqrtv4sf2, arglist, target); + return ix86_expand_unop1_builtin (CODE_FOR_sse_vmrsqrtv4sf2, exp, target); case IX86_BUILTIN_RCPSS: - return ix86_expand_unop1_builtin (CODE_FOR_sse_vmrcpv4sf2, arglist, target); + return ix86_expand_unop1_builtin (CODE_FOR_sse_vmrcpv4sf2, exp, target); case IX86_BUILTIN_LOADUPS: - return ix86_expand_unop_builtin (CODE_FOR_sse_movups, arglist, target, 1); + return ix86_expand_unop_builtin (CODE_FOR_sse_movups, exp, target, 1); case IX86_BUILTIN_STOREUPS: - return ix86_expand_store_builtin (CODE_FOR_sse_movups, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse_movups, exp); case IX86_BUILTIN_LOADHPS: case IX86_BUILTIN_LOADLPS: @@ -15872,8 +17717,8 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, : fcode == IX86_BUILTIN_LOADLPS ? CODE_FOR_sse_loadlps : fcode == IX86_BUILTIN_LOADHPD ? CODE_FOR_sse2_loadhpd : CODE_FOR_sse2_loadlpd); - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); op0 = expand_normal (arg0); op1 = expand_normal (arg1); tmode = insn_data[icode].operand[0].mode; @@ -15896,8 +17741,8 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, case IX86_BUILTIN_STORELPS: icode = (fcode == IX86_BUILTIN_STOREHPS ? 
CODE_FOR_sse_storehps : CODE_FOR_sse_storelps); - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); op0 = expand_normal (arg0); op1 = expand_normal (arg1); mode0 = insn_data[icode].operand[0].mode; @@ -15913,12 +17758,12 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, return const0_rtx; case IX86_BUILTIN_MOVNTPS: - return ix86_expand_store_builtin (CODE_FOR_sse_movntv4sf, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse_movntv4sf, exp); case IX86_BUILTIN_MOVNTQ: - return ix86_expand_store_builtin (CODE_FOR_sse_movntdi, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse_movntdi, exp); case IX86_BUILTIN_LDMXCSR: - op0 = expand_normal (TREE_VALUE (arglist)); + op0 = expand_normal (CALL_EXPR_ARG (exp, 0)); target = assign_386_stack_local (SImode, SLOT_TEMP); emit_move_insn (target, op0); emit_insn (gen_sse_ldmxcsr (target)); @@ -15934,9 +17779,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, icode = (fcode == IX86_BUILTIN_SHUFPS ? CODE_FOR_sse_shufps : CODE_FOR_sse2_shufpd); - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); - arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist))); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); op0 = expand_normal (arg0); op1 = expand_normal (arg1); op2 = expand_normal (arg2); @@ -15974,8 +17819,8 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, : fcode == IX86_BUILTIN_PSHUFLW ? CODE_FOR_sse2_pshuflw : fcode == IX86_BUILTIN_PSHUFD ? CODE_FOR_sse2_pshufd : CODE_FOR_mmx_pshufw); - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); op0 = expand_normal (arg0); op1 = expand_normal (arg1); tmode = insn_data[icode].operand[0].mode; @@ -16004,8 +17849,8 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, case IX86_BUILTIN_PSRLDQI128: icode = ( fcode == IX86_BUILTIN_PSLLDQI128 ? 
CODE_FOR_sse2_ashlti3 : CODE_FOR_sse2_lshrti3); - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); op0 = expand_normal (arg0); op1 = expand_normal (arg1); tmode = insn_data[icode].operand[0].mode; @@ -16034,86 +17879,86 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, return NULL_RTX; case IX86_BUILTIN_PAVGUSB: - return ix86_expand_binop_builtin (CODE_FOR_mmx_uavgv8qi3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_uavgv8qi3, exp, target); case IX86_BUILTIN_PF2ID: - return ix86_expand_unop_builtin (CODE_FOR_mmx_pf2id, arglist, target, 0); + return ix86_expand_unop_builtin (CODE_FOR_mmx_pf2id, exp, target, 0); case IX86_BUILTIN_PFACC: - return ix86_expand_binop_builtin (CODE_FOR_mmx_haddv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_haddv2sf3, exp, target); case IX86_BUILTIN_PFADD: - return ix86_expand_binop_builtin (CODE_FOR_mmx_addv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_addv2sf3, exp, target); case IX86_BUILTIN_PFCMPEQ: - return ix86_expand_binop_builtin (CODE_FOR_mmx_eqv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_eqv2sf3, exp, target); case IX86_BUILTIN_PFCMPGE: - return ix86_expand_binop_builtin (CODE_FOR_mmx_gev2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_gev2sf3, exp, target); case IX86_BUILTIN_PFCMPGT: - return ix86_expand_binop_builtin (CODE_FOR_mmx_gtv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_gtv2sf3, exp, target); case IX86_BUILTIN_PFMAX: - return ix86_expand_binop_builtin (CODE_FOR_mmx_smaxv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_smaxv2sf3, exp, target); case IX86_BUILTIN_PFMIN: - return ix86_expand_binop_builtin (CODE_FOR_mmx_sminv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_sminv2sf3, exp, target); case IX86_BUILTIN_PFMUL: - return ix86_expand_binop_builtin (CODE_FOR_mmx_mulv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_mulv2sf3, exp, target); case IX86_BUILTIN_PFRCP: - return ix86_expand_unop_builtin (CODE_FOR_mmx_rcpv2sf2, arglist, target, 0); + return ix86_expand_unop_builtin (CODE_FOR_mmx_rcpv2sf2, exp, target, 0); case IX86_BUILTIN_PFRCPIT1: - return ix86_expand_binop_builtin (CODE_FOR_mmx_rcpit1v2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_rcpit1v2sf3, exp, target); case IX86_BUILTIN_PFRCPIT2: - return ix86_expand_binop_builtin (CODE_FOR_mmx_rcpit2v2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_rcpit2v2sf3, exp, target); case IX86_BUILTIN_PFRSQIT1: - return ix86_expand_binop_builtin (CODE_FOR_mmx_rsqit1v2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_rsqit1v2sf3, exp, target); case IX86_BUILTIN_PFRSQRT: - return ix86_expand_unop_builtin (CODE_FOR_mmx_rsqrtv2sf2, arglist, target, 0); + return ix86_expand_unop_builtin (CODE_FOR_mmx_rsqrtv2sf2, exp, target, 0); case IX86_BUILTIN_PFSUB: - return ix86_expand_binop_builtin (CODE_FOR_mmx_subv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_subv2sf3, exp, target); case IX86_BUILTIN_PFSUBR: - return ix86_expand_binop_builtin (CODE_FOR_mmx_subrv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_subrv2sf3, exp, target); case IX86_BUILTIN_PI2FD: - return ix86_expand_unop_builtin (CODE_FOR_mmx_floatv2si2, arglist, target, 0); + return 
ix86_expand_unop_builtin (CODE_FOR_mmx_floatv2si2, exp, target, 0); case IX86_BUILTIN_PMULHRW: - return ix86_expand_binop_builtin (CODE_FOR_mmx_pmulhrwv4hi3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_pmulhrwv4hi3, exp, target); case IX86_BUILTIN_PF2IW: - return ix86_expand_unop_builtin (CODE_FOR_mmx_pf2iw, arglist, target, 0); + return ix86_expand_unop_builtin (CODE_FOR_mmx_pf2iw, exp, target, 0); case IX86_BUILTIN_PFNACC: - return ix86_expand_binop_builtin (CODE_FOR_mmx_hsubv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_hsubv2sf3, exp, target); case IX86_BUILTIN_PFPNACC: - return ix86_expand_binop_builtin (CODE_FOR_mmx_addsubv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_addsubv2sf3, exp, target); case IX86_BUILTIN_PI2FW: - return ix86_expand_unop_builtin (CODE_FOR_mmx_pi2fw, arglist, target, 0); + return ix86_expand_unop_builtin (CODE_FOR_mmx_pi2fw, exp, target, 0); case IX86_BUILTIN_PSWAPDSI: - return ix86_expand_unop_builtin (CODE_FOR_mmx_pswapdv2si2, arglist, target, 0); + return ix86_expand_unop_builtin (CODE_FOR_mmx_pswapdv2si2, exp, target, 0); case IX86_BUILTIN_PSWAPDSF: - return ix86_expand_unop_builtin (CODE_FOR_mmx_pswapdv2sf2, arglist, target, 0); + return ix86_expand_unop_builtin (CODE_FOR_mmx_pswapdv2sf2, exp, target, 0); case IX86_BUILTIN_SQRTSD: - return ix86_expand_unop1_builtin (CODE_FOR_sse2_vmsqrtv2df2, arglist, target); + return ix86_expand_unop1_builtin (CODE_FOR_sse2_vmsqrtv2df2, exp, target); case IX86_BUILTIN_LOADUPD: - return ix86_expand_unop_builtin (CODE_FOR_sse2_movupd, arglist, target, 1); + return ix86_expand_unop_builtin (CODE_FOR_sse2_movupd, exp, target, 1); case IX86_BUILTIN_STOREUPD: - return ix86_expand_store_builtin (CODE_FOR_sse2_movupd, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse2_movupd, exp); case IX86_BUILTIN_MFENCE: emit_insn (gen_sse2_mfence ()); @@ -16123,7 +17968,7 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, return 0; case IX86_BUILTIN_CLFLUSH: - arg0 = TREE_VALUE (arglist); + arg0 = CALL_EXPR_ARG (exp, 0); op0 = expand_normal (arg0); icode = CODE_FOR_sse2_clflush; if (! 
(*insn_data[icode].operand[0].predicate) (op0, Pmode)) @@ -16133,53 +17978,210 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, return 0; case IX86_BUILTIN_MOVNTPD: - return ix86_expand_store_builtin (CODE_FOR_sse2_movntv2df, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse2_movntv2df, exp); case IX86_BUILTIN_MOVNTDQ: - return ix86_expand_store_builtin (CODE_FOR_sse2_movntv2di, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse2_movntv2di, exp); case IX86_BUILTIN_MOVNTI: - return ix86_expand_store_builtin (CODE_FOR_sse2_movntsi, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse2_movntsi, exp); case IX86_BUILTIN_LOADDQU: - return ix86_expand_unop_builtin (CODE_FOR_sse2_movdqu, arglist, target, 1); + return ix86_expand_unop_builtin (CODE_FOR_sse2_movdqu, exp, target, 1); case IX86_BUILTIN_STOREDQU: - return ix86_expand_store_builtin (CODE_FOR_sse2_movdqu, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse2_movdqu, exp); case IX86_BUILTIN_MONITOR: - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); - arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist))); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); op0 = expand_normal (arg0); op1 = expand_normal (arg1); op2 = expand_normal (arg2); if (!REG_P (op0)) - op0 = copy_to_mode_reg (SImode, op0); + op0 = copy_to_mode_reg (Pmode, op0); if (!REG_P (op1)) op1 = copy_to_mode_reg (SImode, op1); if (!REG_P (op2)) op2 = copy_to_mode_reg (SImode, op2); - emit_insn (gen_sse3_monitor (op0, op1, op2)); + if (!TARGET_64BIT) + emit_insn (gen_sse3_monitor (op0, op1, op2)); + else + emit_insn (gen_sse3_monitor64 (op0, op1, op2)); return 0; - case IX86_BUILTIN_MWAIT: - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + case IX86_BUILTIN_MWAIT: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + if (!REG_P (op0)) + op0 = copy_to_mode_reg (SImode, op0); + if (!REG_P (op1)) + op1 = copy_to_mode_reg (SImode, op1); + emit_insn (gen_sse3_mwait (op0, op1)); + return 0; + + case IX86_BUILTIN_LDDQU: + return ix86_expand_unop_builtin (CODE_FOR_sse3_lddqu, exp, + target, 1); + + case IX86_BUILTIN_PALIGNR: + case IX86_BUILTIN_PALIGNR128: + if (fcode == IX86_BUILTIN_PALIGNR) + { + icode = CODE_FOR_ssse3_palignrdi; + mode = DImode; + } + else + { + icode = CODE_FOR_ssse3_palignrti; + mode = V2DImode; + } + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + op0 = expand_expr (arg0, NULL_RTX, VOIDmode, 0); + op1 = expand_expr (arg1, NULL_RTX, VOIDmode, 0); + op2 = expand_expr (arg2, NULL_RTX, VOIDmode, 0); + tmode = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[2].mode; + mode3 = insn_data[icode].operand[3].mode; + + if (! (*insn_data[icode].operand[1].predicate) (op0, mode1)) + { + op0 = copy_to_reg (op0); + op0 = simplify_gen_subreg (mode1, op0, GET_MODE (op0), 0); + } + if (! (*insn_data[icode].operand[2].predicate) (op1, mode2)) + { + op1 = copy_to_reg (op1); + op1 = simplify_gen_subreg (mode2, op1, GET_MODE (op1), 0); + } + if (! (*insn_data[icode].operand[3].predicate) (op2, mode3)) + { + error ("shift must be an immediate"); + return const0_rtx; + } + target = gen_reg_rtx (mode); + pat = GEN_FCN (icode) (simplify_gen_subreg (tmode, target, mode, 0), + op0, op1, op2); + if (! 
pat) + return 0; + emit_insn (pat); + return target; + + case IX86_BUILTIN_MOVNTSD: + return ix86_expand_store_builtin (CODE_FOR_sse4a_vmmovntv2df, exp); + + case IX86_BUILTIN_MOVNTSS: + return ix86_expand_store_builtin (CODE_FOR_sse4a_vmmovntv4sf, exp); + + case IX86_BUILTIN_INSERTQ: + case IX86_BUILTIN_EXTRQ: + icode = (fcode == IX86_BUILTIN_EXTRQ + ? CODE_FOR_sse4a_extrq + : CODE_FOR_sse4a_insertq); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + tmode = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[2].mode; + if (! (*insn_data[icode].operand[1].predicate) (op0, mode1)) + op0 = copy_to_mode_reg (mode1, op0); + if (! (*insn_data[icode].operand[2].predicate) (op1, mode2)) + op1 = copy_to_mode_reg (mode2, op1); + if (optimize || target == 0 + || GET_MODE (target) != tmode + || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) + target = gen_reg_rtx (tmode); + pat = GEN_FCN (icode) (target, op0, op1); + if (! pat) + return NULL_RTX; + emit_insn (pat); + return target; + + case IX86_BUILTIN_EXTRQI: + icode = CODE_FOR_sse4a_extrqi; + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + tmode = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[2].mode; + mode3 = insn_data[icode].operand[3].mode; + if (! (*insn_data[icode].operand[1].predicate) (op0, mode1)) + op0 = copy_to_mode_reg (mode1, op0); + if (! (*insn_data[icode].operand[2].predicate) (op1, mode2)) + { + error ("index mask must be an immediate"); + return gen_reg_rtx (tmode); + } + if (! (*insn_data[icode].operand[3].predicate) (op2, mode3)) + { + error ("length mask must be an immediate"); + return gen_reg_rtx (tmode); + } + if (optimize || target == 0 + || GET_MODE (target) != tmode + || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) + target = gen_reg_rtx (tmode); + pat = GEN_FCN (icode) (target, op0, op1, op2); + if (! pat) + return NULL_RTX; + emit_insn (pat); + return target; + + case IX86_BUILTIN_INSERTQI: + icode = CODE_FOR_sse4a_insertqi; + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + arg3 = CALL_EXPR_ARG (exp, 3); op0 = expand_normal (arg0); op1 = expand_normal (arg1); - if (!REG_P (op0)) - op0 = copy_to_mode_reg (SImode, op0); - if (!REG_P (op1)) - op1 = copy_to_mode_reg (SImode, op1); - emit_insn (gen_sse3_mwait (op0, op1)); - return 0; + op2 = expand_normal (arg2); + op3 = expand_normal (arg3); + tmode = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[2].mode; + mode3 = insn_data[icode].operand[3].mode; + mode4 = insn_data[icode].operand[4].mode; - case IX86_BUILTIN_LDDQU: - return ix86_expand_unop_builtin (CODE_FOR_sse3_lddqu, arglist, - target, 1); + if (! (*insn_data[icode].operand[1].predicate) (op0, mode1)) + op0 = copy_to_mode_reg (mode1, op0); + + if (! (*insn_data[icode].operand[2].predicate) (op1, mode2)) + op1 = copy_to_mode_reg (mode2, op1); + + if (! (*insn_data[icode].operand[3].predicate) (op2, mode3)) + { + error ("index mask must be an immediate"); + return gen_reg_rtx (tmode); + } + if (! 
(*insn_data[icode].operand[4].predicate) (op3, mode4)) + { + error ("length mask must be an immediate"); + return gen_reg_rtx (tmode); + } + if (optimize || target == 0 + || GET_MODE (target) != tmode + || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) + target = gen_reg_rtx (tmode); + pat = GEN_FCN (icode) (target, op0, op1, op2, op3); + if (! pat) + return NULL_RTX; + emit_insn (pat); + return target; case IX86_BUILTIN_VEC_INIT_V2SI: case IX86_BUILTIN_VEC_INIT_V4HI: case IX86_BUILTIN_VEC_INIT_V8QI: - return ix86_expand_vec_init_builtin (TREE_TYPE (exp), arglist, target); + return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target); case IX86_BUILTIN_VEC_EXT_V2DF: case IX86_BUILTIN_VEC_EXT_V2DI: @@ -16188,11 +18190,11 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, case IX86_BUILTIN_VEC_EXT_V8HI: case IX86_BUILTIN_VEC_EXT_V2SI: case IX86_BUILTIN_VEC_EXT_V4HI: - return ix86_expand_vec_ext_builtin (arglist, target); + return ix86_expand_vec_ext_builtin (exp, target); case IX86_BUILTIN_VEC_SET_V8HI: case IX86_BUILTIN_VEC_SET_V4HI: - return ix86_expand_vec_set_builtin (arglist); + return ix86_expand_vec_set_builtin (exp); default: break; @@ -16206,53 +18208,101 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, || d->icode == CODE_FOR_sse_vmmaskcmpv4sf3 || d->icode == CODE_FOR_sse2_maskcmpv2df3 || d->icode == CODE_FOR_sse2_vmmaskcmpv2df3) - return ix86_expand_sse_compare (d, arglist, target); + return ix86_expand_sse_compare (d, exp, target); - return ix86_expand_binop_builtin (d->icode, arglist, target); + return ix86_expand_binop_builtin (d->icode, exp, target); } for (i = 0, d = bdesc_1arg; i < ARRAY_SIZE (bdesc_1arg); i++, d++) if (d->code == fcode) - return ix86_expand_unop_builtin (d->icode, arglist, target, 0); + return ix86_expand_unop_builtin (d->icode, exp, target, 0); for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++) if (d->code == fcode) - return ix86_expand_sse_comi (d, arglist, target); + return ix86_expand_sse_comi (d, exp, target); gcc_unreachable (); } -/* Expand an expression EXP that calls a built-in library function, - with result going to TARGET if that's convenient - (and in mode MODE if that's convenient). - SUBTARGET may be used as the target for computing one of EXP's operands. - IGNORE is nonzero if the value is to be ignored. */ +/* Returns a function decl for a vectorized version of the builtin function + with builtin function code FN and the result vector type TYPE, or NULL_TREE + if it is not available. */ -static rtx -ix86_expand_library_builtin (tree exp, rtx target, - rtx subtarget ATTRIBUTE_UNUSED, - enum machine_mode mode ATTRIBUTE_UNUSED, - int ignore) +static tree +ix86_builtin_vectorized_function (enum built_in_function fn, tree type_out, + tree type_in) { - enum built_in_function fncode; - tree fndecl, newfn, call; + enum machine_mode in_mode, out_mode; + int in_n, out_n; - /* Try expanding builtin math functions to the SSE2 ABI variants. 
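
ix86_builtin_vectorized_function, introduced just above with its body continuing below, is the hook through which the loop vectorizer swaps a scalar math builtin for one of the packed SSE builtins. Roughly the source-level effect (the flags named here, -O2 -ftree-vectorize -msse2, are an assumption about a typical invocation, not part of the patch):

    /* With vectorization enabled, the calls below may be replaced by
       IX86_BUILTIN_SQRTPD, i.e. one SQRTPD per pair of doubles.  */
    static void
    vec_sqrt (double *out, const double *in, int n)
    {
      int i;
      for (i = 0; i < n; i++)
        out[i] = __builtin_sqrt (in[i]);
    }
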
*/ - if (!TARGET_SSELIBM) - return NULL_RTX; + if (TREE_CODE (type_out) != VECTOR_TYPE + || TREE_CODE (type_in) != VECTOR_TYPE) + return NULL_TREE; + + out_mode = TYPE_MODE (TREE_TYPE (type_out)); + out_n = TYPE_VECTOR_SUBPARTS (type_out); + in_mode = TYPE_MODE (TREE_TYPE (type_in)); + in_n = TYPE_VECTOR_SUBPARTS (type_in); + + switch (fn) + { + case BUILT_IN_SQRT: + if (out_mode == DFmode && out_n == 2 + && in_mode == DFmode && in_n == 2) + return ix86_builtins[IX86_BUILTIN_SQRTPD]; + return NULL_TREE; + + case BUILT_IN_SQRTF: + if (out_mode == SFmode && out_n == 4 + && in_mode == SFmode && in_n == 4) + return ix86_builtins[IX86_BUILTIN_SQRTPS]; + return NULL_TREE; + + case BUILT_IN_LRINTF: + if (out_mode == SImode && out_n == 4 + && in_mode == SFmode && in_n == 4) + return ix86_builtins[IX86_BUILTIN_CVTPS2DQ]; + return NULL_TREE; + + default: + ; + } + + return NULL_TREE; +} - fncode = builtin_mathfn_code (exp); - if (!ix86_builtin_function_variants [(int)fncode]) - return NULL_RTX; +/* Returns a decl of a function that implements conversion of the + input vector of type TYPE, or NULL_TREE if it is not available. */ + +static tree +ix86_builtin_conversion (enum tree_code code, tree type) +{ + if (TREE_CODE (type) != VECTOR_TYPE) + return NULL_TREE; + + switch (code) + { + case FLOAT_EXPR: + switch (TYPE_MODE (type)) + { + case V4SImode: + return ix86_builtins[IX86_BUILTIN_CVTDQ2PS]; + default: + return NULL_TREE; + } - fndecl = get_callee_fndecl (exp); - if (DECL_RTL_SET_P (fndecl)) - return NULL_RTX; + case FIX_TRUNC_EXPR: + switch (TYPE_MODE (type)) + { + case V4SFmode: + return ix86_builtins[IX86_BUILTIN_CVTTPS2DQ]; + default: + return NULL_TREE; + } + default: + return NULL_TREE; - /* Build the redirected call and expand it. */ - newfn = ix86_builtin_function_variants [(int)fncode]; - call = build_function_call_expr (newfn, TREE_OPERAND (exp, 1)); - return expand_call (call, target, ignore); + } } /* Store OPERAND to the memory after reload is completed. This means @@ -16261,7 +18311,7 @@ rtx ix86_force_to_memory (enum machine_mode mode, rtx operand) { rtx result; - + gcc_assert (reload_completed); if (TARGET_RED_ZONE) { @@ -16363,7 +18413,7 @@ ix86_preferred_reload_class (rtx x, enum reg_class class) { enum machine_mode mode = GET_MODE (x); - /* We're only allowed to return a subclass of CLASS. Many of the + /* We're only allowed to return a subclass of CLASS. Many of the following checks fail for NO_REGS, so eliminate that early. */ if (class == NO_REGS) return NO_REGS; @@ -16372,7 +18422,7 @@ ix86_preferred_reload_class (rtx x, enum reg_class class) if (x == CONST0_RTX (mode)) return class; - /* Force constants into memory if we are loading a (non-zero) constant into + /* Force constants into memory if we are loading a (nonzero) constant into an MMX or SSE register. This is because there are no MMX/SSE instructions to load from a constant. */ if (CONSTANT_P (x) @@ -16498,20 +18548,14 @@ ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2, if (!TARGET_SSE2) return true; - /* If the target says that inter-unit moves are more expensive + /* If the target says that inter-unit moves are more expensive than moving through memory, then don't generate them. */ - if (!TARGET_INTER_UNIT_MOVES && !optimize_size) + if (!TARGET_INTER_UNIT_MOVES) return true; /* Between SSE and general, we have moves no larger than word size. */ if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) return true; - - /* ??? 
For the cost of one register reformat penalty, we could use - the same instructions to move SFmode and DFmode data, but the - relevant move patterns don't support those alternatives. */ - if (mode == SFmode || mode == DFmode) - return true; } return false; @@ -16542,7 +18586,7 @@ ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to, return true; /* Vector registers do not support subreg with nonzero offsets, which - are otherwise valid for integer registers. Since we can't see + are otherwise valid for integer registers. Since we can't see whether we have a nonzero offset from here, prohibit all nonparadoxical subregs changing size. */ if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from)) @@ -16653,7 +18697,7 @@ ix86_hard_regno_mode_ok (int regno, enum machine_mode mode) else if (VALID_FP_MODE_P (mode)) return 1; /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go - on to use that value in smaller contexts, this can easily force a + on to use that value in smaller contexts, this can easily force a pseudo to be allocated to GENERAL_REGS. Since this is no worse than supporting DImode, allow it. */ else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode)) @@ -16662,7 +18706,7 @@ ix86_hard_regno_mode_ok (int regno, enum machine_mode mode) return 0; } -/* A subroutine of ix86_modes_tieable_p. Return true if MODE is a +/* A subroutine of ix86_modes_tieable_p. Return true if MODE is a tieable integer mode. */ static bool @@ -16710,7 +18754,7 @@ ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2) if (mode2 == DFmode) return mode1 == SFmode; - /* If MODE2 is only appropriate for an SSE register, then tie with + /* If MODE2 is only appropriate for an SSE register, then tie with any other mode acceptable to SSE registers. 
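
On the secondary-memory change above: whether an SSE<->integer transfer may use a direct register move (MOVD) or must bounce through the stack is exactly what TARGET_INTER_UNIT_MOVES now decides unconditionally, and the SFmode/DFmode special case is gone. A small C-level reproducer (the code generation described in the comment is the usual outcome, not a guarantee):

    #include <string.h>

    /* Bit-casting float -> int moves a value from the SSE unit to the
       integer unit.  With inter-unit moves enabled this can be a single
       MOVD; with them disabled, ix86_secondary_memory_needed forces the
       value through memory instead.  */
    static int
    float_bits (float f)
    {
      int i;
      memcpy (&i, &f, sizeof i);
      return i;
    }
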
*/ if (GET_MODE_SIZE (mode2) >= 8 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) @@ -16884,7 +18928,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) return false; case ASHIFT: - if (GET_CODE (XEXP (x, 1)) == CONST_INT + if (CONST_INT_P (XEXP (x, 1)) && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT)) { HOST_WIDE_INT value = INTVAL (XEXP (x, 1)); @@ -16908,7 +18952,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) case ROTATERT: if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode) { - if (GET_CODE (XEXP (x, 1)) == CONST_INT) + if (CONST_INT_P (XEXP (x, 1))) { if (INTVAL (XEXP (x, 1)) > 32) *total = ix86_cost->shift_const + COSTS_N_INSNS (2); @@ -16925,7 +18969,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) } else { - if (GET_CODE (XEXP (x, 1)) == CONST_INT) + if (CONST_INT_P (XEXP (x, 1))) *total = ix86_cost->shift_const; else *total = ix86_cost->shift_var; @@ -16943,7 +18987,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) rtx op0 = XEXP (x, 0); rtx op1 = XEXP (x, 1); int nbits; - if (GET_CODE (XEXP (x, 1)) == CONST_INT) + if (CONST_INT_P (XEXP (x, 1))) { unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1)); for (nbits = 0; value != 0; value &= value - 1) @@ -16963,7 +19007,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) if (GET_CODE (op0) == GET_CODE (op1)) is_mulwiden = 1, op1 = XEXP (op1, 0); - else if (GET_CODE (op1) == CONST_INT) + else if (CONST_INT_P (op1)) { if (GET_CODE (op0) == SIGN_EXTEND) is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode) @@ -17001,7 +19045,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) { if (GET_CODE (XEXP (x, 0)) == PLUS && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT - && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == CONST_INT + && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1)) && CONSTANT_P (XEXP (x, 1))) { HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1)); @@ -17016,7 +19060,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) } } else if (GET_CODE (XEXP (x, 0)) == MULT - && GET_CODE (XEXP (XEXP (x, 0), 1)) == CONST_INT) + && CONST_INT_P (XEXP (XEXP (x, 0), 1))) { HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1)); if (val == 2 || val == 4 || val == 8) @@ -17078,7 +19122,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) case COMPARE: if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT && XEXP (XEXP (x, 0), 1) == const1_rtx - && GET_CODE (XEXP (XEXP (x, 0), 2)) == CONST_INT + && CONST_INT_P (XEXP (XEXP (x, 0), 2)) && XEXP (x, 1) == const0_rtx) { /* This kind of construct is implemented using test[bwl]. @@ -17131,6 +19175,9 @@ machopic_output_stub (FILE *file, const char *symb, const char *stub) char *binder_name, *symbol_name, lazy_ptr_name[32]; int label = ++current_machopic_label_num; + /* For 64-bit we shouldn't get here. */ + gcc_assert (!TARGET_64BIT); + /* Lose our funky encoding stuff so it doesn't contaminate the stub. 
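
The PLUS/MULT cost cases above single out multiplications by 2, 4, or 8 that appear inside an address-like sum, because those fit the scaled-index field of a single LEA and should not be priced as a real multiply. The shape in question, at the source level (the LEA in the comment is the typical encoding, not a promise):

    /* base + idx*8 matches the (PLUS (MULT reg 8) reg) pattern costed
       above; on x86 it usually becomes one lea (%base,%idx,8) insn.  */
    static long
    element_addr (long base, long idx)
    {
      return base + idx * 8;
    }
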
*/ symb = (*targetm.strip_name_encoding) (symb); @@ -17381,7 +19428,7 @@ x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED, { if (!x86_64_general_operand (xops[0], DImode)) { - tmp = gen_rtx_REG (DImode, FIRST_REX_INT_REG + 2 /* R10 */); + tmp = gen_rtx_REG (DImode, R10_REG); xops[1] = tmp; output_asm_insn ("mov{q}\t{%1, %0|%0, %1}", xops); xops[0] = tmp; @@ -17397,7 +19444,7 @@ x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED, if (vcall_offset) { if (TARGET_64BIT) - tmp = gen_rtx_REG (DImode, FIRST_REX_INT_REG + 2 /* R10 */); + tmp = gen_rtx_REG (DImode, R10_REG); else { int tmp_regno = 2 /* ECX */; @@ -17418,7 +19465,7 @@ x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED, xops[0] = gen_rtx_MEM (Pmode, plus_constant (tmp, vcall_offset)); if (TARGET_64BIT && !memory_operand (xops[0], Pmode)) { - rtx tmp2 = gen_rtx_REG (DImode, FIRST_REX_INT_REG + 3 /* R11 */); + rtx tmp2 = gen_rtx_REG (DImode, R11_REG); xops[0] = GEN_INT (vcall_offset); xops[1] = tmp2; output_asm_insn ("mov{q}\t{%0, %1|%1, %0}", xops); @@ -17569,14 +19616,14 @@ min_insn_size (rtx insn) if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN) return 0; - if (GET_CODE (insn) == JUMP_INSN + if (JUMP_P (insn) && (GET_CODE (PATTERN (insn)) == ADDR_VEC || GET_CODE (PATTERN (insn)) == ADDR_DIFF_VEC)) return 0; /* Important case - calls are always 5 bytes. It is common to have many calls in the row. */ - if (GET_CODE (insn) == CALL_INSN + if (CALL_P (insn) && symbolic_reference_mentioned_p (PATTERN (insn)) && !SIBLING_CALL_P (insn)) return 5; @@ -17586,7 +19633,7 @@ min_insn_size (rtx insn) /* For normal instructions we may rely on the sizes of addresses and the presence of symbol to require 4 bytes of encoding. This is not the case for jumps where references are PC relative. 
*/ - if (GET_CODE (insn) != JUMP_INSN) + if (!JUMP_P (insn)) { l = get_attr_length_address (insn); if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn))) @@ -17625,10 +19672,10 @@ ix86_avoid_jump_misspredicts (void) if (dump_file) fprintf(dump_file, "Insn %i estimated to %i bytes\n", INSN_UID (insn), min_insn_size (insn)); - if ((GET_CODE (insn) == JUMP_INSN + if ((JUMP_P (insn) && GET_CODE (PATTERN (insn)) != ADDR_VEC && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC) - || GET_CODE (insn) == CALL_INSN) + || CALL_P (insn)) njumps++; else continue; @@ -17636,10 +19683,10 @@ ix86_avoid_jump_misspredicts (void) while (njumps > 3) { start = NEXT_INSN (start); - if ((GET_CODE (start) == JUMP_INSN + if ((JUMP_P (start) && GET_CODE (PATTERN (start)) != ADDR_VEC && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC) - || GET_CODE (start) == CALL_INSN) + || CALL_P (start)) njumps--, isjump = 1; else isjump = 0; @@ -17679,13 +19726,13 @@ ix86_pad_returns (void) rtx prev; bool replace = false; - if (GET_CODE (ret) != JUMP_INSN || GET_CODE (PATTERN (ret)) != RETURN + if (!JUMP_P (ret) || GET_CODE (PATTERN (ret)) != RETURN || !maybe_hot_bb_p (bb)) continue; for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev)) - if (active_insn_p (prev) || GET_CODE (prev) == CODE_LABEL) + if (active_insn_p (prev) || LABEL_P (prev)) break; - if (prev && GET_CODE (prev) == CODE_LABEL) + if (prev && LABEL_P (prev)) { edge e; edge_iterator ei; @@ -17699,8 +19746,8 @@ ix86_pad_returns (void) { prev = prev_active_insn (ret); if (prev - && ((GET_CODE (prev) == JUMP_INSN && any_condjump_p (prev)) - || GET_CODE (prev) == CALL_INSN)) + && ((JUMP_P (prev) && any_condjump_p (prev)) + || CALL_P (prev))) replace = true; /* Empty functions get branch mispredict even when the jump destination is not visible to us. */ @@ -17777,21 +19824,25 @@ x86_emit_floatuns (rtx operands[2]) mode = GET_MODE (out); neglab = gen_label_rtx (); donelab = gen_label_rtx (); - i1 = gen_reg_rtx (Pmode); f0 = gen_reg_rtx (mode); - emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, Pmode, 0, neglab); + emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab); + + expand_float (out, in, 0); - emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_FLOAT (mode, in))); emit_jump_insn (gen_jump (donelab)); emit_barrier (); emit_label (neglab); - i0 = expand_simple_binop (Pmode, LSHIFTRT, in, const1_rtx, NULL, 1, OPTAB_DIRECT); - i1 = expand_simple_binop (Pmode, AND, in, const1_rtx, NULL, 1, OPTAB_DIRECT); - i0 = expand_simple_binop (Pmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT); + i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL, + 1, OPTAB_DIRECT); + i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL, + 1, OPTAB_DIRECT); + i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT); + expand_float (f0, i0, 0); + emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0))); emit_label (donelab); @@ -17811,7 +19862,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode, { case V2SImode: case V2SFmode: - if (!mmx_ok && !TARGET_SSE) + if (!mmx_ok) return false; /* FALLTHRU */ @@ -17851,11 +19902,66 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode, wvmode = V4HImode; goto widen; case V8HImode: + if (TARGET_SSE2) + { + rtx tmp1, tmp2; + /* Extend HImode to SImode using a paradoxical SUBREG. */ + tmp1 = gen_reg_rtx (SImode); + emit_move_insn (tmp1, gen_lowpart (SImode, val)); + /* Insert the SImode value as low element of V4SImode vector. 
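
The x86_emit_floatuns rewrite above implements the standard unsigned-to-float fallback: inputs with the sign bit clear convert directly on the signed path, while large inputs are halved with the shifted-out bit OR'ed back in (so a single rounding of the halved value matches rounding the full value) and the converted result is doubled. The same algorithm in plain C, written for a 64-bit input (a sketch; the function name is made up):

    /* Roughly what the emitted RTL computes.  */
    static double
    u64_to_double (unsigned long long u)
    {
      if ((long long) u >= 0)
        return (double) (long long) u;          /* fast signed path */

      /* Halve, keeping the low bit so the final rounding of d + d
         agrees with rounding the original value once.  */
      unsigned long long half = (u >> 1) | (u & 1);
      double d = (double) (long long) half;
      return d + d;
    }
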
*/ + tmp2 = gen_reg_rtx (V4SImode); + tmp1 = gen_rtx_VEC_MERGE (V4SImode, + gen_rtx_VEC_DUPLICATE (V4SImode, tmp1), + CONST0_RTX (V4SImode), + const1_rtx); + emit_insn (gen_rtx_SET (VOIDmode, tmp2, tmp1)); + /* Cast the V4SImode vector back to a V8HImode vector. */ + tmp1 = gen_reg_rtx (V8HImode); + emit_move_insn (tmp1, gen_lowpart (V8HImode, tmp2)); + /* Duplicate the low short through the whole low SImode word. */ + emit_insn (gen_sse2_punpcklwd (tmp1, tmp1, tmp1)); + /* Cast the V8HImode vector back to a V4SImode vector. */ + tmp2 = gen_reg_rtx (V4SImode); + emit_move_insn (tmp2, gen_lowpart (V4SImode, tmp1)); + /* Replicate the low element of the V4SImode vector. */ + emit_insn (gen_sse2_pshufd (tmp2, tmp2, const0_rtx)); + /* Cast the V4SImode back to V8HImode, and store in target. */ + emit_move_insn (target, gen_lowpart (V8HImode, tmp2)); + return true; + } smode = HImode; wsmode = SImode; wvmode = V4SImode; goto widen; case V16QImode: + if (TARGET_SSE2) + { + rtx tmp1, tmp2; + /* Extend QImode to SImode using a paradoxical SUBREG. */ + tmp1 = gen_reg_rtx (SImode); + emit_move_insn (tmp1, gen_lowpart (SImode, val)); + /* Insert the SImode value as low element of V4SImode vector. */ + tmp2 = gen_reg_rtx (V4SImode); + tmp1 = gen_rtx_VEC_MERGE (V4SImode, + gen_rtx_VEC_DUPLICATE (V4SImode, tmp1), + CONST0_RTX (V4SImode), + const1_rtx); + emit_insn (gen_rtx_SET (VOIDmode, tmp2, tmp1)); + /* Cast the V4SImode vector back to a V16QImode vector. */ + tmp1 = gen_reg_rtx (V16QImode); + emit_move_insn (tmp1, gen_lowpart (V16QImode, tmp2)); + /* Duplicate the low byte through the whole low SImode word. */ + emit_insn (gen_sse2_punpcklbw (tmp1, tmp1, tmp1)); + emit_insn (gen_sse2_punpcklbw (tmp1, tmp1, tmp1)); + /* Cast the V16QImode vector back to a V4SImode vector. */ + tmp2 = gen_reg_rtx (V4SImode); + emit_move_insn (tmp2, gen_lowpart (V4SImode, tmp1)); + /* Replicate the low element of the V4SImode vector. */ + emit_insn (gen_sse2_pshufd (tmp2, tmp2, const0_rtx)); + /* Cast the V4SImode back to V16QImode, and store in target. */ + emit_move_insn (target, gen_lowpart (V16QImode, tmp2)); + return true; + } smode = QImode; wsmode = HImode; wvmode = V8HImode; goto widen; @@ -17880,26 +19986,29 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode, } /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector - whose low element is VAR, and other elements are zero. Return true + whose ONE_VAR element is VAR, and other elements are zero. Return true if successful. 
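
The V8HImode and V16QImode paths above synthesize a broadcast without SSSE3: move the scalar into the low element, duplicate it through the low doubleword with PUNPCKLWD (or PUNPCKLBW twice), then replicate that doubleword everywhere with PSHUFD. The intrinsic-level equivalent of the 16-bit case (assumes -msse2):

    #include <emmintrin.h>      /* SSE2 */

    /* The same sequence the expander emits for a V8HImode duplicate.  */
    static __m128i
    splat16 (short v)
    {
      __m128i t = _mm_cvtsi32_si128 ((int) v);  /* value in low element   */
      t = _mm_unpacklo_epi16 (t, t);            /* punpcklwd: low word x2 */
      return _mm_shuffle_epi32 (t, 0);          /* pshufd $0: dword x4    */
    }
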
*/ static bool -ix86_expand_vector_init_low_nonzero (bool mmx_ok, enum machine_mode mode, - rtx target, rtx var) +ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode, + rtx target, rtx var, int one_var) { enum machine_mode vsimode; - rtx x; + rtx new_target; + rtx x, tmp; switch (mode) { case V2SFmode: case V2SImode: - if (!mmx_ok && !TARGET_SSE) + if (!mmx_ok) return false; /* FALLTHRU */ case V2DFmode: case V2DImode: + if (one_var != 0) + return false; var = force_reg (GET_MODE_INNER (mode), var); x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode))); emit_insn (gen_rtx_SET (VOIDmode, target, x)); @@ -17907,10 +20016,55 @@ ix86_expand_vector_init_low_nonzero (bool mmx_ok, enum machine_mode mode, case V4SFmode: case V4SImode: + if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER) + new_target = gen_reg_rtx (mode); + else + new_target = target; var = force_reg (GET_MODE_INNER (mode), var); x = gen_rtx_VEC_DUPLICATE (mode, var); x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx); - emit_insn (gen_rtx_SET (VOIDmode, target, x)); + emit_insn (gen_rtx_SET (VOIDmode, new_target, x)); + if (one_var != 0) + { + /* We need to shuffle the value to the correct position, so + create a new pseudo to store the intermediate result. */ + + /* With SSE2, we can use the integer shuffle insns. */ + if (mode != V4SFmode && TARGET_SSE2) + { + emit_insn (gen_sse2_pshufd_1 (new_target, new_target, + GEN_INT (1), + GEN_INT (one_var == 1 ? 0 : 1), + GEN_INT (one_var == 2 ? 0 : 1), + GEN_INT (one_var == 3 ? 0 : 1))); + if (target != new_target) + emit_move_insn (target, new_target); + return true; + } + + /* Otherwise convert the intermediate result to V4SFmode and + use the SSE1 shuffle instructions. */ + if (mode != V4SFmode) + { + tmp = gen_reg_rtx (V4SFmode); + emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target)); + } + else + tmp = new_target; + + emit_insn (gen_sse_shufps_1 (tmp, tmp, tmp, + GEN_INT (1), + GEN_INT (one_var == 1 ? 0 : 1), + GEN_INT (one_var == 2 ? 0+4 : 1+4), + GEN_INT (one_var == 3 ? 0+4 : 1+4))); + + if (mode != V4SFmode) + emit_move_insn (target, gen_lowpart (V4SImode, tmp)); + else if (tmp != target) + emit_move_insn (target, tmp); + } + else if (target != new_target) + emit_move_insn (target, new_target); return true; case V8HImode: @@ -17924,11 +20078,15 @@ ix86_expand_vector_init_low_nonzero (bool mmx_ok, enum machine_mode mode, vsimode = V2SImode; goto widen; widen: + if (one_var != 0) + return false; + /* Zero extend the variable element to SImode and recurse. */ var = convert_modes (SImode, GET_MODE_INNER (mode), var, true); x = gen_reg_rtx (vsimode); - if (!ix86_expand_vector_init_low_nonzero (mmx_ok, vsimode, x, var)) + if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x, + var, one_var)) gcc_unreachable (); emit_move_insn (target, gen_lowpart (mode, x)); @@ -18083,7 +20241,7 @@ ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode, if (!register_operand (op1, half_mode)) op1 = force_reg (half_mode, op1); - emit_insn (gen_rtx_SET (VOIDmode, target, + emit_insn (gen_rtx_SET (VOIDmode, target, gen_rtx_VEC_CONCAT (mode, op0, op1))); } else @@ -18143,7 +20301,7 @@ ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode, } } -/* Initialize vector TARGET via VALS. Suppress the use of MMX +/* Initialize vector TARGET via VALS. Suppress the use of MMX instructions unless MMX_OK is true. 
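
ix86_expand_vector_init_one_nonzero above first materializes { var, 0, 0, 0 } and, when ONE_VAR is not lane 0, shuffles the value into position while pulling a known-zero lane into the vacated slots. The V4SFmode case with the value in lane 2, at the intrinsic level (assumes -msse; the shuffle immediate mirrors the operands passed to gen_sse_shufps_1):

    #include <xmmintrin.h>      /* SSE */

    /* Build { 0, 0, v, 0 }: start from { v, 0, 0, 0 }, send lane 0 to
       lane 2, and fill the rest from known-zero lane 1.  */
    static __m128
    v_in_lane2 (float v)
    {
      __m128 x = _mm_set_ss (v);                /* { v, 0, 0, 0 } */
      return _mm_shuffle_ps (x, x, _MM_SHUFFLE (1, 0, 1, 1));
    }
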
@@ -18083,7 +20241,7 @@ ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
           if (!register_operand (op1, half_mode))
             op1 = force_reg (half_mode, op1);
 
-          emit_insn (gen_rtx_SET (VOIDmode, target, 
+          emit_insn (gen_rtx_SET (VOIDmode, target,
                                   gen_rtx_VEC_CONCAT (mode, op0, op1)));
         }
       else
@@ -18143,7 +20301,7 @@ ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
     }
 }
 
-/* Initialize vector TARGET via VALS.  Suppress the use of MMX 
+/* Initialize vector TARGET via VALS.  Suppress the use of MMX
    instructions unless MMX_OK is true.  */
 
 void
@@ -18185,9 +20343,10 @@ ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
      the pool and overwritten via move later.  */
   if (n_var == 1)
     {
-      if (all_const_zero && one_var == 0
-          && ix86_expand_vector_init_low_nonzero (mmx_ok, mode, target,
-                                                  XVECEXP (vals, 0, 0)))
+      if (all_const_zero
+          && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
+                                                  XVECEXP (vals, 0, one_var),
+                                                  one_var))
         return;
 
       if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals,
                                            one_var))
@@ -18545,8 +20704,6 @@ ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
                                 clobbers);
   clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
                         clobbers);
-  clobbers = tree_cons (NULL_TREE, build_string (7, "dirflag"),
-                        clobbers);
   return clobbers;
 }
 
@@ -18612,9 +20769,8 @@ output_387_reg_move (rtx insn, rtx *operands)
   if (REG_P (operands[1])
       && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
    {
-      if (REGNO (operands[0]) == FIRST_STACK_REG
-          && TARGET_USE_FFREEP)
-        return "ffreep\t%y0";
+      if (REGNO (operands[0]) == FIRST_STACK_REG)
+        return output_387_ffreep (operands, 0);
      return "fstp\t%y0";
    }
   if (STACK_TOP_P (operands[0]))
@@ -18673,14 +20829,14 @@ void ix86_emit_i387_log1p (rtx op0, rtx op1)
   emit_jump_insn (gen_bge (label1));
 
   emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
-  emit_insn (gen_fyl2xp1_xf3 (op0, tmp2, op1));
+  emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
   emit_jump (label2);
 
   emit_label (label1);
   emit_move_insn (tmp, CONST1_RTX (XFmode));
   emit_insn (gen_addxf3 (tmp, op1, tmp));
   emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
-  emit_insn (gen_fyl2x_xf3 (op0, tmp2, tmp));
+  emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
 
   emit_label (label2);
 }
@@ -18748,7 +20904,7 @@ asm_preferred_eh_data_format (int code, int global)
 {
   if (flag_pic)
     {
-int type = DW_EH_PE_sdata8;
+      int type = DW_EH_PE_sdata8;
       if (!TARGET_64BIT
           || ix86_cmodel == CM_SMALL_PIC
           || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
@@ -18760,5 +20916,581 @@ int type = DW_EH_PE_sdata8;
     return DW_EH_PE_udata4;
   return DW_EH_PE_absptr;
 }
+
+/* Expand copysign from SIGN to the positive value ABS_VALUE
+   storing in RESULT.  If MASK is non-null, it shall be a mask to mask out
+   the sign-bit.  */
+static void
+ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
+{
+  enum machine_mode mode = GET_MODE (sign);
+  rtx sgn = gen_reg_rtx (mode);
+  if (mask == NULL_RTX)
+    {
+      mask = ix86_build_signbit_mask (mode, VECTOR_MODE_P (mode), false);
+      if (!VECTOR_MODE_P (mode))
+        {
+          /* We need to generate a scalar mode mask in this case.  */
+          rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
+          tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
+          mask = gen_reg_rtx (mode);
+          emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
+        }
+    }
+  else
+    mask = gen_rtx_NOT (mode, mask);
+  emit_insn (gen_rtx_SET (VOIDmode, sgn,
+                          gen_rtx_AND (mode, mask, sign)));
+  emit_insn (gen_rtx_SET (VOIDmode, result,
+                          gen_rtx_IOR (mode, abs_value, sgn)));
+}
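
In RTL terms the helper is just an AND that extracts the sign bit of SIGN and an IOR that merges it into the (assumed nonnegative) magnitude. A plain C model of the same bit manipulation, a sketch only, assuming IEEE binary64 doubles; the function name is illustrative:

#include <stdint.h>
#include <string.h>

static double
copysign_model (double abs_value, double sign)
{
  uint64_t a, s;
  const uint64_t signbit = UINT64_C (1) << 63;

  memcpy (&a, &abs_value, sizeof a);
  memcpy (&s, &sign, sizeof s);
  a |= s & signbit;   /* AND out the sign bit of SIGN, IOR it onto |value|;
                         like the RTL, abs_value is assumed nonnegative.  */
  memcpy (&abs_value, &a, sizeof a);
  return abs_value;
}
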
+
+/* Expand fabs (OP0) and return a new rtx that holds the result.  The
+   mask for masking out the sign-bit is stored in *SMASK, if that is
+   non-null.  */
+static rtx
+ix86_expand_sse_fabs (rtx op0, rtx *smask)
+{
+  enum machine_mode mode = GET_MODE (op0);
+  rtx xa, mask;
+
+  xa = gen_reg_rtx (mode);
+  mask = ix86_build_signbit_mask (mode, VECTOR_MODE_P (mode), true);
+  if (!VECTOR_MODE_P (mode))
+    {
+      /* We need to generate a scalar mode mask in this case.  */
+      rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
+      tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
+      mask = gen_reg_rtx (mode);
+      emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
+    }
+  emit_insn (gen_rtx_SET (VOIDmode, xa,
+                          gen_rtx_AND (mode, op0, mask)));
+
+  if (smask)
+    *smask = mask;
+
+  return xa;
+}
+
+/* Expands a comparison of OP0 with OP1 using comparison code CODE,
+   swapping the operands if SWAP_OPERANDS is true.  The expanded
+   code is a forward jump to a newly created label in case the
+   comparison is true.  The generated label rtx is returned.  */
+static rtx
+ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
+                                  bool swap_operands)
+{
+  rtx label, tmp;
+
+  if (swap_operands)
+    {
+      tmp = op0;
+      op0 = op1;
+      op1 = tmp;
+    }
+
+  label = gen_label_rtx ();
+  tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
+  emit_insn (gen_rtx_SET (VOIDmode, tmp,
+                          gen_rtx_COMPARE (CCFPUmode, op0, op1)));
+  tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
+  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
+                              gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
+  tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
+  JUMP_LABEL (tmp) = label;
+
+  return label;
+}
+
+/* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
+   using comparison code CODE.  Operands are swapped for the comparison if
+   SWAP_OPERANDS is true.  Returns a rtx for the generated mask.  */
+static rtx
+ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
+                              bool swap_operands)
+{
+  enum machine_mode mode = GET_MODE (op0);
+  rtx mask = gen_reg_rtx (mode);
+
+  if (swap_operands)
+    {
+      rtx tmp = op0;
+      op0 = op1;
+      op1 = tmp;
+    }
+
+  if (mode == DFmode)
+    emit_insn (gen_sse2_maskcmpdf3 (mask, op0, op1,
+                                    gen_rtx_fmt_ee (code, mode, op0, op1)));
+  else
+    emit_insn (gen_sse_maskcmpsf3 (mask, op0, op1,
+                                   gen_rtx_fmt_ee (code, mode, op0, op1)));
+
+  return mask;
+}
+
+/* Generate and return a rtx of mode MODE for 2**n where n is the number
+   of bits of the mantissa of MODE, which must be one of DFmode or SFmode.  */
+static rtx
+ix86_gen_TWO52 (enum machine_mode mode)
+{
+  REAL_VALUE_TYPE TWO52r;
+  rtx TWO52;
+
+  real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
+  TWO52 = const_double_from_real_value (TWO52r, mode);
+  TWO52 = force_reg (mode, TWO52);
+
+  return TWO52;
+}
+
+/* Expand SSE sequence for computing lround from OP1 storing
+   into OP0.  */
+void
+ix86_expand_lround (rtx op0, rtx op1)
+{
+  /* C code for the stuff we're doing below:
+       tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
+       return (long)tmp;
+   */
+  enum machine_mode mode = GET_MODE (op1);
+  const struct real_format *fmt;
+  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
+  rtx adj;
+
+  /* load nextafter (0.5, 0.0) */
+  fmt = REAL_MODE_FORMAT (mode);
+  real_2expN (&half_minus_pred_half, -(fmt->p) - 1);
+  REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
+
+  /* adj = copysign (0.5, op1) */
+  adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
+  ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
+
+  /* adj = op1 + adj */
+  adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* op0 = (imode)adj */
+  expand_fix (op0, adj, 0);
+}
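
Note the rounding constant is not 0.5 but its predecessor, nextafter (0.5, 0.0): adding a full 0.5 before truncating would bump values such as the largest double below 0.5 up to 1 through double rounding. A C model of the sequence, a sketch only assuming a C99 libm; the function name is illustrative:

#include <math.h>

static long
lround_model (double x)
{
  /* copysign (nextafter (0.5, 0.0), x) matches the adj computed above.  */
  double adj = copysign (nextafter (0.5, 0.0), x);
  return (long) (x + adj);   /* truncating conversion, like cvttsd2si */
}
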
+
+/* Expand SSE2 sequence for computing lfloor or lceil from OP1 storing
+   into OP0.  */
+void
+ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
+{
+  /* C code for the stuff we're doing below (for do_floor):
+       xi = (long)op1;
+       xi -= (double)xi > op1 ? 1 : 0;
+       return xi;
+   */
+  enum machine_mode fmode = GET_MODE (op1);
+  enum machine_mode imode = GET_MODE (op0);
+  rtx ireg, freg, label, tmp;
+
+  /* reg = (long)op1 */
+  ireg = gen_reg_rtx (imode);
+  expand_fix (ireg, op1, 0);
+
+  /* freg = (double)reg */
+  freg = gen_reg_rtx (fmode);
+  expand_float (freg, ireg, 0);
+
+  /* ireg = (freg > op1) ? ireg - 1 : ireg */
+  label = ix86_expand_sse_compare_and_jump (UNLE,
+                                            freg, op1, !do_floor);
+  tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
+                             ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
+  emit_move_insn (ireg, tmp);
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (op0, ireg);
+}
+
+/* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
+   result in OPERAND0.  */
+void
+ix86_expand_rint (rtx operand0, rtx operand1)
+{
+  /* C code for the stuff we're doing below:
+       xa = fabs (operand1);
+       if (!isless (xa, 2**52))
+         return operand1;
+       xa = xa + 2**52 - 2**52;
+       return copysign (xa, operand1);
+   */
+  enum machine_mode mode = GET_MODE (operand0);
+  rtx res, xa, label, TWO52, mask;
+
+  res = gen_reg_rtx (mode);
+  emit_move_insn (res, operand1);
+
+  /* xa = abs (operand1) */
+  xa = ix86_expand_sse_fabs (res, &mask);
+
+  /* if (!isless (xa, TWO52)) goto label; */
+  TWO52 = ix86_gen_TWO52 (mode);
+  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
+  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
+
+  ix86_sse_copysign_to_positive (res, xa, res, mask);
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operand0, res);
+}
+
+/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
+   into OPERAND0.  */
+void
+ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
+{
+  /* C code for the stuff we expand below.
+       double xa = fabs (x), x2;
+       if (!isless (xa, TWO52))
+         return x;
+       xa = xa + TWO52 - TWO52;
+       x2 = copysign (xa, x);
+     Compensate.  Floor:
+       if (x2 > x)
+         x2 -= 1;
+     Compensate.  Ceil:
+       if (x2 < x)
+         x2 -= -1;
+       return x2;
+   */
+  enum machine_mode mode = GET_MODE (operand0);
+  rtx xa, TWO52, tmp, label, one, res, mask;
+
+  TWO52 = ix86_gen_TWO52 (mode);
+
+  /* Temporary for holding the result, initialized to the input
+     operand to ease control flow.  */
+  res = gen_reg_rtx (mode);
+  emit_move_insn (res, operand1);
+
+  /* xa = abs (operand1) */
+  xa = ix86_expand_sse_fabs (res, &mask);
+
+  /* if (!isless (xa, TWO52)) goto label; */
+  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+  /* xa = xa + TWO52 - TWO52; */
+  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
+  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
+
+  /* xa = copysign (xa, operand1) */
+  ix86_sse_copysign_to_positive (xa, xa, res, mask);
+
+  /* generate 1.0 or -1.0 */
+  one = force_reg (mode,
+                   const_double_from_real_value (do_floor
+                                                 ? dconst1 : dconstm1, mode));
+
+  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
+  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
+  emit_insn (gen_rtx_SET (VOIDmode, tmp,
+                          gen_rtx_AND (mode, one, tmp)));
+  /* We always need to subtract here to preserve signed zero.  */
+  tmp = expand_simple_binop (mode, MINUS,
+                             xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+  emit_move_insn (res, tmp);
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operand0, res);
+}
+
+/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
+   into OPERAND0.  */
+void
+ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
+{
+  /* C code for the stuff we expand below.
+       double xa = fabs (x), x2;
+       if (!isless (xa, TWO52))
+         return x;
+       x2 = (double)(long)x;
+     Compensate.  Floor:
+       if (x2 > x)
+         x2 -= 1;
+     Compensate.  Ceil:
+       if (x2 < x)
+         x2 += 1;
+       if (HONOR_SIGNED_ZEROS (mode))
+         return copysign (x2, x);
+       return x2;
+   */
+  enum machine_mode mode = GET_MODE (operand0);
+  rtx xa, xi, TWO52, tmp, label, one, res, mask;
+
+  TWO52 = ix86_gen_TWO52 (mode);
+
+  /* Temporary for holding the result, initialized to the input
+     operand to ease control flow.  */
+  res = gen_reg_rtx (mode);
+  emit_move_insn (res, operand1);
+
+  /* xa = abs (operand1) */
+  xa = ix86_expand_sse_fabs (res, &mask);
+
+  /* if (!isless (xa, TWO52)) goto label; */
+  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+  /* xa = (double)(long)x */
+  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
+  expand_fix (xi, res, 0);
+  expand_float (xa, xi, 0);
+
+  /* generate 1.0 */
+  one = force_reg (mode, const_double_from_real_value (dconst1, mode));
+
+  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
+  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
+  emit_insn (gen_rtx_SET (VOIDmode, tmp,
+                          gen_rtx_AND (mode, one, tmp)));
+  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
+                             xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+  emit_move_insn (res, tmp);
+
+  if (HONOR_SIGNED_ZEROS (mode))
+    ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operand0, res);
+}
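
Both compensation steps above are branchless: the SSE compare writes an all-ones or all-zero lane, and AND-ing that mask with 1.0 (or -1.0) produces exactly the correction term to subtract. An intrinsics model of the floor case, a sketch only; the function name is illustrative and SSE2 is assumed:

#include <emmintrin.h>

static __m128d
floor_compensate (__m128d x2, __m128d x)
{
  __m128d one = _mm_set1_pd (1.0);
  __m128d mask = _mm_cmpgt_sd (x2, x);            /* x2 > x ? ~0 : 0 */
  return _mm_sub_sd (x2, _mm_and_pd (mask, one)); /* x2 -= mask & 1.0 */
}
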
+
+/* Expand SSE sequence for computing round from OPERAND1 storing
+   into OPERAND0.  A sequence that works without relying on DImode truncation
+   via cvttsd2siq, which is only available on 64-bit targets.  */
+void
+ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
+{
+  /* C code for the stuff we expand below.
+       double xa = fabs (x), xa2, x2;
+       if (!isless (xa, TWO52))
+         return x;
+     Using the absolute value and copying back sign makes
+     -0.0 -> -0.0 correct.
+       xa2 = xa + TWO52 - TWO52;
+     Compensate.
+       dxa = xa2 - xa;
+       if (dxa <= -0.5)
+         xa2 += 1;
+       else if (dxa > 0.5)
+         xa2 -= 1;
+       x2 = copysign (xa2, x);
+       return x2;
+   */
+  enum machine_mode mode = GET_MODE (operand0);
+  rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
+
+  TWO52 = ix86_gen_TWO52 (mode);
+
+  /* Temporary for holding the result, initialized to the input
+     operand to ease control flow.  */
+  res = gen_reg_rtx (mode);
+  emit_move_insn (res, operand1);
+
+  /* xa = abs (operand1) */
+  xa = ix86_expand_sse_fabs (res, &mask);
+
+  /* if (!isless (xa, TWO52)) goto label; */
+  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+  /* xa2 = xa + TWO52 - TWO52; */
+  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
+  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
+
+  /* dxa = xa2 - xa; */
+  dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* generate 0.5, 1.0 and -0.5 */
+  half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
+  one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
+  mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
+                               0, OPTAB_DIRECT);
+
+  /* Compensate.  */
+  tmp = gen_reg_rtx (mode);
+  /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
+  tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
+  emit_insn (gen_rtx_SET (VOIDmode, tmp,
+                          gen_rtx_AND (mode, one, tmp)));
+  xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+  /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
+  tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
+  emit_insn (gen_rtx_SET (VOIDmode, tmp,
+                          gen_rtx_AND (mode, one, tmp)));
+  xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* res = copysign (xa2, operand1) */
+  ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operand0, res);
+}
+
+/* Expand SSE sequence for computing trunc from OPERAND1 storing
+   into OPERAND0.  */
+void
+ix86_expand_trunc (rtx operand0, rtx operand1)
+{
+  /* C code for SSE variant we expand below.
+       double xa = fabs (x), x2;
+       if (!isless (xa, TWO52))
+         return x;
+       x2 = (double)(long)x;
+       if (HONOR_SIGNED_ZEROS (mode))
+         return copysign (x2, x);
+       return x2;
+   */
+  enum machine_mode mode = GET_MODE (operand0);
+  rtx xa, xi, TWO52, label, res, mask;
+
+  TWO52 = ix86_gen_TWO52 (mode);
+
+  /* Temporary for holding the result, initialized to the input
+     operand to ease control flow.  */
+  res = gen_reg_rtx (mode);
+  emit_move_insn (res, operand1);
+
+  /* xa = abs (operand1) */
+  xa = ix86_expand_sse_fabs (res, &mask);
+
+  /* if (!isless (xa, TWO52)) goto label; */
+  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+  /* x = (double)(long)x */
+  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
+  expand_fix (xi, res, 0);
+  expand_float (res, xi, 0);
+
+  if (HONOR_SIGNED_ZEROS (mode))
+    ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operand0, res);
+}
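
In the 32-bit-safe round above, the nearest-even result xa2 is corrected by inspecting the rounding error dxa = xa2 - xa: an error of exactly -0.5 means a halfway case was rounded down and must go up, which turns round-half-to-even into the round-half-away-from-zero semantics of round. A C model of the whole sequence, a sketch under the same assumptions as the rint model above:

#include <math.h>

static volatile double two52 = 4503599627370496.0;   /* 2**52 */

static double
round_model_df32 (double x)
{
  double xa = fabs (x), xa2, dxa;

  if (!(xa < two52))
    return x;
  xa2 = xa + two52;
  xa2 = xa2 - two52;           /* nearest-even integer */
  dxa = xa2 - xa;              /* rounding error, in [-0.5, 0.5] */
  if (dxa <= -0.5)
    xa2 += 1.0;                /* halfway case rounded down: go up */
  else if (dxa > 0.5)
    xa2 -= 1.0;                /* mirrored guard on the high side */
  return copysign (xa2, x);
}
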
+
+/* Expand SSE sequence for computing trunc from OPERAND1 storing
+   into OPERAND0.  */
+void
+ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
+{
+  enum machine_mode mode = GET_MODE (operand0);
+  rtx xa, mask, TWO52, label, one, res, smask, tmp;
+
+  /* C code for SSE variant we expand below.
+       double xa = fabs (x), x2;
+       if (!isless (xa, TWO52))
+         return x;
+       xa2 = xa + TWO52 - TWO52;
+     Compensate:
+       if (xa2 > xa)
+         xa2 -= 1.0;
+       x2 = copysign (xa2, x);
+       return x2;
+   */
+
+  TWO52 = ix86_gen_TWO52 (mode);
+
+  /* Temporary for holding the result, initialized to the input
+     operand to ease control flow.  */
+  res = gen_reg_rtx (mode);
+  emit_move_insn (res, operand1);
+
+  /* xa = abs (operand1) */
+  xa = ix86_expand_sse_fabs (res, &smask);
+
+  /* if (!isless (xa, TWO52)) goto label; */
+  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+  /* res = xa + TWO52 - TWO52; */
+  tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
+  tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
+  emit_move_insn (res, tmp);
+
+  /* generate 1.0 */
+  one = force_reg (mode, const_double_from_real_value (dconst1, mode));
+
+  /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
+  mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
+  emit_insn (gen_rtx_SET (VOIDmode, mask,
+                          gen_rtx_AND (mode, mask, one)));
+  tmp = expand_simple_binop (mode, MINUS,
+                             res, mask, NULL_RTX, 0, OPTAB_DIRECT);
+  emit_move_insn (res, tmp);
+
+  /* res = copysign (res, operand1) */
+  ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operand0, res);
+}
+
+/* Expand SSE sequence for computing round from OPERAND1 storing
+   into OPERAND0.  */
+void
+ix86_expand_round (rtx operand0, rtx operand1)
+{
+  /* C code for the stuff we're doing below:
+       double xa = fabs (x);
+       if (!isless (xa, TWO52))
+         return x;
+       xa = (double)(long)(xa + nextafter (0.5, 0.0));
+       return copysign (xa, x);
+   */
+  enum machine_mode mode = GET_MODE (operand0);
+  rtx res, TWO52, xa, label, xi, half, mask;
+  const struct real_format *fmt;
+  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
+
+  /* Temporary for holding the result, initialized to the input
+     operand to ease control flow.  */
+  res = gen_reg_rtx (mode);
+  emit_move_insn (res, operand1);
+
+  TWO52 = ix86_gen_TWO52 (mode);
+  xa = ix86_expand_sse_fabs (res, &mask);
+  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
+
+  /* load nextafter (0.5, 0.0) */
+  fmt = REAL_MODE_FORMAT (mode);
+  real_2expN (&half_minus_pred_half, -(fmt->p) - 1);
+  REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
+
+  /* xa = xa + 0.5 */
+  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
+  xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* xa = (double)(int64_t)xa */
+  xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
+  expand_fix (xi, xa, 0);
+  expand_float (xa, xi, 0);
+
+  /* res = copysign (xa, operand1) */
+  ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
+
+  emit_label (label);
+  LABEL_NUSES (label) = 1;
+
+  emit_move_insn (operand0, res);
+}
 
 #include "gt-i386.h"
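
ix86_expand_round combines the two earlier tricks: take the absolute value, add the predecessor of 0.5, truncate, and restore the sign. A C model, a sketch only assuming a C99 libm and a 64-bit-capable truncation of the kind cvttsd2siq provides:

#include <math.h>

static double
round_model (double x)
{
  double xa = fabs (x);

  if (!(xa < 4503599627370496.0))    /* !isless (xa, 2**52) */
    return x;
  xa = (double) (long long) (xa + nextafter (0.5, 0.0));
  return copysign (xa, x);           /* restore sign; -0.0 stays -0.0 */
}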