X-Git-Url: http://git.sourceforge.jp/view?a=blobdiff_plain;f=gcc%2Fconfig%2Fi386%2Fi386.c;h=0ca17121128427805d21d33e8782bba7424d9d36;hb=40fe46c9811f4bc90a0f47ee442dd13140406f67;hp=4546f4d2b3ea5a71d51d2b2ecce7a9787aa916bb;hpb=56a575a5543a08eb72130265bb34a315195ccfa0;p=pf3gnuchains%2Fgcc-fork.git diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 4546f4d2b3e..0ca17121128 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -1,6 +1,6 @@ /* Subroutines used for code generation on IA-32. Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, - 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc. + 2002, 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc. This file is part of GCC. @@ -68,6 +68,8 @@ Boston, MA 02110-1301, USA. */ /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */ #define COSTS_N_BYTES(N) ((N) * 2) +#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}} + static const struct processor_costs size_cost = { /* costs for tuning for size */ COSTS_N_BYTES (2), /* cost of an add instruction */ @@ -119,6 +121,10 @@ struct processor_costs size_cost = { /* costs for tuning for size */ COSTS_N_BYTES (2), /* cost of FABS instruction. */ COSTS_N_BYTES (2), /* cost of FCHS instruction. */ COSTS_N_BYTES (2), /* cost of FSQRT instruction. */ + {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}, + {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}}, + {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}, + {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}} }; /* Processor costs (relative to an add) */ @@ -173,6 +179,10 @@ struct processor_costs i386_cost = { /* 386 specific costs */ COSTS_N_INSNS (22), /* cost of FABS instruction. */ COSTS_N_INSNS (24), /* cost of FCHS instruction. */ COSTS_N_INSNS (122), /* cost of FSQRT instruction. */ + {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}, + DUMMY_STRINGOP_ALGS}, + {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}, + DUMMY_STRINGOP_ALGS}, }; static const @@ -226,6 +236,10 @@ struct processor_costs i486_cost = { /* 486 specific costs */ COSTS_N_INSNS (3), /* cost of FABS instruction. */ COSTS_N_INSNS (3), /* cost of FCHS instruction. */ COSTS_N_INSNS (83), /* cost of FSQRT instruction. */ + {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}}, + DUMMY_STRINGOP_ALGS}, + {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}}, + DUMMY_STRINGOP_ALGS} }; static const @@ -279,6 +293,10 @@ struct processor_costs pentium_cost = { COSTS_N_INSNS (1), /* cost of FABS instruction. */ COSTS_N_INSNS (1), /* cost of FCHS instruction. */ COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ + {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + {{libcall, {{-1, rep_prefix_4_byte}}}, + DUMMY_STRINGOP_ALGS} }; static const @@ -332,6 +350,17 @@ struct processor_costs pentiumpro_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ + /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes (we ensure + the alignment). For small blocks inline loop is still a noticeable win, for bigger + blocks either rep movsl or rep movsb is way to go. Rep movsb has apparently + more expensive startup time in CPU, but after 4K the difference is down in the noise. 
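+   (Layout of these tables, as laid out by struct stringop_algs: each
+   descriptor first names the algorithm used when the block size is not
+   known at compile time, followed by {max_size, algorithm} pairs for
+   known sizes, a max_size of -1 covering all larger sizes.  Of the two
+   pairs of descriptors the first is for memcpy and the second for
+   memset; within each pair the first entry is used for 32-bit and the
+   second for 64-bit code, with DUMMY_STRINGOP_ALGS standing in where no
+   64-bit tuning is provided.)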
+ */ + {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop}, + {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}}, + DUMMY_STRINGOP_ALGS}, + {{rep_prefix_4_byte, {{1024, unrolled_loop}, + {8192, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS} }; static const @@ -386,6 +415,10 @@ struct processor_costs geode_cost = { COSTS_N_INSNS (1), /* cost of FABS instruction. */ COSTS_N_INSNS (1), /* cost of FCHS instruction. */ COSTS_N_INSNS (54), /* cost of FSQRT instruction. */ + {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS} }; static const @@ -439,6 +472,10 @@ struct processor_costs k6_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ + {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS} }; static const @@ -492,6 +529,13 @@ struct processor_costs athlon_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + /* For some reason, Athlon deals better with REP prefix (relative to loops) + compared to K8. Alignment becomes important after 8 bytes for memcpy and + 128 bytes for memset. */ + {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS} }; static const @@ -550,6 +594,88 @@ struct processor_costs k8_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + /* K8 has optimized REP instruction for medium sized blocks, but for very small + blocks it is better to use loop. For large blocks, libcall can do + nontemporary accesses and beat inline considerably. */ + {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}}, + {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + {{libcall, {{8, loop}, {24, unrolled_loop}, + {2048, rep_prefix_4_byte}, {-1, libcall}}}, + {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}} +}; + +struct processor_costs amdfam10_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (2), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (5)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (35), /* HI */ + COSTS_N_INSNS (51), /* SI */ + COSTS_N_INSNS (83), /* DI */ + COSTS_N_INSNS (83)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
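+					   (e.g. 3 here is 1.5 times the
+					   cost of a reg-reg move).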
*/ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 4, 3}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 5}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + /* On K8 + MOVD reg64, xmmreg Double FSTORE 4 + MOVD reg32, xmmreg Double FSTORE 4 + On AMDFAM10 + MOVD reg64, xmmreg Double FADD 3 + 1/1 1/1 + MOVD reg32, xmmreg Double FADD 3 + 1/1 1/1 */ + 64, /* size of prefetch block */ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches */ + 5, /* Branch cost */ + COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (4), /* cost of FMUL instruction. */ + COSTS_N_INSNS (19), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + + /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for + very small blocks it is better to use loop. For large blocks, libcall can + do nontemporary accesses and beat inline considerably. */ + {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}}, + {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + {{libcall, {{8, loop}, {24, unrolled_loop}, + {2048, rep_prefix_4_byte}, {-1, libcall}}}, + {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}} }; static const @@ -603,6 +729,11 @@ struct processor_costs pentium4_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (43), /* cost of FSQRT instruction. */ + {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}}, + DUMMY_STRINGOP_ALGS}, + {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte}, + {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, }; static const @@ -656,6 +787,72 @@ struct processor_costs nocona_cost = { COSTS_N_INSNS (3), /* cost of FABS instruction. */ COSTS_N_INSNS (3), /* cost of FCHS instruction. */ COSTS_N_INSNS (44), /* cost of FSQRT instruction. 
*/ + {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}}, + {libcall, {{32, loop}, {20000, rep_prefix_8_byte}, + {100000, unrolled_loop}, {-1, libcall}}}}, + {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte}, + {-1, libcall}}}, + {libcall, {{24, loop}, {64, unrolled_loop}, + {8192, rep_prefix_8_byte}, {-1, libcall}}}} +}; + +static const +struct processor_costs core2_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (3), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (3), /* DI */ + COSTS_N_INSNS (3)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (22), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (22), /* HI */ + COSTS_N_INSNS (22), /* SI */ + COSTS_N_INSNS (22), /* DI */ + COSTS_N_INSNS (22)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 16, /* MOVE_RATIO */ + 2, /* cost for loading QImode using movzbl */ + {6, 6, 6}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {4, 4, 4}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {6, 6, 6}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 4, 4}, /* cost of loading integer registers */ + 2, /* cost of moving MMX register */ + {6, 6}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {6, 6, 6}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 4}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 2, /* MMX or SSE register to integer */ + 128, /* size of prefetch block */ + 8, /* number of parallel prefetches */ + 3, /* Branch cost */ + COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (5), /* cost of FMUL instruction. */ + COSTS_N_INSNS (32), /* cost of FDIV instruction. */ + COSTS_N_INSNS (1), /* cost of FABS instruction. */ + COSTS_N_INSNS (1), /* cost of FCHS instruction. */ + COSTS_N_INSNS (58), /* cost of FSQRT instruction. */ + {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}}, + {libcall, {{32, loop}, {64, rep_prefix_4_byte}, + {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + {{libcall, {{8, loop}, {15, unrolled_loop}, + {2048, rep_prefix_4_byte}, {-1, libcall}}}, + {libcall, {{24, loop}, {32, unrolled_loop}, + {8192, rep_prefix_8_byte}, {-1, libcall}}}} }; /* Generic64 should produce code tuned for Nocona and K8. */ @@ -716,6 +913,10 @@ struct processor_costs generic64_cost = { COSTS_N_INSNS (8), /* cost of FABS instruction. */ COSTS_N_INSNS (8), /* cost of FCHS instruction. */ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ + {DUMMY_STRINGOP_ALGS, + {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + {DUMMY_STRINGOP_ALGS, + {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}} }; /* Generic32 should produce code tuned for Athlon, PPro, Pentium4, Nocona and K8. */ @@ -770,6 +971,10 @@ struct processor_costs generic32_cost = { COSTS_N_INSNS (8), /* cost of FABS instruction. */ COSTS_N_INSNS (8), /* cost of FCHS instruction. */ COSTS_N_INSNS (40), /* cost of FSQRT instruction. 
*/ + {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, }; const struct processor_costs *ix86_cost = &pentium_cost; @@ -779,121 +984,274 @@ const struct processor_costs *ix86_cost = &pentium_cost; #define m_486 (1< not supported yet"); if ((TARGET_64BIT != 0) != ((target_flags & MASK_64BIT) != 0)) sorry ("%i-bit mode not compiled in", (target_flags & MASK_64BIT) ? 64 : 32); @@ -1757,6 +1930,19 @@ override_options (void) target_flags |= MASK_SSSE3; if (processor_alias_table[i].flags & PTA_PREFETCH_SSE) x86_prefetch_sse = true; + if (processor_alias_table[i].flags & PTA_CX16) + x86_cmpxchg16b = true; + if (processor_alias_table[i].flags & PTA_POPCNT + && !(target_flags_explicit & MASK_POPCNT)) + target_flags |= MASK_POPCNT; + if (processor_alias_table[i].flags & PTA_ABM + && !(target_flags_explicit & MASK_ABM)) + target_flags |= MASK_ABM; + if (processor_alias_table[i].flags & PTA_SSE4A + && !(target_flags_explicit & MASK_SSE4A)) + target_flags |= MASK_SSE4A; + if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))) + x86_sahf = true; if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT)) error ("CPU you selected does not support x86-64 " "instruction set"); @@ -1766,6 +1952,10 @@ override_options (void) if (i == pta_size) error ("bad value (%s) for -march= switch", ix86_arch_string); + ix86_arch_mask = 1u << ix86_arch; + for (i = 0; i < X86_ARCH_LAST; ++i) + ix86_arch_features[i] &= ix86_arch_mask; + for (i = 0; i < pta_size; i++) if (! strcmp (ix86_tune_string, processor_alias_table[i].name)) { @@ -1797,6 +1987,10 @@ override_options (void) if (i == pta_size) error ("bad value (%s) for -mtune= switch", ix86_tune_string); + ix86_tune_mask = 1u << ix86_tune; + for (i = 0; i < X86_TUNE_LAST; ++i) + ix86_tune_features[i] &= ix86_tune_mask; + if (optimize_size) ix86_cost = &size_cost; else @@ -1810,15 +2004,16 @@ override_options (void) /* Validate -mregparm= value. */ if (ix86_regparm_string) { + if (TARGET_64BIT) + warning (0, "-mregparm is ignored in 64-bit mode"); i = atoi (ix86_regparm_string); if (i < 0 || i > REGPARM_MAX) error ("-mregparm=%d is not between 0 and %d", i, REGPARM_MAX); else ix86_regparm = i; } - else - if (TARGET_64BIT) - ix86_regparm = REGPARM_MAX; + if (TARGET_64BIT) + ix86_regparm = REGPARM_MAX; /* If the user has provided any of the -malign-* options, warn and use that value only if -falign-* is not set. @@ -1910,6 +2105,13 @@ override_options (void) ix86_tls_dialect_string); } + if (ix87_precision_string) + { + i = atoi (ix87_precision_string); + if (i != 32 && i != 64 && i != 80) + error ("pc%d is not valid precision setting (32, 64 or 80)", i); + } + /* Keep nonleaf frame pointers. */ if (flag_omit_frame_pointer) target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER; @@ -1923,7 +2125,7 @@ override_options (void) /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387, since the insns won't need emulation. */ - if (x86_arch_always_fancy_math_387 & (1 << ix86_arch)) + if (x86_arch_always_fancy_math_387 & ix86_arch_mask) target_flags &= ~MASK_NO_FANCY_MATH_387; /* Likewise, if the target doesn't have a 387, or we've specified @@ -1935,6 +2137,10 @@ override_options (void) if (TARGET_SSSE3) target_flags |= MASK_SSE3; + /* Turn on SSE3 builtins for -msse4a. */ + if (TARGET_SSE4A) + target_flags |= MASK_SSE3; + /* Turn on SSE2 builtins for -msse3. 
*/ if (TARGET_SSE3) target_flags |= MASK_SSE2; @@ -1954,20 +2160,22 @@ override_options (void) if (TARGET_3DNOW) target_flags |= MASK_MMX; + /* Turn on POPCNT builtins for -mabm. */ + if (TARGET_ABM) + target_flags |= MASK_POPCNT; + if (TARGET_64BIT) { - if (TARGET_ALIGN_DOUBLE) - error ("-malign-double makes no sense in the 64bit mode"); if (TARGET_RTD) - error ("-mrtd calling convention not supported in the 64bit mode"); + warning (0, "-mrtd is ignored in 64bit mode"); /* Enable by default the SSE and MMX builtins. Do allow the user to explicitly disable any of these. In particular, disabling SSE and MMX for kernel code is extremely useful. */ target_flags - |= ((MASK_SSE2 | MASK_SSE | MASK_MMX | MASK_128BIT_LONG_DOUBLE) + |= ((MASK_SSE2 | MASK_SSE | MASK_MMX | TARGET_SUBTARGET64_DEFAULT) & ~target_flags_explicit); - } + } else { /* i386 ABI does not specify red zone. It still makes sense to use it @@ -1991,18 +2199,12 @@ override_options (void) ix86_preferred_stack_boundary = (1 << i) * BITS_PER_UNIT; } - /* Accept -mx87regparm only if 80387 support is enabled. */ - if (TARGET_X87REGPARM - && ! TARGET_80387) - error ("-mx87regparm used without 80387 enabled"); - /* Accept -msseregparm only if at least SSE support is enabled. */ if (TARGET_SSEREGPARM && ! TARGET_SSE) error ("-msseregparm used without SSE enabled"); ix86_fpmath = TARGET_FPMATH_DEFAULT; - if (ix86_fpmath_string != 0) { if (! strcmp (ix86_fpmath_string, "387")) @@ -2041,7 +2243,7 @@ override_options (void) if (!TARGET_80387) target_flags &= ~MASK_FLOAT_RETURNS; - if ((x86_accumulate_outgoing_args & TUNEMASK) + if ((x86_accumulate_outgoing_args & ix86_tune_mask) && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS) && !optimize_size) target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; @@ -2061,6 +2263,11 @@ override_options (void) target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; } + /* For sane SSE instruction set generation we need fcomi instruction. + It is safe to enable all CMOVE instructions. */ + if (TARGET_SSE) + TARGET_CMOVE = 1; + /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */ { char *p; @@ -2082,11 +2289,47 @@ override_options (void) set_param_value ("l1-cache-line-size", ix86_cost->prefetch_block); } -/* switch to the appropriate section for output of DECL. +/* Return true if this goes in large data/bss. */ + +static bool +ix86_in_large_data_p (tree exp) +{ + if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC) + return false; + + /* Functions are never large data. */ + if (TREE_CODE (exp) == FUNCTION_DECL) + return false; + + if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp)) + { + const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp)); + if (strcmp (section, ".ldata") == 0 + || strcmp (section, ".lbss") == 0) + return true; + return false; + } + else + { + HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp)); + + /* If this is an incomplete type with size 0, then we can't put it + in data because it might be too big when completed. */ + if (!size || size > ix86_section_threshold) + return true; + } + + return false; +} + +/* Switch to the appropriate section for output of DECL. DECL is either a `VAR_DECL' node or a constant of some sort. RELOC indicates whether forming the initial value of DECL requires link-time relocations. 
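   (In the medium code models, ix86_in_large_data_p above sends objects
   larger than ix86_section_threshold to the large data sections .ldata
   and .lbss.)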
*/ +static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT) + ATTRIBUTE_UNUSED; + static section * x86_64_elf_select_section (tree decl, int reloc, unsigned HOST_WIDE_INT align) @@ -2096,7 +2339,7 @@ x86_64_elf_select_section (tree decl, int reloc, { const char *sname = NULL; unsigned int flags = SECTION_WRITE; - switch (categorize_decl_for_section (decl, reloc, flag_pic)) + switch (categorize_decl_for_section (decl, reloc)) { case SECCAT_DATA: sname = ".ldata"; @@ -2153,7 +2396,7 @@ x86_64_elf_select_section (tree decl, int reloc, RELOC indicates whether the initial value of EXP requires link-time relocations. */ -static void +static void ATTRIBUTE_UNUSED x86_64_elf_unique_section (tree decl, int reloc) { if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC) @@ -2163,7 +2406,7 @@ x86_64_elf_unique_section (tree decl, int reloc) /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */ bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP; - switch (categorize_decl_for_section (decl, reloc, flag_pic)) + switch (categorize_decl_for_section (decl, reloc)) { case SECCAT_DATA: case SECCAT_DATA_REL: @@ -2234,6 +2477,7 @@ x86_elf_aligned_common (FILE *file, fprintf (file, ","HOST_WIDE_INT_PRINT_UNSIGNED",%u\n", size, align / BITS_PER_UNIT); } +#endif /* Utility function for targets to use in implementing ASM_OUTPUT_ALIGNED_BSS. */ @@ -2258,7 +2502,6 @@ x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED, #endif /* ASM_DECLARE_OBJECT_NAME */ ASM_OUTPUT_SKIP (file, size ? size : 1); } -#endif void optimization_options (int level, int size ATTRIBUTE_UNUSED) @@ -2288,43 +2531,6 @@ optimization_options (int level, int size ATTRIBUTE_UNUSED) #endif } -/* Table of valid machine attributes. */ -const struct attribute_spec ix86_attribute_table[] = -{ - /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler } */ - /* Stdcall attribute says callee is responsible for popping arguments - if they are not variable. */ - { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute }, - /* Fastcall attribute says callee is responsible for popping arguments - if they are not variable. */ - { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute }, - /* Cdecl attribute says the callee is a normal C declaration */ - { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute }, - /* Regparm attribute specifies how many integer arguments are to be - passed in registers. */ - { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute }, - /* X87regparm attribute says we are passing floating point arguments - in 80387 registers. */ - { "x87regparm", 0, 0, false, true, true, ix86_handle_cconv_attribute }, - /* Sseregparm attribute says we are using x86_64 calling conventions - for FP arguments. */ - { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute }, - /* force_align_arg_pointer says this function realigns the stack at entry. 
*/ - { (const char *)&ix86_force_align_arg_pointer_string, 0, 0, - false, true, true, ix86_handle_cconv_attribute }, -#if TARGET_DLLIMPORT_DECL_ATTRIBUTES - { "dllimport", 0, 0, false, false, false, handle_dll_attribute }, - { "dllexport", 0, 0, false, false, false, handle_dll_attribute }, - { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute }, -#endif - { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute }, - { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute }, -#ifdef SUBTARGET_ATTRIBUTE_TABLE - SUBTARGET_ATTRIBUTE_TABLE, -#endif - { NULL, 0, 0, false, false, false, NULL } -}; - /* Decide whether we can make a sibling call to a function. DECL is the declaration of the function being targeted by the call and EXP is the CALL_EXPR representing the call. */ @@ -2345,7 +2551,7 @@ ix86_function_ok_for_sibcall (tree decl, tree exp) func = decl; else { - func = TREE_TYPE (TREE_OPERAND (exp, 0)); + func = TREE_TYPE (CALL_EXPR_FN (exp)); if (POINTER_TYPE_P (func)) func = TREE_TYPE (func); } @@ -2380,7 +2586,7 @@ ix86_function_ok_for_sibcall (tree decl, tree exp) tree type; /* We're looking at the CALL_EXPR, we need the type of the function. */ - type = TREE_OPERAND (exp, 0); /* pointer expression */ + type = CALL_EXPR_FN (exp); /* pointer expression */ type = TREE_TYPE (type); /* pointer type */ type = TREE_TYPE (type); /* function type */ @@ -2392,12 +2598,11 @@ ix86_function_ok_for_sibcall (tree decl, tree exp) } } -#if TARGET_DLLIMPORT_DECL_ATTRIBUTES /* Dllimport'd functions are also called indirectly. */ - if (decl && DECL_DLLIMPORT_P (decl) + if (TARGET_DLLIMPORT_DECL_ATTRIBUTES + && decl && DECL_DLLIMPORT_P (decl) && ix86_function_regparm (TREE_TYPE (decl), NULL) >= 3) return false; -#endif /* If we forced aligned the stack, then sibcalling would unalign the stack, which may break the called function. */ @@ -2408,8 +2613,8 @@ ix86_function_ok_for_sibcall (tree decl, tree exp) return true; } -/* Handle "cdecl", "stdcall", "fastcall", "regparm", "x87regparm" - and "sseregparm" calling convention attributes; +/* Handle "cdecl", "stdcall", "fastcall", "regparm" and "sseregparm" + calling convention attributes; arguments as in struct attribute_spec.handler. */ static tree @@ -2468,14 +2673,15 @@ ix86_handle_cconv_attribute (tree *node, tree name, if (TARGET_64BIT) { - warning (OPT_Wattributes, "%qs attribute ignored", - IDENTIFIER_POINTER (name)); + /* Do not warn when emulating the MS ABI. */ + if (!TARGET_64BIT_MS_ABI) + warning (OPT_Wattributes, "%qs attribute ignored", + IDENTIFIER_POINTER (name)); *no_add_attrs = true; return NULL_TREE; } - /* Can combine fastcall with stdcall (redundant), x87regparm - and sseregparm. */ + /* Can combine fastcall with stdcall (redundant) and sseregparm. */ if (is_attribute_p ("fastcall", name)) { if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) @@ -2492,8 +2698,8 @@ ix86_handle_cconv_attribute (tree *node, tree name, } } - /* Can combine stdcall with fastcall (redundant), regparm, - x87regparm and sseregparm. */ + /* Can combine stdcall with fastcall (redundant), regparm and + sseregparm. */ else if (is_attribute_p ("stdcall", name)) { if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) @@ -2506,7 +2712,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, } } - /* Can combine cdecl with regparm, x87regparm and sseregparm. */ + /* Can combine cdecl with regparm and sseregparm. 
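+     (For example, "void __attribute__ ((cdecl, regparm (2), sseregparm))
+     f (float);" combines all three.)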
*/ else if (is_attribute_p ("cdecl", name)) { if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) @@ -2519,7 +2725,7 @@ ix86_handle_cconv_attribute (tree *node, tree name, } } - /* Can combine x87regparm or sseregparm with all attributes. */ + /* Can combine sseregparm with all attributes. */ return NULL_TREE; } @@ -2544,11 +2750,6 @@ ix86_comp_type_attributes (tree type1, tree type2) != ix86_function_regparm (type2, NULL))) return 0; - /* Check for mismatched x87regparm types. */ - if (!lookup_attribute ("x87regparm", TYPE_ATTRIBUTES (type1)) - != !lookup_attribute ("x87regparm", TYPE_ATTRIBUTES (type2))) - return 0; - /* Check for mismatched sseregparm types. */ if (!lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type1)) != !lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type2))) @@ -2571,127 +2772,81 @@ ix86_function_regparm (tree type, tree decl) { tree attr; int regparm = ix86_regparm; - bool user_convention = false; - if (!TARGET_64BIT) - { - attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type)); - if (attr) - { - regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))); - user_convention = true; - } + if (TARGET_64BIT) + return regparm; - if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type))) - { - regparm = 2; - user_convention = true; - } - - /* Use register calling convention for local functions when possible. */ - if (!TARGET_64BIT && !user_convention && decl - && flag_unit_at_a_time && !profile_flag) - { - struct cgraph_local_info *i = cgraph_local_info (decl); - if (i && i->local) - { - int local_regparm, globals = 0, regno; - - /* Make sure no regparm register is taken by a global register - variable. */ - for (local_regparm = 0; local_regparm < 3; local_regparm++) - if (global_regs[local_regparm]) - break; - /* We can't use regparm(3) for nested functions as these use - static chain pointer in third argument. */ - if (local_regparm == 3 - && decl_function_context (decl) - && !DECL_NO_STATIC_CHAIN (decl)) - local_regparm = 2; - /* If the function realigns its stackpointer, the - prologue will clobber %ecx. If we've already - generated code for the callee, the callee - DECL_STRUCT_FUNCTION is gone, so we fall back to - scanning the attributes for the self-realigning - property. */ - if ((DECL_STRUCT_FUNCTION (decl) - && DECL_STRUCT_FUNCTION (decl)->machine->force_align_arg_pointer) - || (!DECL_STRUCT_FUNCTION (decl) - && lookup_attribute (ix86_force_align_arg_pointer_string, - TYPE_ATTRIBUTES (TREE_TYPE (decl))))) - local_regparm = 2; - /* Each global register variable increases register preassure, - so the more global reg vars there are, the smaller regparm - optimization use, unless requested by the user explicitly. */ - for (regno = 0; regno < 6; regno++) - if (global_regs[regno]) - globals++; - local_regparm - = globals < local_regparm ? local_regparm - globals : 0; - - if (local_regparm > regparm) - regparm = local_regparm; - } - } - } - return regparm; -} + attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type)); + if (attr) + return TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))); -/* Return 1 if we can pass up to X87_REGPARM_MAX floating point - arguments in x87 registers for a function with the indicated - TYPE and DECL. DECL may be NULL when calling function indirectly - or considering a libcall. For local functions, return 2. - Otherwise return 0. 
*/ + if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type))) + return 2; -static int -ix86_function_x87regparm (tree type, tree decl) -{ - /* Use x87 registers to pass floating point arguments if requested - by the x87regparm attribute. */ - if (TARGET_X87REGPARM - || (type - && lookup_attribute ("x87regparm", TYPE_ATTRIBUTES (type)))) + /* Use register calling convention for local functions when possible. */ + if (decl && flag_unit_at_a_time && !profile_flag) { - if (!TARGET_80387) + struct cgraph_local_info *i = cgraph_local_info (decl); + if (i && i->local) { - if (decl) - error ("Calling %qD with attribute x87regparm without " - "80387 enabled", decl); - else - error ("Calling %qT with attribute x87regparm without " - "80387 enabled", type); - return 0; - } + int local_regparm, globals = 0, regno; + struct function *f; - return 1; - } + /* Make sure no regparm register is taken by a + global register variable. */ + for (local_regparm = 0; local_regparm < 3; local_regparm++) + if (global_regs[local_regparm]) + break; - /* For local functions, pass up to X87_REGPARM_MAX floating point - arguments in x87 registers. */ - if (!TARGET_64BIT && decl - && flag_unit_at_a_time && !profile_flag) - { - struct cgraph_local_info *i = cgraph_local_info (decl); - if (i && i->local) - return 2; + /* We can't use regparm(3) for nested functions as these use + static chain pointer in third argument. */ + if (local_regparm == 3 + && decl_function_context (decl) + && !DECL_NO_STATIC_CHAIN (decl)) + local_regparm = 2; + + /* If the function realigns its stackpointer, the prologue will + clobber %ecx. If we've already generated code for the callee, + the callee DECL_STRUCT_FUNCTION is gone, so we fall back to + scanning the attributes for the self-realigning property. */ + f = DECL_STRUCT_FUNCTION (decl); + if (local_regparm == 3 + && (f ? !!f->machine->force_align_arg_pointer + : !!lookup_attribute (ix86_force_align_arg_pointer_string, + TYPE_ATTRIBUTES (TREE_TYPE (decl))))) + local_regparm = 2; + + /* Each global register variable increases register preassure, + so the more global reg vars there are, the smaller regparm + optimization use, unless requested by the user explicitly. */ + for (regno = 0; regno < 6; regno++) + if (global_regs[regno]) + globals++; + local_regparm + = globals < local_regparm ? local_regparm - globals : 0; + + if (local_regparm > regparm) + regparm = local_regparm; + } } - return 0; + return regparm; } -/* Return 1 or 2, if we can pass up to 8 SFmode (1) and DFmode (2) arguments - in SSE registers for a function with the indicated TYPE and DECL. - DECL may be NULL when calling function indirectly - or considering a libcall. Otherwise return 0. */ +/* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and + DFmode (2) arguments in SSE registers for a function with the + indicated TYPE and DECL. DECL may be NULL when calling function + indirectly or considering a libcall. Otherwise return 0. */ static int ix86_function_sseregparm (tree type, tree decl) { + gcc_assert (!TARGET_64BIT); + /* Use SSE registers to pass SFmode and DFmode arguments if requested by the sseregparm attribute. 
*/ if (TARGET_SSEREGPARM - || (type - && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type)))) + || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type)))) { if (!TARGET_SSE) { @@ -2708,10 +2863,8 @@ ix86_function_sseregparm (tree type, tree decl) } /* For local functions, pass up to SSE_REGPARM_MAX SFmode - (and DFmode for SSE2) arguments in SSE registers, - even for 32-bit targets. */ - if (!TARGET_64BIT && decl - && TARGET_SSE_MATH && flag_unit_at_a_time && !profile_flag) + (and DFmode for SSE2) arguments in SSE registers. */ + if (decl && TARGET_SSE_MATH && flag_unit_at_a_time && !profile_flag) { struct cgraph_local_info *i = cgraph_local_info (decl); if (i && i->local) @@ -2737,6 +2890,19 @@ ix86_eax_live_at_start_p (void) return REGNO_REG_SET_P (ENTRY_BLOCK_PTR->il.rtl->global_live_at_end, 0); } +/* Return true if TYPE has a variable argument list. */ + +static bool +type_has_variadic_args_p (tree type) +{ + tree t; + + for (t = TYPE_ARG_TYPES (type); t; t = TREE_CHAIN (t)) + if (t == void_list_node) + return false; + return true; +} + /* Value is the number of bytes of arguments automatically popped when returning from a subroutine call. FUNDECL is the declaration node of the function (as a tree), @@ -2757,32 +2923,33 @@ ix86_eax_live_at_start_p (void) int ix86_return_pops_args (tree fundecl, tree funtype, int size) { - int rtd = TARGET_RTD && (!fundecl || TREE_CODE (fundecl) != IDENTIFIER_NODE); + int rtd; + + /* None of the 64-bit ABIs pop arguments. */ + if (TARGET_64BIT) + return 0; + + rtd = TARGET_RTD && (!fundecl || TREE_CODE (fundecl) != IDENTIFIER_NODE); /* Cdecl functions override -mrtd, and never pop the stack. */ - if (! lookup_attribute ("cdecl", TYPE_ATTRIBUTES (funtype))) { - - /* Stdcall and fastcall functions will pop the stack if not - variable args. */ - if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (funtype)) - || lookup_attribute ("fastcall", TYPE_ATTRIBUTES (funtype))) - rtd = 1; - - if (rtd - && (TYPE_ARG_TYPES (funtype) == NULL_TREE - || (TREE_VALUE (tree_last (TYPE_ARG_TYPES (funtype))) - == void_type_node))) - return size; - } + if (! lookup_attribute ("cdecl", TYPE_ATTRIBUTES (funtype))) + { + /* Stdcall and fastcall functions will pop the stack if not + variable args. */ + if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (funtype)) + || lookup_attribute ("fastcall", TYPE_ATTRIBUTES (funtype))) + rtd = 1; + + if (rtd && ! type_has_variadic_args_p (funtype)) + return size; + } /* Lose any fake structure return argument if it is passed on the stack. 
*/ if (aggregate_value_p (TREE_TYPE (funtype), fundecl) - && !TARGET_64BIT && !KEEP_AGGREGATE_RETURN_POINTER) { int nregs = ix86_function_regparm (funtype, fundecl); - - if (!nregs) + if (nregs == 0) return GET_MODE_SIZE (Pmode); } @@ -2796,23 +2963,43 @@ bool ix86_function_arg_regno_p (int regno) { int i; + const int *parm_regs; + if (!TARGET_64BIT) - return (regno < REGPARM_MAX - || (TARGET_80387 && FP_REGNO_P (regno) - && (regno < FIRST_FLOAT_REG + X87_REGPARM_MAX)) - || (TARGET_MMX && MMX_REGNO_P (regno) - && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX)) - || (TARGET_SSE && SSE_REGNO_P (regno) - && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))); - - if (TARGET_SSE && SSE_REGNO_P (regno) - && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)) - return true; + { + if (TARGET_MACHO) + return (regno < REGPARM_MAX + || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno])); + else + return (regno < REGPARM_MAX + || (TARGET_MMX && MMX_REGNO_P (regno) + && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX)) + || (TARGET_SSE && SSE_REGNO_P (regno) + && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))); + } + + if (TARGET_MACHO) + { + if (SSE_REGNO_P (regno) && TARGET_SSE) + return true; + } + else + { + if (TARGET_SSE && SSE_REGNO_P (regno) + && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)) + return true; + } + /* RAX is used as hidden argument to va_arg functions. */ - if (!regno) + if (!TARGET_64BIT_MS_ABI && regno == 0) return true; + + if (TARGET_64BIT_MS_ABI) + parm_regs = x86_64_ms_abi_int_parameter_registers; + else + parm_regs = x86_64_int_parameter_registers; for (i = 0; i < REGPARM_MAX; i++) - if (regno == x86_64_int_parameter_registers[i]) + if (regno == parm_regs[i]) return true; return false; } @@ -2842,96 +3029,49 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ rtx libname, /* SYMBOL_REF of library name or 0 */ tree fndecl) { - static CUMULATIVE_ARGS zero_cum; - tree param, next_param; - - if (TARGET_DEBUG_ARG) - { - fprintf (stderr, "\ninit_cumulative_args ("); - if (fntype) - fprintf (stderr, "fntype code = %s, ret code = %s", - tree_code_name[(int) TREE_CODE (fntype)], - tree_code_name[(int) TREE_CODE (TREE_TYPE (fntype))]); - else - fprintf (stderr, "no fntype"); - - if (libname) - fprintf (stderr, ", libname = %s", XSTR (libname, 0)); - } - - *cum = zero_cum; + memset (cum, 0, sizeof (*cum)); /* Set up the number of registers to use for passing arguments. */ cum->nregs = ix86_regparm; - if (TARGET_80387) - cum->x87_nregs = X87_REGPARM_MAX; if (TARGET_SSE) cum->sse_nregs = SSE_REGPARM_MAX; if (TARGET_MMX) cum->mmx_nregs = MMX_REGPARM_MAX; cum->warn_sse = true; cum->warn_mmx = true; - cum->maybe_vaarg = false; + cum->maybe_vaarg = (fntype ? type_has_variadic_args_p (fntype) : !libname); - /* Use ecx and edx registers if function has fastcall attribute, - else look for regparm information. */ - if (fntype && !TARGET_64BIT) + if (!TARGET_64BIT) { - if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype))) + /* If there are variable arguments, then we won't pass anything + in registers in 32-bit mode. */ + if (cum->maybe_vaarg) { - cum->nregs = 2; - cum->fastcall = 1; + cum->nregs = 0; + cum->sse_nregs = 0; + cum->mmx_nregs = 0; + cum->warn_sse = 0; + cum->warn_mmx = 0; + return; } - else - cum->nregs = ix86_function_regparm (fntype, fndecl); - } - - /* Set up the number of 80387 registers used for passing - floating point arguments. Warn for mismatching ABI. 
*/ - cum->float_in_x87 = ix86_function_x87regparm (fntype, fndecl); - - /* Set up the number of SSE registers used for passing SFmode - and DFmode arguments. Warn for mismatching ABI. */ - cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl); - /* Determine if this function has variable arguments. This is - indicated by the last argument being 'void_type_mode' if there - are no variable arguments. If there are variable arguments, then - we won't pass anything in registers in 32-bit mode. */ - - if (cum->nregs || cum->mmx_nregs - || cum->x87_nregs || cum->sse_nregs) - { - for (param = (fntype) ? TYPE_ARG_TYPES (fntype) : 0; - param != 0; param = next_param) + /* Use ecx and edx registers if function has fastcall attribute, + else look for regparm information. */ + if (fntype) { - next_param = TREE_CHAIN (param); - if (next_param == 0 && TREE_VALUE (param) != void_type_node) + if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype))) { - if (!TARGET_64BIT) - { - cum->nregs = 0; - cum->x87_nregs = 0; - cum->sse_nregs = 0; - cum->mmx_nregs = 0; - cum->warn_sse = 0; - cum->warn_mmx = 0; - cum->fastcall = 0; - cum->float_in_x87 = 0; - cum->float_in_sse = 0; - } - cum->maybe_vaarg = true; + cum->nregs = 2; + cum->fastcall = 1; } + else + cum->nregs = ix86_function_regparm (fntype, fndecl); } - } - if ((!fntype && !libname) - || (fntype && !TYPE_ARG_TYPES (fntype))) - cum->maybe_vaarg = true; - if (TARGET_DEBUG_ARG) - fprintf (stderr, ", nregs=%d )\n", cum->nregs); - - return; + /* Set up the number of SSE registers used for passing SFmode + and DFmode arguments. Warn for mismatching ABI. */ + cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl); + } } /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE. @@ -3394,20 +3534,6 @@ construct_container (enum machine_mode mode, enum machine_mode orig_mode, rtx ret; n = classify_argument (mode, type, class, 0); - if (TARGET_DEBUG_ARG) - { - if (!n) - fprintf (stderr, "Memory class\n"); - else - { - fprintf (stderr, "Classes:"); - for (i = 0; i < n; i++) - { - fprintf (stderr, " %s", x86_64_reg_class_name[class[i]]); - } - fprintf (stderr, "\n"); - } - } if (!n) return NULL; if (!examine_argument (mode, type, in_return, &needed_intregs, @@ -3476,6 +3602,7 @@ construct_container (enum machine_mode mode, enum machine_mode orig_mode, if (n == 2 && class[0] == X86_64_SSE_CLASS && class[1] == X86_64_SSEUP_CLASS && mode != BLKmode) return gen_rtx_REG (mode, SSE_REGNO (sse_regno)); + if (n == 2 && class[0] == X86_64_X87_CLASS && class[1] == X86_64_X87UP_CLASS) return gen_rtx_REG (XFmode, FIRST_STACK_REG); @@ -3551,140 +3678,141 @@ construct_container (enum machine_mode mode, enum machine_mode orig_mode, return ret; } -/* Update the data in CUM to advance over an argument - of mode MODE and data type TYPE. - (TYPE is null for libcalls where that information may not be available.) */ +/* Update the data in CUM to advance over an argument of mode MODE + and data type TYPE. (TYPE is null for libcalls where that information + may not be available.) */ -void -function_arg_advance (CUMULATIVE_ARGS *cum, enum machine_mode mode, - tree type, int named) +static void +function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode, + tree type, HOST_WIDE_INT bytes, HOST_WIDE_INT words) { - int bytes = - (mode == BLKmode) ? 
int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode); - int words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD; + switch (mode) + { + default: + break; - if (type) - mode = type_natural_mode (type); + case BLKmode: + if (bytes < 0) + break; + /* FALLTHRU */ - if (TARGET_DEBUG_ARG) - fprintf (stderr, "function_adv (sz=%d, wds=%2d, nregs=%d, ssenregs=%d, " - "mode=%s, named=%d)\n\n", - words, cum->words, cum->nregs, cum->sse_nregs, - GET_MODE_NAME (mode), named); + case DImode: + case SImode: + case HImode: + case QImode: + cum->words += words; + cum->nregs -= words; + cum->regno += words; - if (TARGET_64BIT) - { - int int_nregs, sse_nregs; - if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)) - cum->words += words; - else if (sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs) + if (cum->nregs <= 0) { - cum->nregs -= int_nregs; - cum->sse_nregs -= sse_nregs; - cum->regno += int_nregs; - cum->sse_regno += sse_nregs; + cum->nregs = 0; + cum->regno = 0; } - else - cum->words += words; - } - else - { - switch (mode) - { - default: - break; + break; - case BLKmode: - if (bytes < 0) - break; - /* FALLTHRU */ + case DFmode: + if (cum->float_in_sse < 2) + break; + case SFmode: + if (cum->float_in_sse < 1) + break; + /* FALLTHRU */ - case DImode: - case SImode: - case HImode: - case QImode: - cum->words += words; - cum->nregs -= words; - cum->regno += words; + case TImode: + case V16QImode: + case V8HImode: + case V4SImode: + case V2DImode: + case V4SFmode: + case V2DFmode: + if (!type || !AGGREGATE_TYPE_P (type)) + { + cum->sse_words += words; + cum->sse_nregs -= 1; + cum->sse_regno += 1; + if (cum->sse_nregs <= 0) + { + cum->sse_nregs = 0; + cum->sse_regno = 0; + } + } + break; - if (cum->nregs <= 0) + case V8QImode: + case V4HImode: + case V2SImode: + case V2SFmode: + if (!type || !AGGREGATE_TYPE_P (type)) + { + cum->mmx_words += words; + cum->mmx_nregs -= 1; + cum->mmx_regno += 1; + if (cum->mmx_nregs <= 0) { - cum->nregs = 0; - cum->regno = 0; + cum->mmx_nregs = 0; + cum->mmx_regno = 0; } - break; + } + break; + } +} - case SFmode: - if (cum->float_in_sse > 0) - goto skip_80387; +static void +function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode, + tree type, HOST_WIDE_INT words) +{ + int int_nregs, sse_nregs; - case DFmode: - if (cum->float_in_sse > 1) - goto skip_80387; - - /* Because no inherent XFmode->DFmode and XFmode->SFmode - rounding takes place when values are passed in x87 - registers, pass DFmode and SFmode types to local functions - only when flag_unsafe_math_optimizations is set. */ - if (!cum->float_in_x87 - || (cum->float_in_x87 == 2 - && !flag_unsafe_math_optimizations)) - break; + if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)) + cum->words += words; + else if (sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs) + { + cum->nregs -= int_nregs; + cum->sse_nregs -= sse_nregs; + cum->regno += int_nregs; + cum->sse_regno += sse_nregs; + } + else + cum->words += words; +} - case XFmode: - if (!cum->float_in_x87) - break; +static void +function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes, + HOST_WIDE_INT words) +{ + /* Otherwise, this should be passed indirect. 
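+     Anything whose size is not 1, 2, 4 or 8 bytes was already forced
+     through ix86_pass_by_reference, so only register-sized values can
+     reach this point.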
*/ + gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8); - if (!type || !AGGREGATE_TYPE_P (type)) - { - cum->x87_nregs -= 1; - cum->x87_regno += 1; - if (cum->x87_nregs <= 0) - { - cum->x87_nregs = 0; - cum->x87_regno = 0; - } - } - break; + cum->words += words; + if (cum->nregs > 0) + { + cum->nregs -= 1; + cum->regno += 1; + } +} - skip_80387: +void +function_arg_advance (CUMULATIVE_ARGS *cum, enum machine_mode mode, + tree type, int named ATTRIBUTE_UNUSED) +{ + HOST_WIDE_INT bytes, words; - case TImode: - case V16QImode: - case V8HImode: - case V4SImode: - case V2DImode: - case V4SFmode: - case V2DFmode: - if (!type || !AGGREGATE_TYPE_P (type)) - { - cum->sse_nregs -= 1; - cum->sse_regno += 1; - if (cum->sse_nregs <= 0) - { - cum->sse_nregs = 0; - cum->sse_regno = 0; - } - } - break; + if (mode == BLKmode) + bytes = int_size_in_bytes (type); + else + bytes = GET_MODE_SIZE (mode); + words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD; - case V8QImode: - case V4HImode: - case V2SImode: - case V2SFmode: - if (!type || !AGGREGATE_TYPE_P (type)) - { - cum->mmx_nregs -= 1; - cum->mmx_regno += 1; - if (cum->mmx_nregs <= 0) - { - cum->mmx_nregs = 0; - cum->mmx_regno = 0; - } - } - break; - } - } + if (type) + mode = type_natural_mode (type); + + if (TARGET_64BIT_MS_ABI) + function_arg_advance_ms_64 (cum, bytes, words); + else if (TARGET_64BIT) + function_arg_advance_64 (cum, mode, type, words); + else + function_arg_advance_32 (cum, mode, type, bytes, words); } /* Define where to put the arguments to a function. @@ -3700,156 +3828,180 @@ function_arg_advance (CUMULATIVE_ARGS *cum, enum machine_mode mode, NAMED is nonzero if this argument is a named parameter (otherwise it is an extra parameter matching an ellipsis). */ -rtx -function_arg (CUMULATIVE_ARGS *cum, enum machine_mode orig_mode, - tree type, int named) +static rtx +function_arg_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode, + enum machine_mode orig_mode, tree type, + HOST_WIDE_INT bytes, HOST_WIDE_INT words) { - enum machine_mode mode = orig_mode; - rtx ret = NULL_RTX; - int bytes = - (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode); - int words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD; static bool warnedsse, warnedmmx; - /* To simplify the code below, represent vector types with a vector mode - even if MMX/SSE are not active. */ - if (type && TREE_CODE (type) == VECTOR_TYPE) - mode = type_natural_mode (type); - - /* Handle a hidden AL argument containing number of registers for varargs - x86-64 functions. For i386 ABI just return constm1_rtx to avoid - any AL settings. */ + /* Avoid the AL settings for the Unix64 ABI. */ if (mode == VOIDmode) - { - if (TARGET_64BIT) - return GEN_INT (cum->maybe_vaarg - ? (cum->sse_nregs < 0 - ? SSE_REGPARM_MAX - : cum->sse_regno) - : -1); - else - return constm1_rtx; - } - if (TARGET_64BIT) - ret = construct_container (mode, orig_mode, type, 0, cum->nregs, - cum->sse_nregs, - &x86_64_int_parameter_registers [cum->regno], - cum->sse_regno); - else - switch (mode) - { - default: - break; - - case BLKmode: - if (bytes < 0) - break; - /* FALLTHRU */ - case DImode: - case SImode: - case HImode: - case QImode: - if (words <= cum->nregs) - { - int regno = cum->regno; + return constm1_rtx; - /* Fastcall allocates the first two DWORD (SImode) or - smaller arguments to ECX and EDX. */ - if (cum->fastcall) - { - if (mode == BLKmode || mode == DImode) - break; + switch (mode) + { + default: + break; - /* ECX not EAX is the first allocated register. 
*/ - if (regno == 0) - regno = 2; - } - ret = gen_rtx_REG (mode, regno); - } + case BLKmode: + if (bytes < 0) break; + /* FALLTHRU */ + case DImode: + case SImode: + case HImode: + case QImode: + if (words <= cum->nregs) + { + int regno = cum->regno; - case SFmode: - if (cum->float_in_sse > 0) - goto skip_80387; + /* Fastcall allocates the first two DWORD (SImode) or + smaller arguments to ECX and EDX. */ + if (cum->fastcall) + { + if (mode == BLKmode || mode == DImode) + break; - case DFmode: - if (cum->float_in_sse > 1) - goto skip_80387; - - /* Because no inherent XFmode->DFmode and XFmode->SFmode - rounding takes place when values are passed in x87 - registers, pass DFmode and SFmode types to local functions - only when flag_unsafe_math_optimizations is set. */ - if (!cum->float_in_x87 - || (cum->float_in_x87 == 2 - && !flag_unsafe_math_optimizations)) - break; + /* ECX not EAX is the first allocated register. */ + if (regno == 0) + regno = 2; + } + return gen_rtx_REG (mode, regno); + } + break; - case XFmode: - if (!cum->float_in_x87) - break; + case DFmode: + if (cum->float_in_sse < 2) + break; + case SFmode: + if (cum->float_in_sse < 1) + break; + /* FALLTHRU */ + case TImode: + case V16QImode: + case V8HImode: + case V4SImode: + case V2DImode: + case V4SFmode: + case V2DFmode: + if (!type || !AGGREGATE_TYPE_P (type)) + { + if (!TARGET_SSE && !warnedsse && cum->warn_sse) + { + warnedsse = true; + warning (0, "SSE vector argument without SSE enabled " + "changes the ABI"); + } + if (cum->sse_nregs) + return gen_reg_or_parallel (mode, orig_mode, + cum->sse_regno + FIRST_SSE_REG); + } + break; - if (!type || !AGGREGATE_TYPE_P (type)) - if (cum->x87_nregs) - ret = gen_rtx_REG (mode, cum->x87_regno + FIRST_FLOAT_REG); - break; + case V8QImode: + case V4HImode: + case V2SImode: + case V2SFmode: + if (!type || !AGGREGATE_TYPE_P (type)) + { + if (!TARGET_MMX && !warnedmmx && cum->warn_mmx) + { + warnedmmx = true; + warning (0, "MMX vector argument without MMX enabled " + "changes the ABI"); + } + if (cum->mmx_nregs) + return gen_reg_or_parallel (mode, orig_mode, + cum->mmx_regno + FIRST_MMX_REG); + } + break; + } - skip_80387: + return NULL_RTX; +} - case TImode: - case V16QImode: - case V8HImode: - case V4SImode: - case V2DImode: - case V4SFmode: - case V2DFmode: - if (!type || !AGGREGATE_TYPE_P (type)) - { - if (!TARGET_SSE && !warnedsse && cum->warn_sse) - { - warnedsse = true; - warning (0, "SSE vector argument without SSE enabled " - "changes the ABI"); - } - if (cum->sse_nregs) - ret = gen_reg_or_parallel (mode, orig_mode, - cum->sse_regno + FIRST_SSE_REG); - } - break; - case V8QImode: - case V4HImode: - case V2SImode: - case V2SFmode: - if (!type || !AGGREGATE_TYPE_P (type)) - { - if (!TARGET_MMX && !warnedmmx && cum->warn_mmx) - { - warnedmmx = true; - warning (0, "MMX vector argument without MMX enabled " - "changes the ABI"); - } - if (cum->mmx_nregs) - ret = gen_reg_or_parallel (mode, orig_mode, - cum->mmx_regno + FIRST_MMX_REG); - } - break; - } +static rtx +function_arg_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode, + enum machine_mode orig_mode, tree type) +{ + /* Handle a hidden AL argument containing number of registers + for varargs x86-64 functions. */ + if (mode == VOIDmode) + return GEN_INT (cum->maybe_vaarg + ? (cum->sse_nregs < 0 + ? 
SSE_REGPARM_MAX + : cum->sse_regno) + : -1); - if (TARGET_DEBUG_ARG) - { - fprintf (stderr, - "function_arg (size=%d, wds=%2d, nregs=%d, mode=%4s, named=%d, ", - words, cum->words, cum->nregs, GET_MODE_NAME (mode), named); + return construct_container (mode, orig_mode, type, 0, cum->nregs, + cum->sse_nregs, + &x86_64_int_parameter_registers [cum->regno], + cum->sse_regno); +} + +static rtx +function_arg_ms_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode, + enum machine_mode orig_mode, int named) +{ + unsigned int regno; + + /* Avoid the AL settings for the Unix64 ABI. */ + if (mode == VOIDmode) + return constm1_rtx; + + /* If we've run out of registers, it goes on the stack. */ + if (cum->nregs == 0) + return NULL_RTX; - if (ret) - print_simple_rtl (stderr, ret); + regno = x86_64_ms_abi_int_parameter_registers[cum->regno]; + + /* Only floating point modes are passed in anything but integer regs. */ + if (TARGET_SSE && (mode == SFmode || mode == DFmode)) + { + if (named) + regno = cum->regno + FIRST_SSE_REG; else - fprintf (stderr, ", stack"); + { + rtx t1, t2; - fprintf (stderr, " )\n"); + /* Unnamed floating parameters are passed in both the + SSE and integer registers. */ + t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG); + t2 = gen_rtx_REG (mode, regno); + t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx); + t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx); + return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2)); + } } - return ret; + return gen_reg_or_parallel (mode, orig_mode, regno); +} + +rtx +function_arg (CUMULATIVE_ARGS *cum, enum machine_mode omode, + tree type, int named) +{ + enum machine_mode mode = omode; + HOST_WIDE_INT bytes, words; + + if (mode == BLKmode) + bytes = int_size_in_bytes (type); + else + bytes = GET_MODE_SIZE (mode); + words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD; + + /* To simplify the code below, represent vector types with a vector mode + even if MMX/SSE are not active. */ + if (type && TREE_CODE (type) == VECTOR_TYPE) + mode = type_natural_mode (type); + + if (TARGET_64BIT_MS_ABI) + return function_arg_ms_64 (cum, mode, omode, named); + else if (TARGET_64BIT) + return function_arg_64 (cum, mode, omode, type); + else + return function_arg_32 (cum, mode, omode, type, bytes, words); } /* A C expression that indicates when an argument must be passed by @@ -3863,15 +4015,31 @@ ix86_pass_by_reference (CUMULATIVE_ARGS *cum ATTRIBUTE_UNUSED, enum machine_mode mode ATTRIBUTE_UNUSED, tree type, bool named ATTRIBUTE_UNUSED) { - if (!TARGET_64BIT) - return 0; - - if (type && int_size_in_bytes (type) == -1) + if (TARGET_64BIT_MS_ABI) { - if (TARGET_DEBUG_ARG) - fprintf (stderr, "function_arg_pass_by_reference\n"); - return 1; + if (type) + { + /* Arrays are passed by reference. */ + if (TREE_CODE (type) == ARRAY_TYPE) + return true; + + if (AGGREGATE_TYPE_P (type)) + { + /* Structs/unions of sizes other than 8, 16, 32, or 64 bits + are passed by reference. */ + int el2 = exact_log2 (int_size_in_bytes (type)); + return !(el2 >= 0 && el2 <= 3); + } + } + + /* __m128 is passed by reference. */ + /* ??? How to handle complex? For now treat them as structs, + and pass them by reference if they're too large. */ + if (GET_MODE_SIZE (mode) > 8) + return true; } + else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1) + return 1; return 0; } @@ -3963,17 +4131,28 @@ ix86_function_arg_boundary (enum machine_mode mode, tree type) } /* Return true if N is a possible register number of function value. 
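   In practice this means %eax plus, depending on the target, %st(0),
   %xmm0 or %mm0.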
*/ + bool ix86_function_value_regno_p (int regno) { - if (regno == 0 - || (regno == FIRST_FLOAT_REG && TARGET_FLOAT_RETURNS_IN_80387) - || (regno == FIRST_SSE_REG && TARGET_SSE)) - return true; + switch (regno) + { + case 0: + return true; - if (!TARGET_64BIT - && (regno == FIRST_MMX_REG && TARGET_MMX)) - return true; + case FIRST_FLOAT_REG: + if (TARGET_64BIT_MS_ABI) + return false; + return TARGET_FLOAT_RETURNS_IN_80387; + + case FIRST_SSE_REG: + return TARGET_SSE; + + case FIRST_MMX_REG: + if (TARGET_MACHO || TARGET_64BIT) + return false; + return TARGET_MMX; + } return false; } @@ -3982,44 +4161,146 @@ ix86_function_value_regno_p (int regno) VALTYPE is the data type of the value (as a tree). If the precise function being called is known, FUNC is its FUNCTION_DECL; otherwise, FUNC is 0. */ -rtx -ix86_function_value (tree valtype, tree fntype_or_decl, - bool outgoing ATTRIBUTE_UNUSED) + +static rtx +function_value_32 (enum machine_mode orig_mode, enum machine_mode mode, + tree fntype, tree fn) { - enum machine_mode natmode = type_natural_mode (valtype); + unsigned int regno; - if (TARGET_64BIT) + /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where + we normally prevent this case when mmx is not available. However + some ABIs may require the result to be returned like DImode. */ + if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8) + regno = TARGET_MMX ? FIRST_MMX_REG : 0; + + /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where + we prevent this case when sse is not available. However some ABIs + may require the result to be returned like integer TImode. */ + else if (mode == TImode + || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16)) + regno = TARGET_SSE ? FIRST_SSE_REG : 0; + + /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */ + else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387) + regno = FIRST_FLOAT_REG; + else + /* Most things go in %eax. */ + regno = 0; + + /* Override FP return register with %xmm0 for local functions when + SSE math is enabled or for functions with sseregparm attribute. */ + if ((fn || fntype) && (mode == SFmode || mode == DFmode)) { - rtx ret = construct_container (natmode, TYPE_MODE (valtype), valtype, - 1, REGPARM_MAX, SSE_REGPARM_MAX, - x86_64_int_return_registers, 0); - /* For zero sized structures, construct_container return NULL, but we - need to keep rest of compiler happy by returning meaningful value. */ - if (!ret) - ret = gen_rtx_REG (TYPE_MODE (valtype), 0); - return ret; + int sse_level = ix86_function_sseregparm (fntype, fn); + if ((sse_level >= 1 && mode == SFmode) + || (sse_level == 2 && mode == DFmode)) + regno = FIRST_SSE_REG; } - else + + return gen_rtx_REG (orig_mode, regno); +} + +static rtx +function_value_64 (enum machine_mode orig_mode, enum machine_mode mode, + tree valtype) +{ + rtx ret; + + /* Handle libcalls, which don't provide a type node. */ + if (valtype == NULL) { - tree fn = NULL_TREE, fntype; - if (fntype_or_decl - && DECL_P (fntype_or_decl)) - fn = fntype_or_decl; - fntype = fn ? 
+static rtx
+function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
+		   tree valtype)
+{
+  rtx ret;
+
+  /* Handle libcalls, which don't provide a type node.  */
+  if (valtype == NULL)
     {
-      tree fn = NULL_TREE, fntype;
-      if (fntype_or_decl
-	  && DECL_P (fntype_or_decl))
-	fn = fntype_or_decl;
-      fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
-      return gen_rtx_REG (TYPE_MODE (valtype),
-			  ix86_value_regno (natmode, fn, fntype));
+      switch (mode)
+	{
+	case SFmode:
+	case SCmode:
+	case DFmode:
+	case DCmode:
+	case TFmode:
+	case SDmode:
+	case DDmode:
+	case TDmode:
+	  return gen_rtx_REG (mode, FIRST_SSE_REG);
+	case XFmode:
+	case XCmode:
+	  return gen_rtx_REG (mode, FIRST_FLOAT_REG);
+	case TCmode:
+	  return NULL;
+	default:
+	  return gen_rtx_REG (mode, 0);
+	}
     }
+
+  ret = construct_container (mode, orig_mode, valtype, 1,
+			     REGPARM_MAX, SSE_REGPARM_MAX,
+			     x86_64_int_return_registers, 0);
+
+  /* For zero sized structures, construct_container returns NULL, but we
+     need to keep the rest of the compiler happy by returning a meaningful
+     value.  */
+  if (!ret)
+    ret = gen_rtx_REG (orig_mode, 0);
+
+  return ret;
 }
 
-/* Return true iff type is returned in memory.  */
-int
-ix86_return_in_memory (tree type)
+static rtx
+function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
 {
-  int needed_intregs, needed_sseregs, size;
-  enum machine_mode mode = type_natural_mode (type);
+  unsigned int regno = 0;
 
-  if (TARGET_64BIT)
-    return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
+  if (TARGET_SSE)
+    {
+      if (mode == SFmode || mode == DFmode)
+	regno = FIRST_SSE_REG;
+      else if (VECTOR_MODE_P (mode) || GET_MODE_SIZE (mode) == 16)
+	regno = FIRST_SSE_REG;
+    }
+
+  return gen_rtx_REG (orig_mode, regno);
+}
+
+static rtx
+ix86_function_value_1 (tree valtype, tree fntype_or_decl,
+		       enum machine_mode orig_mode, enum machine_mode mode)
+{
+  tree fn, fntype;
+
+  fn = NULL_TREE;
+  if (fntype_or_decl && DECL_P (fntype_or_decl))
+    fn = fntype_or_decl;
+  fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
+
+  if (TARGET_64BIT_MS_ABI)
+    return function_value_ms_64 (orig_mode, mode);
+  else if (TARGET_64BIT)
+    return function_value_64 (orig_mode, mode, valtype);
+  else
+    return function_value_32 (orig_mode, mode, fntype, fn);
+}
+
+static rtx
+ix86_function_value (tree valtype, tree fntype_or_decl,
+		     bool outgoing ATTRIBUTE_UNUSED)
+{
+  enum machine_mode mode, orig_mode;
+
+  orig_mode = TYPE_MODE (valtype);
+  mode = type_natural_mode (valtype);
+  return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
+}
+
+rtx
+ix86_libcall_value (enum machine_mode mode)
+{
+  return ix86_function_value_1 (NULL, NULL, mode, mode);
+}
+
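For reference, the mode-to-register mapping that function_value_64 applies (and that ix86_libcall_value reuses for typeless libcalls) can be observed from ordinary C declarations. A sketch, not part of the patch; SysV x86-64:

  /* Sketch: which register each return mode class uses on x86-64.  */
  double      ret_df (void) { return 1.0;  }	/* DFmode -> %xmm0       */
  long double ret_xf (void) { return 1.0L; }	/* XFmode -> %st(0)      */
  long        ret_di (void) { return 1;    }	/* integer class -> %rax */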
+/* Return true iff type is returned in memory.  */
+
+static int
+return_in_memory_32 (tree type, enum machine_mode mode)
+{
+  HOST_WIDE_INT size;
 
   if (mode == BLKmode)
     return 1;
@@ -4056,6 +4337,39 @@ ix86_return_in_memory (tree type)
   return 0;
 }
 
+static int
+return_in_memory_64 (tree type, enum machine_mode mode)
+{
+  int needed_intregs, needed_sseregs;
+  return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
+}
+
+static int
+return_in_memory_ms_64 (tree type, enum machine_mode mode)
+{
+  HOST_WIDE_INT size = int_size_in_bytes (type);
+
+  /* __m128 and friends are returned in xmm0.  */
+  if (size == 16 && VECTOR_MODE_P (mode))
+    return 0;
+
+  /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes.  */
+  return (size != 1 && size != 2 && size != 4 && size != 8);
+}
+
+int
+ix86_return_in_memory (tree type)
+{
+  enum machine_mode mode = type_natural_mode (type);
+
+  if (TARGET_64BIT_MS_ABI)
+    return return_in_memory_ms_64 (type, mode);
+  else if (TARGET_64BIT)
+    return return_in_memory_64 (type, mode);
+  else
+    return return_in_memory_32 (type, mode);
+}
+
 /* When returning SSE vector types, we have a choice of either
      (1) being abi incompatible with a -march switch, or
      (2) generating an error.
@@ -4072,7 +4386,7 @@ ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
 {
   static bool warnedsse, warnedmmx;
 
-  if (type)
+  if (!TARGET_64BIT && type)
     {
       /* Look at the return type of the function, not the function type.  */
       enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
@@ -4102,91 +4416,20 @@ ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
 
   return NULL;
 }
 
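The __va_list_tag record that ix86_build_builtin_va_list constructs further below corresponds to this C layout on the SysV x86-64 ABI. A sketch for orientation, not part of the patch; gp_offset counts bytes into the register save area (6 GP registers * 8 = 48), and fp_offset starts at 48 and advances by 16 per SSE register:

  /* Sketch: C view of the x86-64 __va_list_tag record built below.  */
  typedef struct
  {
    unsigned int gp_offset;	/* next general-register slot, 0..48    */
    unsigned int fp_offset;	/* next SSE-register slot, 48 and up    */
    void *overflow_arg_area;	/* arguments passed on the stack        */
    void *reg_save_area;	/* save area laid out by the prologue   */
  } va_list_tag_sketch;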
-/* Define how to find the value returned by a library function
-   assuming the value has mode MODE.  */
-rtx
-ix86_libcall_value (enum machine_mode mode)
-{
-  if (TARGET_64BIT)
-    {
-      switch (mode)
-	{
-	case SFmode:
-	case SCmode:
-	case DFmode:
-	case DCmode:
-	case TFmode:
-	case SDmode:
-	case DDmode:
-	case TDmode:
-	  return gen_rtx_REG (mode, FIRST_SSE_REG);
-	case XFmode:
-	case XCmode:
-	  return gen_rtx_REG (mode, FIRST_FLOAT_REG);
-	case TCmode:
-	  return NULL;
-	default:
-	  return gen_rtx_REG (mode, 0);
-	}
-    }
-  else
-    return gen_rtx_REG (mode, ix86_value_regno (mode, NULL, NULL));
-}
-
-/* Given a mode, return the register to use for a return value.  */
+
+/* Create the va_list data type.  */
 
-static int
-ix86_value_regno (enum machine_mode mode, tree func, tree fntype)
+static tree
+ix86_build_builtin_va_list (void)
 {
-  gcc_assert (!TARGET_64BIT);
+  tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
 
-  /* 8-byte vector modes in %mm0.  See ix86_return_in_memory for where
-     we normally prevent this case when mmx is not available.  However
-     some ABIs may require the result to be returned like DImode.  */
-  if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
-    return TARGET_MMX ? FIRST_MMX_REG : 0;
+  /* For i386 we use plain pointer to argument area.  */
+  if (!TARGET_64BIT || TARGET_64BIT_MS_ABI)
+    return build_pointer_type (char_type_node);
 
-  /* 16-byte vector modes in %xmm0.  See ix86_return_in_memory for where
-     we prevent this case when sse is not available.  However some ABIs
-     may require the result to be returned like integer TImode.  */
-  if (mode == TImode || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
-    return TARGET_SSE ? FIRST_SSE_REG : 0;
-
-  /* Decimal floating point values can go in %eax, unlike other float modes.  */
-  if (DECIMAL_FLOAT_MODE_P (mode))
-    return 0;
-
-  /* Most things go in %eax, except (unless -mno-fp-ret-in-387) fp values.  */
-  if (!SCALAR_FLOAT_MODE_P (mode) || !TARGET_FLOAT_RETURNS_IN_80387)
-    return 0;
-
-  /* Floating point return values in %st(0), except for local functions when
-     SSE math is enabled or for functions with sseregparm attribute.  */
-  if ((func || fntype)
-      && (mode == SFmode || mode == DFmode))
-    {
-      int sse_level = ix86_function_sseregparm (fntype, func);
-      if ((sse_level >= 1 && mode == SFmode)
-	  || (sse_level == 2 && mode == DFmode))
-	return FIRST_SSE_REG;
-    }
-
-  return FIRST_FLOAT_REG;
-}
-
-/* Create the va_list data type.  */
-
-static tree
-ix86_build_builtin_va_list (void)
-{
-  tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
-
-  /* For i386 we use plain pointer to argument area.  */
-  if (!TARGET_64BIT)
-    return build_pointer_type (char_type_node);
-
-  record = (*lang_hooks.types.make_type) (RECORD_TYPE);
-  type_decl = build_decl (TYPE_DECL, get_identifier ("__va_list_tag"), record);
+  record = (*lang_hooks.types.make_type) (RECORD_TYPE);
+  type_decl = build_decl (TYPE_DECL, get_identifier ("__va_list_tag"), record);
 
   f_gpr = build_decl (FIELD_DECL, get_identifier ("gp_offset"),
		       unsigned_type_node);
@@ -4221,51 +4464,29 @@ ix86_build_builtin_va_list (void)
 
 /* Worker function for TARGET_SETUP_INCOMING_VARARGS.  */
 
 static void
-ix86_setup_incoming_varargs (CUMULATIVE_ARGS *cum, enum machine_mode mode,
-			     tree type, int *pretend_size ATTRIBUTE_UNUSED,
-			     int no_rtl)
+setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
 {
-  CUMULATIVE_ARGS next_cum;
-  rtx save_area = NULL_RTX, mem;
+  rtx save_area, mem;
   rtx label;
   rtx label_ref;
   rtx tmp_reg;
   rtx nsse_reg;
   int set;
-  tree fntype;
-  int stdarg_p;
   int i;
 
-  if (!TARGET_64BIT)
-    return;
-
   if (! cfun->va_list_gpr_size && ! cfun->va_list_fpr_size)
     return;
 
   /* Indicate to allocate space on the stack for varargs save area.  */
   ix86_save_varrargs_registers = 1;
   cfun->stack_alignment_needed = 128;
 
-  fntype = TREE_TYPE (current_function_decl);
-  stdarg_p = (TYPE_ARG_TYPES (fntype) != 0
-	      && (TREE_VALUE (tree_last (TYPE_ARG_TYPES (fntype)))
		  != void_type_node));
-
-  /* For varargs, we do not want to skip the dummy va_dcl argument.
-     For stdargs, we do want to skip the last named argument.  */
-  next_cum = *cum;
-  if (stdarg_p)
-    function_arg_advance (&next_cum, mode, type, 1);
-
-  if (!no_rtl)
-    save_area = frame_pointer_rtx;
-
+  save_area = frame_pointer_rtx;
   set = get_varargs_alias_set ();
 
-  for (i = next_cum.regno;
+  for (i = cum->regno;
       i < ix86_regparm
-	 && i < next_cum.regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
+	 && i < cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
       i++)
    {
      mem = gen_rtx_MEM (Pmode,
@@ -4276,7 +4497,7 @@ ix86_setup_incoming_varargs (CUMULATIVE_ARGS *cum, enum machine_mode mode,
			 x86_64_int_parameter_registers[i]));
    }
 
-  if (next_cum.sse_nregs && cfun->va_list_fpr_size)
+  if (cum->sse_nregs && cfun->va_list_fpr_size)
    {
      /* Now emit code to save SSE registers.  The AX parameter contains number
	 of SSE parameter registers used to call this function.  We use
	 sse_prologue_save insn template that produces computed jump across
	 SSE saves.  We need some preparation work to get this working.  */
      emit_insn (gen_rtx_SET (VOIDmode, tmp_reg,
			      gen_rtx_MULT (Pmode, nsse_reg,
					    GEN_INT (4))));
-      if (next_cum.sse_regno)
+      if (cum->sse_regno)
	emit_move_insn (nsse_reg,
			gen_rtx_CONST (DImode,
				       gen_rtx_PLUS (DImode,
						     label_ref,
-						     GEN_INT (next_cum.sse_regno * 4))));
+						     GEN_INT (cum->sse_regno * 4))));
      else
	emit_move_insn (nsse_reg, label_ref);
      emit_insn (gen_subdi3 (nsse_reg, nsse_reg, tmp_reg));
@@ -4319,9 +4540,62 @@ ix86_setup_incoming_varargs (CUMULATIVE_ARGS *cum, enum machine_mode mode,
 
      /* And finally do the dirty job! 
*/ emit_insn (gen_sse_prologue_save (mem, nsse_reg, - GEN_INT (next_cum.sse_regno), label)); + GEN_INT (cum->sse_regno), label)); + } +} + +static void +setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum) +{ + int set = get_varargs_alias_set (); + int i; + + for (i = cum->regno; i < REGPARM_MAX; i++) + { + rtx reg, mem; + + mem = gen_rtx_MEM (Pmode, + plus_constant (virtual_incoming_args_rtx, + i * UNITS_PER_WORD)); + MEM_NOTRAP_P (mem) = 1; + set_mem_alias_set (mem, set); + + reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]); + emit_move_insn (mem, reg); } +} + +static void +ix86_setup_incoming_varargs (CUMULATIVE_ARGS *cum, enum machine_mode mode, + tree type, int *pretend_size ATTRIBUTE_UNUSED, + int no_rtl) +{ + CUMULATIVE_ARGS next_cum; + tree fntype; + int stdarg_p; + + /* This argument doesn't appear to be used anymore. Which is good, + because the old code here didn't suppress rtl generation. */ + gcc_assert (!no_rtl); + + if (!TARGET_64BIT) + return; + + fntype = TREE_TYPE (current_function_decl); + stdarg_p = (TYPE_ARG_TYPES (fntype) != 0 + && (TREE_VALUE (tree_last (TYPE_ARG_TYPES (fntype))) + != void_type_node)); + + /* For varargs, we do not want to skip the dummy va_dcl argument. + For stdargs, we do want to skip the last named argument. */ + next_cum = *cum; + if (stdarg_p) + function_arg_advance (&next_cum, mode, type, 1); + if (TARGET_64BIT_MS_ABI) + setup_incoming_varargs_ms_64 (&next_cum); + else + setup_incoming_varargs_64 (&next_cum); } /* Implement va_start. */ @@ -4335,7 +4609,7 @@ ix86_va_start (tree valist, rtx nextarg) tree type; /* Only 64bit target needs something special. */ - if (!TARGET_64BIT) + if (!TARGET_64BIT || TARGET_64BIT_MS_ABI) { std_expand_builtin_va_start (valist, nextarg); return; @@ -4357,14 +4631,10 @@ ix86_va_start (tree valist, rtx nextarg) n_gpr = current_function_args_info.regno; n_fpr = current_function_args_info.sse_regno; - if (TARGET_DEBUG_ARG) - fprintf (stderr, "va_start: words = %d, n_gpr = %d, n_fpr = %d\n", - (int) words, (int) n_gpr, (int) n_fpr); - if (cfun->va_list_gpr_size) { type = TREE_TYPE (gpr); - t = build2 (MODIFY_EXPR, type, gpr, + t = build2 (GIMPLE_MODIFY_STMT, type, gpr, build_int_cst (type, n_gpr * 8)); TREE_SIDE_EFFECTS (t) = 1; expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); @@ -4373,7 +4643,7 @@ ix86_va_start (tree valist, rtx nextarg) if (cfun->va_list_fpr_size) { type = TREE_TYPE (fpr); - t = build2 (MODIFY_EXPR, type, fpr, + t = build2 (GIMPLE_MODIFY_STMT, type, fpr, build_int_cst (type, n_fpr * 16 + 8*REGPARM_MAX)); TREE_SIDE_EFFECTS (t) = 1; expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); @@ -4385,7 +4655,7 @@ ix86_va_start (tree valist, rtx nextarg) if (words != 0) t = build2 (PLUS_EXPR, type, t, build_int_cst (type, words * UNITS_PER_WORD)); - t = build2 (MODIFY_EXPR, type, ovf, t); + t = build2 (GIMPLE_MODIFY_STMT, type, ovf, t); TREE_SIDE_EFFECTS (t) = 1; expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); @@ -4395,7 +4665,7 @@ ix86_va_start (tree valist, rtx nextarg) Prologue of the function save it right above stack frame. */ type = TREE_TYPE (sav); t = make_tree (type, frame_pointer_rtx); - t = build2 (MODIFY_EXPR, type, sav, t); + t = build2 (GIMPLE_MODIFY_STMT, type, sav, t); TREE_SIDE_EFFECTS (t) = 1; expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); } @@ -4403,7 +4673,7 @@ ix86_va_start (tree valist, rtx nextarg) /* Implement va_arg. 
*/ -tree +static tree ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p) { static const int intreg[6] = { 0, 1, 2, 3, 4, 5 }; @@ -4418,7 +4688,7 @@ ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p) enum machine_mode nat_mode; /* Only 64bit target needs something special. */ - if (!TARGET_64BIT) + if (!TARGET_64BIT || TARGET_64BIT_MS_ABI) return std_gimplify_va_arg_expr (valist, type, pre_p, post_p); f_gpr = TYPE_FIELDS (TREE_TYPE (va_list_type_node)); @@ -4532,7 +4802,7 @@ ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p) /* int_addr = gpr + sav; */ t = fold_convert (ptr_type_node, gpr); t = build2 (PLUS_EXPR, ptr_type_node, sav, t); - t = build2 (MODIFY_EXPR, void_type_node, int_addr, t); + t = build2 (GIMPLE_MODIFY_STMT, void_type_node, int_addr, t); gimplify_and_add (t, pre_p); } if (needed_sseregs) @@ -4540,7 +4810,7 @@ ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p) /* sse_addr = fpr + sav; */ t = fold_convert (ptr_type_node, fpr); t = build2 (PLUS_EXPR, ptr_type_node, sav, t); - t = build2 (MODIFY_EXPR, void_type_node, sse_addr, t); + t = build2 (GIMPLE_MODIFY_STMT, void_type_node, sse_addr, t); gimplify_and_add (t, pre_p); } if (need_temp) @@ -4550,7 +4820,7 @@ ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p) /* addr = &temp; */ t = build1 (ADDR_EXPR, build_pointer_type (type), temp); - t = build2 (MODIFY_EXPR, void_type_node, addr, t); + t = build2 (GIMPLE_MODIFY_STMT, void_type_node, addr, t); gimplify_and_add (t, pre_p); for (i = 0; i < XVECLEN (container, 0); i++) @@ -4575,16 +4845,16 @@ ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p) src_offset = REGNO (reg) * 8; } src_addr = fold_convert (addr_type, src_addr); - src_addr = fold (build2 (PLUS_EXPR, addr_type, src_addr, - size_int (src_offset))); + src_addr = fold_build2 (PLUS_EXPR, addr_type, src_addr, + size_int (src_offset)); src = build_va_arg_indirect_ref (src_addr); dest_addr = fold_convert (addr_type, addr); - dest_addr = fold (build2 (PLUS_EXPR, addr_type, dest_addr, - size_int (INTVAL (XEXP (slot, 1))))); + dest_addr = fold_build2 (PLUS_EXPR, addr_type, dest_addr, + size_int (INTVAL (XEXP (slot, 1)))); dest = build_va_arg_indirect_ref (dest_addr); - t = build2 (MODIFY_EXPR, void_type_node, dest, src); + t = build2 (GIMPLE_MODIFY_STMT, void_type_node, dest, src); gimplify_and_add (t, pre_p); } } @@ -4593,14 +4863,14 @@ ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p) { t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr, build_int_cst (TREE_TYPE (gpr), needed_intregs * 8)); - t = build2 (MODIFY_EXPR, TREE_TYPE (gpr), gpr, t); + t = build2 (GIMPLE_MODIFY_STMT, TREE_TYPE (gpr), gpr, t); gimplify_and_add (t, pre_p); } if (needed_sseregs) { t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr, build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16)); - t = build2 (MODIFY_EXPR, TREE_TYPE (fpr), fpr, t); + t = build2 (GIMPLE_MODIFY_STMT, TREE_TYPE (fpr), fpr, t); gimplify_and_add (t, pre_p); } @@ -4627,12 +4897,12 @@ ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p) } gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue); - t2 = build2 (MODIFY_EXPR, void_type_node, addr, t); + t2 = build2 (GIMPLE_MODIFY_STMT, void_type_node, addr, t); gimplify_and_add (t2, pre_p); t = build2 (PLUS_EXPR, TREE_TYPE (t), t, build_int_cst (TREE_TYPE (t), rsize * UNITS_PER_WORD)); - t = build2 (MODIFY_EXPR, TREE_TYPE (ovf), ovf, t); + t = build2 (GIMPLE_MODIFY_STMT, TREE_TYPE 
(ovf), ovf, t); gimplify_and_add (t, pre_p); if (container) @@ -4664,7 +4934,7 @@ ix86_check_movabs (rtx insn, int opnum) mem = XEXP (set, opnum); while (GET_CODE (mem) == SUBREG) mem = SUBREG_REG (mem); - gcc_assert (GET_CODE (mem) == MEM); + gcc_assert (MEM_P (mem)); return (volatile_ok || !MEM_VOLATILE_P (mem)); } @@ -4700,22 +4970,24 @@ init_ext_80387_constants (void) int standard_80387_constant_p (rtx x) { + enum machine_mode mode = GET_MODE (x); + REAL_VALUE_TYPE r; - if (GET_CODE (x) != CONST_DOUBLE || !FLOAT_MODE_P (GET_MODE (x))) + if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE))) return -1; - if (x == CONST0_RTX (GET_MODE (x))) + if (x == CONST0_RTX (mode)) return 1; - if (x == CONST1_RTX (GET_MODE (x))) + if (x == CONST1_RTX (mode)) return 2; REAL_VALUE_FROM_CONST_DOUBLE (r, x); /* For XFmode constants, try to find a special 80387 instruction when optimizing for size or on those CPUs that benefit from them. */ - if (GET_MODE (x) == XFmode - && (optimize_size || x86_ext_80387_constants & TUNEMASK)) + if (mode == XFmode + && (optimize_size || TARGET_EXT_80387_CONSTANTS)) { int i; @@ -5043,6 +5315,23 @@ output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED) rtx xops[3]; xops[0] = dest; + + if (TARGET_VXWORKS_RTP && flag_pic) + { + /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */ + xops[2] = gen_rtx_MEM (Pmode, + gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE)); + output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops); + + /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register. + Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as + an unadorned address. */ + xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX); + SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL; + output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops); + return ""; + } + xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME); if (! 
TARGET_DEEP_BRANCH_PREDICTION || !flag_pic) @@ -5355,18 +5644,22 @@ ix86_compute_frame_layout (struct ix86_frame *frame) frame->to_allocate -= frame->red_zone_size; frame->stack_pointer_offset -= frame->red_zone_size; #if 0 - fprintf (stderr, "nregs: %i\n", frame->nregs); - fprintf (stderr, "size: %i\n", size); - fprintf (stderr, "alignment1: %i\n", stack_alignment_needed); - fprintf (stderr, "padding1: %i\n", frame->padding1); - fprintf (stderr, "va_arg: %i\n", frame->va_arg_size); - fprintf (stderr, "padding2: %i\n", frame->padding2); - fprintf (stderr, "to_allocate: %i\n", frame->to_allocate); - fprintf (stderr, "red_zone_size: %i\n", frame->red_zone_size); - fprintf (stderr, "frame_pointer_offset: %i\n", frame->frame_pointer_offset); - fprintf (stderr, "hard_frame_pointer_offset: %i\n", - frame->hard_frame_pointer_offset); - fprintf (stderr, "stack_pointer_offset: %i\n", frame->stack_pointer_offset); + fprintf (stderr, "\n"); + fprintf (stderr, "nregs: %ld\n", (long)frame->nregs); + fprintf (stderr, "size: %ld\n", (long)size); + fprintf (stderr, "alignment1: %ld\n", (long)stack_alignment_needed); + fprintf (stderr, "padding1: %ld\n", (long)frame->padding1); + fprintf (stderr, "va_arg: %ld\n", (long)frame->va_arg_size); + fprintf (stderr, "padding2: %ld\n", (long)frame->padding2); + fprintf (stderr, "to_allocate: %ld\n", (long)frame->to_allocate); + fprintf (stderr, "red_zone_size: %ld\n", (long)frame->red_zone_size); + fprintf (stderr, "frame_pointer_offset: %ld\n", (long)frame->frame_pointer_offset); + fprintf (stderr, "hard_frame_pointer_offset: %ld\n", + (long)frame->hard_frame_pointer_offset); + fprintf (stderr, "stack_pointer_offset: %ld\n", (long)frame->stack_pointer_offset); + fprintf (stderr, "current_function_is_leaf: %ld\n", (long)current_function_is_leaf); + fprintf (stderr, "current_function_calls_alloca: %ld\n", (long)current_function_calls_alloca); + fprintf (stderr, "x86_current_function_calls_tls_descriptor: %ld\n", (long)ix86_current_function_calls_tls_descriptor); #endif } @@ -5428,7 +5721,7 @@ pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset, int style) shouldn't be used together with huge frame sizes in one function because of the frame_size check in sibcall.c. */ gcc_assert (style); - r11 = gen_rtx_REG (DImode, FIRST_REX_INT_REG + 3 /* R11 */); + r11 = gen_rtx_REG (DImode, R11_REG); insn = emit_insn (gen_rtx_SET (DImode, r11, offset)); if (style < 0) RTX_FRAME_RELATED_P (insn) = 1; @@ -5589,21 +5882,30 @@ ix86_expand_prologue (void) else { /* Only valid for Win32. 
*/ - rtx eax = gen_rtx_REG (SImode, 0); - bool eax_live = ix86_eax_live_at_start_p (); + rtx eax = gen_rtx_REG (Pmode, 0); + bool eax_live; rtx t; - gcc_assert (!TARGET_64BIT); + gcc_assert (!TARGET_64BIT || TARGET_64BIT_MS_ABI); + + if (TARGET_64BIT_MS_ABI) + eax_live = false; + else + eax_live = ix86_eax_live_at_start_p (); if (eax_live) { emit_insn (gen_push (eax)); - allocate -= 4; + allocate -= UNITS_PER_WORD; } emit_move_insn (eax, GEN_INT (allocate)); - insn = emit_insn (gen_allocate_stack_worker (eax)); + if (TARGET_64BIT) + insn = gen_allocate_stack_worker_64 (eax); + else + insn = gen_allocate_stack_worker_32 (eax); + insn = emit_insn (insn); RTX_FRAME_RELATED_P (insn) = 1; t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (-allocate)); t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t); @@ -5619,7 +5921,7 @@ ix86_expand_prologue (void) - frame.nregs * UNITS_PER_WORD); else t = plus_constant (stack_pointer_rtx, allocate); - emit_move_insn (eax, gen_rtx_MEM (SImode, t)); + emit_move_insn (eax, gen_rtx_MEM (Pmode, t)); } } @@ -5648,7 +5950,25 @@ ix86_expand_prologue (void) if (pic_reg_used) { if (TARGET_64BIT) - insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx)); + { + if (ix86_cmodel == CM_LARGE_PIC) + { + rtx tmp_reg = gen_rtx_REG (DImode, + FIRST_REX_INT_REG + 3 /* R11 */); + rtx label = gen_label_rtx (); + emit_label (label); + LABEL_PRESERVE_P (label) = 1; + gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg)); + insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, label)); + REG_NOTES (insn) = gen_rtx_EXPR_LIST (REG_MAYBE_DEAD, const0_rtx, NULL); + insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label)); + REG_NOTES (insn) = gen_rtx_EXPR_LIST (REG_MAYBE_DEAD, const0_rtx, NULL); + insn = emit_insn (gen_adddi3 (pic_offset_table_rtx, + pic_offset_table_rtx, tmp_reg)); + } + else + insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx)); + } else insn = emit_insn (gen_set_got (pic_offset_table_rtx)); @@ -5685,7 +6005,7 @@ ix86_emit_restore_regs_using_mov (rtx pointer, HOST_WIDE_INT offset, { rtx r11; - r11 = gen_rtx_REG (DImode, FIRST_REX_INT_REG + 3 /* R11 */); + r11 = gen_rtx_REG (DImode, R11_REG); emit_move_insn (r11, GEN_INT (offset)); emit_insn (gen_adddi3 (r11, r11, pointer)); base_address = gen_rtx_MEM (Pmode, r11); @@ -5855,7 +6175,7 @@ ix86_expand_epilogue (int style) { rtx ecx = gen_rtx_REG (SImode, 2); - /* There is no "pascal" calling convention in 64bit ABI. */ + /* There is no "pascal" calling convention in any 64bit ABI. */ gcc_assert (!TARGET_64BIT); emit_insn (gen_popsi1 (ecx)); @@ -5911,7 +6231,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) int retval = 1; enum ix86_address_seg seg = SEG_DEFAULT; - if (GET_CODE (addr) == REG || GET_CODE (addr) == SUBREG) + if (REG_P (addr) || GET_CODE (addr) == SUBREG) base = addr; else if (GET_CODE (addr) == PLUS) { @@ -5988,7 +6308,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) /* We're called for lea too, which implements ashift on occasion. */ index = XEXP (addr, 0); tmp = XEXP (addr, 1); - if (GET_CODE (tmp) != CONST_INT) + if (!CONST_INT_P (tmp)) return 0; scale = INTVAL (tmp); if ((unsigned HOST_WIDE_INT) scale > 3) @@ -6002,7 +6322,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) /* Extract the integral value of scale. 
*/ if (scale_rtx) { - if (GET_CODE (scale_rtx) != CONST_INT) + if (!CONST_INT_P (scale_rtx)) return 0; scale = INTVAL (scale_rtx); } @@ -6116,46 +6436,6 @@ ix86_address_cost (rtx x) return cost; } -/* If X is a machine specific address (i.e. a symbol or label being - referenced as a displacement from the GOT implemented using an - UNSPEC), then return the base term. Otherwise return X. */ - -rtx -ix86_find_base_term (rtx x) -{ - rtx term; - - if (TARGET_64BIT) - { - if (GET_CODE (x) != CONST) - return x; - term = XEXP (x, 0); - if (GET_CODE (term) == PLUS - && (GET_CODE (XEXP (term, 1)) == CONST_INT - || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE)) - term = XEXP (term, 0); - if (GET_CODE (term) != UNSPEC - || XINT (term, 1) != UNSPEC_GOTPCREL) - return x; - - term = XVECEXP (term, 0, 0); - - if (GET_CODE (term) != SYMBOL_REF - && GET_CODE (term) != LABEL_REF) - return x; - - return term; - } - - term = ix86_delegitimize_address (x); - - if (GET_CODE (term) != SYMBOL_REF - && GET_CODE (term) != LABEL_REF) - return x; - - return term; -} - /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as this is used for to form addresses to local data when -fPIC is in use. */ @@ -6177,7 +6457,7 @@ darwin_local_data_pic (rtx disp) return false; } - + /* Determine if a given RTX is a valid constant. We already know this satisfies CONSTANT_P. */ @@ -6191,7 +6471,7 @@ legitimate_constant_p (rtx x) if (GET_CODE (x) == PLUS) { - if (GET_CODE (XEXP (x, 1)) != CONST_INT) + if (!CONST_INT_P (XEXP (x, 1))) return false; x = XEXP (x, 0); } @@ -6203,7 +6483,9 @@ legitimate_constant_p (rtx x) if (GET_CODE (x) == UNSPEC) switch (XINT (x, 1)) { + case UNSPEC_GOT: case UNSPEC_GOTOFF: + case UNSPEC_PLTOFF: return TARGET_64BIT; case UNSPEC_TPOFF: case UNSPEC_NTPOFF: @@ -6229,6 +6511,11 @@ legitimate_constant_p (rtx x) /* TLS symbols are never valid. */ if (SYMBOL_REF_TLS_MODEL (x)) return false; + + /* DLLIMPORT symbols are never valid. */ + if (TARGET_DLLIMPORT_DECL_ATTRIBUTES + && SYMBOL_REF_DLLIMPORT_P (x)) + return false; break; case CONST_DOUBLE: @@ -6294,14 +6581,16 @@ legitimate_pic_operand_p (rtx x) case CONST: inner = XEXP (x, 0); if (GET_CODE (inner) == PLUS - && GET_CODE (XEXP (inner, 1)) == CONST_INT) + && CONST_INT_P (XEXP (inner, 1))) inner = XEXP (inner, 0); /* Only some unspecs are valid as "constants". */ if (GET_CODE (inner) == UNSPEC) switch (XINT (inner, 1)) { + case UNSPEC_GOT: case UNSPEC_GOTOFF: + case UNSPEC_PLTOFF: return TARGET_64BIT; case UNSPEC_TPOFF: x = XVECEXP (inner, 0, 0); @@ -6345,7 +6634,7 @@ legitimate_pic_address_disp_p (rtx disp) break; op0 = XEXP (XEXP (disp, 0), 0); op1 = XEXP (XEXP (disp, 0), 1); - if (GET_CODE (op1) != CONST_INT + if (!CONST_INT_P (op1) || INTVAL (op1) >= 16*1024*1024 || INTVAL (op1) < -16*1024*1024) break; @@ -6359,7 +6648,8 @@ legitimate_pic_address_disp_p (rtx disp) /* TLS references should always be enclosed in UNSPEC. */ if (SYMBOL_REF_TLS_MODEL (op0)) return false; - if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)) + if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0) + && ix86_cmodel != CM_LARGE_PIC) return true; break; @@ -6377,7 +6667,8 @@ legitimate_pic_address_disp_p (rtx disp) of GOT tables. We should not need these anyway. 
*/ if (GET_CODE (disp) != UNSPEC || (XINT (disp, 1) != UNSPEC_GOTPCREL - && XINT (disp, 1) != UNSPEC_GOTOFF)) + && XINT (disp, 1) != UNSPEC_GOTOFF + && XINT (disp, 1) != UNSPEC_PLTOFF)) return 0; if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF @@ -6389,7 +6680,7 @@ legitimate_pic_address_disp_p (rtx disp) saw_plus = false; if (GET_CODE (disp) == PLUS) { - if (GET_CODE (XEXP (disp, 1)) != CONST_INT) + if (!CONST_INT_P (XEXP (disp, 1))) return 0; disp = XEXP (disp, 0); saw_plus = true; @@ -6406,7 +6697,11 @@ legitimate_pic_address_disp_p (rtx disp) case UNSPEC_GOT: if (saw_plus) return false; - return GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF; + /* We need to check for both symbols and labels because VxWorks loads + text labels with @GOT rather than @GOTOFF. See gotoff_operand for + details. */ + return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF + || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF); case UNSPEC_GOTOFF: /* Refuse GOTOFF in 64bit mode since it is always 64bit when used. While ABI specify also 32bit relocation but we don't produce it in @@ -6414,7 +6709,7 @@ legitimate_pic_address_disp_p (rtx disp) if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF) && !TARGET_64BIT) - return local_symbolic_operand (XVECEXP (disp, 0, 0), Pmode); + return gotoff_operand (XVECEXP (disp, 0, 0), Pmode); return false; case UNSPEC_GOTTPOFF: case UNSPEC_GOTNTPOFF: @@ -6446,7 +6741,8 @@ legitimate_pic_address_disp_p (rtx disp) be recognized. */ int -legitimate_address_p (enum machine_mode mode, rtx addr, int strict) +legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED, + rtx addr, int strict) { struct ix86_address parts; rtx base, index, disp; @@ -6454,14 +6750,6 @@ legitimate_address_p (enum machine_mode mode, rtx addr, int strict) const char *reason = NULL; rtx reason_rtx = NULL_RTX; - if (TARGET_DEBUG_ADDR) - { - fprintf (stderr, - "\n======\nGO_IF_LEGITIMATE_ADDRESS, mode = %s, strict = %d\n", - GET_MODE_NAME (mode), strict); - debug_rtx (addr); - } - if (ix86_decompose_address (addr, &parts) <= 0) { reason = "decomposition failed"; @@ -6617,7 +6905,7 @@ legitimate_address_p (enum machine_mode mode, rtx addr, int strict) if (GET_CODE (disp) != CONST || GET_CODE (XEXP (disp, 0)) != PLUS || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC - || GET_CODE (XEXP (XEXP (disp, 0), 1)) != CONST_INT + || !CONST_INT_P (XEXP (XEXP (disp, 0), 1)) || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF)) { @@ -6654,7 +6942,7 @@ legitimate_address_p (enum machine_mode mode, rtx addr, int strict) correct fix for crash to disable this test. */ } else if (GET_CODE (disp) != LABEL_REF - && GET_CODE (disp) != CONST_INT + && !CONST_INT_P (disp) && (GET_CODE (disp) != CONST || !legitimate_constant_p (disp)) && (GET_CODE (disp) != SYMBOL_REF @@ -6672,16 +6960,9 @@ legitimate_address_p (enum machine_mode mode, rtx addr, int strict) } /* Everything looks valid. 
*/ - if (TARGET_DEBUG_ADDR) - fprintf (stderr, "Success.\n"); return TRUE; report_error: - if (TARGET_DEBUG_ADDR) - { - fprintf (stderr, "Error: %s\n", reason); - debug_rtx (reason_rtx); - } return FALSE; } @@ -6735,7 +7016,7 @@ legitimize_pic_address (rtx orig, rtx reg) new = addr; else if (TARGET_64BIT && ix86_cmodel != CM_SMALL_PIC - && local_symbolic_operand (addr, Pmode)) + && gotoff_operand (addr, Pmode)) { rtx tmpreg; /* This symbol may be referenced via a displacement from the PIC @@ -6747,7 +7028,8 @@ legitimize_pic_address (rtx orig, rtx reg) addr = XEXP (addr, 0); if (GET_CODE (addr) == PLUS) { - new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)), UNSPEC_GOTOFF); + new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)), + UNSPEC_GOTOFF); new = gen_rtx_PLUS (Pmode, new, XEXP (addr, 1)); } else @@ -6767,7 +7049,7 @@ legitimize_pic_address (rtx orig, rtx reg) } else new = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg); } - else if (!TARGET_64BIT && local_symbolic_operand (addr, Pmode)) + else if (!TARGET_64BIT && gotoff_operand (addr, Pmode)) { /* This symbol may be referenced via a displacement from the PIC base address (@GOTOFF). */ @@ -6778,7 +7060,8 @@ legitimize_pic_address (rtx orig, rtx reg) addr = XEXP (addr, 0); if (GET_CODE (addr) == PLUS) { - new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)), UNSPEC_GOTOFF); + new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)), + UNSPEC_GOTOFF); new = gen_rtx_PLUS (Pmode, new, XEXP (addr, 1)); } else @@ -6792,9 +7075,17 @@ legitimize_pic_address (rtx orig, rtx reg) new = reg; } } - else if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0) + else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0) + /* We can't use @GOTOFF for text labels on VxWorks; + see gotoff_operand. */ + || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF)) { - if (TARGET_64BIT) + /* Given that we've already handled dllimport variables separately + in legitimize_address, and all other variables should satisfy + legitimate_pic_address_disp_p, we should never arrive here. */ + gcc_assert (!TARGET_64BIT_MS_ABI); + + if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC) { new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL); new = gen_rtx_CONST (Pmode, new); @@ -6818,6 +7109,8 @@ legitimize_pic_address (rtx orig, rtx reg) regs_ever_live[PIC_OFFSET_TABLE_REGNUM] = 1; new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT); new = gen_rtx_CONST (Pmode, new); + if (TARGET_64BIT) + new = force_reg (Pmode, new); new = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new); new = gen_const_mem (Pmode, new); set_mem_alias_set (new, ix86_GOT_alias_set ()); @@ -6830,7 +7123,7 @@ legitimize_pic_address (rtx orig, rtx reg) } else { - if (GET_CODE (addr) == CONST_INT + if (CONST_INT_P (addr) && !x86_64_immediate_operand (addr, VOIDmode)) { if (reg) @@ -6860,8 +7153,8 @@ legitimize_pic_address (rtx orig, rtx reg) /* Check first to see if this is a constant offset from a @GOTOFF symbol reference. */ - if (local_symbolic_operand (op0, Pmode) - && GET_CODE (op1) == CONST_INT) + if (gotoff_operand (op0, Pmode) + && CONST_INT_P (op1)) { if (!TARGET_64BIT) { @@ -6896,7 +7189,7 @@ legitimize_pic_address (rtx orig, rtx reg) new = legitimize_pic_address (XEXP (addr, 1), base == reg ? 
NULL_RTX : reg); - if (GET_CODE (new) == CONST_INT) + if (CONST_INT_P (new)) new = plus_constant (base, INTVAL (new)); else { @@ -7087,6 +7380,90 @@ legitimize_tls_address (rtx x, enum tls_model model, int for_mov) return dest; } +/* Create or return the unique __imp_DECL dllimport symbol corresponding + to symbol DECL. */ + +static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map))) + htab_t dllimport_map; + +static tree +get_dllimport_decl (tree decl) +{ + struct tree_map *h, in; + void **loc; + const char *name; + const char *prefix; + size_t namelen, prefixlen; + char *imp_name; + tree to; + rtx rtl; + + if (!dllimport_map) + dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0); + + in.hash = htab_hash_pointer (decl); + in.base.from = decl; + loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT); + h = *loc; + if (h) + return h->to; + + *loc = h = ggc_alloc (sizeof (struct tree_map)); + h->hash = in.hash; + h->base.from = decl; + h->to = to = build_decl (VAR_DECL, NULL, ptr_type_node); + DECL_ARTIFICIAL (to) = 1; + DECL_IGNORED_P (to) = 1; + DECL_EXTERNAL (to) = 1; + TREE_READONLY (to) = 1; + + name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)); + name = targetm.strip_name_encoding (name); + if (name[0] == FASTCALL_PREFIX) + { + name++; + prefix = "*__imp_"; + } + else + prefix = "*__imp__"; + + namelen = strlen (name); + prefixlen = strlen (prefix); + imp_name = alloca (namelen + prefixlen + 1); + memcpy (imp_name, prefix, prefixlen); + memcpy (imp_name + prefixlen, name, namelen + 1); + + name = ggc_alloc_string (imp_name, namelen + prefixlen); + rtl = gen_rtx_SYMBOL_REF (Pmode, name); + SET_SYMBOL_REF_DECL (rtl, to); + SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL; + + rtl = gen_const_mem (Pmode, rtl); + set_mem_alias_set (rtl, ix86_GOT_alias_set ()); + + SET_DECL_RTL (to, rtl); + + return to; +} + +/* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is + true if we require the result be a register. */ + +static rtx +legitimize_dllimport_symbol (rtx symbol, bool want_reg) +{ + tree imp_decl; + rtx x; + + gcc_assert (SYMBOL_REF_DECL (symbol)); + imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol)); + + x = DECL_RTL (imp_decl); + if (want_reg) + x = force_reg (Pmode, x); + return x; +} + /* Try machine-dependent ways of modifying an illegitimate address to be legitimate. If we find one, return the new, valid address. This macro is used in only one place: `memory_address' in explow.c. @@ -7114,13 +7491,6 @@ legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode) int changed = 0; unsigned log; - if (TARGET_DEBUG_ADDR) - { - fprintf (stderr, "\n==========\nLEGITIMIZE_ADDRESS, mode = %s\n", - GET_MODE_NAME (mode)); - debug_rtx (x); - } - log = GET_CODE (x) == SYMBOL_REF ? 
SYMBOL_REF_TLS_MODEL (x) : 0; if (log) return legitimize_tls_address (x, log, false); @@ -7136,9 +7506,23 @@ legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode) if (flag_pic && SYMBOLIC_CONST (x)) return legitimize_pic_address (x, 0); + if (TARGET_DLLIMPORT_DECL_ATTRIBUTES) + { + if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x)) + return legitimize_dllimport_symbol (x, true); + if (GET_CODE (x) == CONST + && GET_CODE (XEXP (x, 0)) == PLUS + && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF + && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0))) + { + rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true); + return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1)); + } + } + /* Canonicalize shifts by 0, 1, 2, 3 into multiply */ if (GET_CODE (x) == ASHIFT - && GET_CODE (XEXP (x, 1)) == CONST_INT + && CONST_INT_P (XEXP (x, 1)) && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4) { changed = 1; @@ -7152,7 +7536,7 @@ legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode) /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */ if (GET_CODE (XEXP (x, 0)) == ASHIFT - && GET_CODE (XEXP (XEXP (x, 0), 1)) == CONST_INT + && CONST_INT_P (XEXP (XEXP (x, 0), 1)) && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4) { changed = 1; @@ -7163,7 +7547,7 @@ legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode) } if (GET_CODE (XEXP (x, 1)) == ASHIFT - && GET_CODE (XEXP (XEXP (x, 1), 1)) == CONST_INT + && CONST_INT_P (XEXP (XEXP (x, 1), 1)) && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4) { changed = 1; @@ -7206,12 +7590,12 @@ legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode) rtx constant; rtx other = NULL_RTX; - if (GET_CODE (XEXP (x, 1)) == CONST_INT) + if (CONST_INT_P (XEXP (x, 1))) { constant = XEXP (x, 1); other = XEXP (XEXP (XEXP (x, 0), 1), 1); } - else if (GET_CODE (XEXP (XEXP (XEXP (x, 0), 1), 1)) == CONST_INT) + else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1))) { constant = XEXP (XEXP (XEXP (x, 0), 1), 1); other = XEXP (x, 1); @@ -7245,8 +7629,8 @@ legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode) } if (changed - && GET_CODE (XEXP (x, 1)) == REG - && GET_CODE (XEXP (x, 0)) == REG) + && REG_P (XEXP (x, 1)) + && REG_P (XEXP (x, 0))) return x; if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1))) @@ -7258,7 +7642,7 @@ legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode) if (changed && legitimate_address_p (mode, x, FALSE)) return x; - if (GET_CODE (XEXP (x, 0)) == REG) + if (REG_P (XEXP (x, 0))) { rtx temp = gen_reg_rtx (Pmode); rtx val = force_operand (XEXP (x, 1), temp); @@ -7269,7 +7653,7 @@ legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode) return x; } - else if (GET_CODE (XEXP (x, 1)) == REG) + else if (REG_P (XEXP (x, 1))) { rtx temp = gen_reg_rtx (Pmode); rtx val = force_operand (XEXP (x, 0), temp); @@ -7302,8 +7686,26 @@ output_pic_addr_const (FILE *file, rtx x, int code) break; case SYMBOL_REF: - output_addr_const (file, x); - if (!TARGET_MACHO && code == 'P' && ! SYMBOL_REF_LOCAL_P (x)) + if (! TARGET_MACHO || TARGET_64BIT) + output_addr_const (file, x); + else + { + const char *name = XSTR (x, 0); + + /* Mark the decl as referenced so that cgraph will + output the function. 
*/ + if (SYMBOL_REF_DECL (x)) + mark_decl_referenced (SYMBOL_REF_DECL (x)); + +#if TARGET_MACHO + if (MACHOPIC_INDIRECT + && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION) + name = machopic_indirection_name (x, /*stub_p=*/true); +#endif + assemble_name (file, name); + } + if (!TARGET_MACHO && !TARGET_64BIT_MS_ABI + && code == 'P' && ! SYMBOL_REF_LOCAL_P (x)) fputs ("@PLT", file); break; @@ -7344,7 +7746,7 @@ output_pic_addr_const (FILE *file, rtx x, int code) case PLUS: /* Some assemblers need integer constants to appear first. */ - if (GET_CODE (XEXP (x, 0)) == CONST_INT) + if (CONST_INT_P (XEXP (x, 0))) { output_pic_addr_const (file, XEXP (x, 0), code); putc ('+', file); @@ -7352,7 +7754,7 @@ output_pic_addr_const (FILE *file, rtx x, int code) } else { - gcc_assert (GET_CODE (XEXP (x, 1)) == CONST_INT); + gcc_assert (CONST_INT_P (XEXP (x, 1))); output_pic_addr_const (file, XEXP (x, 1), code); putc ('+', file); output_pic_addr_const (file, XEXP (x, 0), code); @@ -7380,6 +7782,9 @@ output_pic_addr_const (FILE *file, rtx x, int code) case UNSPEC_GOTOFF: fputs ("@GOTOFF", file); break; + case UNSPEC_PLTOFF: + fputs ("@PLTOFF", file); + break; case UNSPEC_GOTPCREL: fputs ("@GOTPCREL(%rip)", file); break; @@ -7422,7 +7827,7 @@ output_pic_addr_const (FILE *file, rtx x, int code) /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL. We need to emit DTP-relative relocations. */ -static void +static void ATTRIBUTE_UNUSED i386_output_dwarf_dtprel (FILE *file, int size, rtx x) { fputs (ASM_LONG, file); @@ -7461,7 +7866,7 @@ ix86_delegitimize_address (rtx orig_x) /* This is the result, or NULL. */ rtx result = NULL_RTX; - if (GET_CODE (x) == MEM) + if (MEM_P (x)) x = XEXP (x, 0); if (TARGET_64BIT) @@ -7469,7 +7874,7 @@ ix86_delegitimize_address (rtx orig_x) if (GET_CODE (x) != CONST || GET_CODE (XEXP (x, 0)) != UNSPEC || XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL - || GET_CODE (orig_x) != MEM) + || !MEM_P (orig_x)) return orig_x; return XVECEXP (XEXP (x, 0), 0, 0); } @@ -7478,7 +7883,7 @@ ix86_delegitimize_address (rtx orig_x) || GET_CODE (XEXP (x, 1)) != CONST) return orig_x; - if (GET_CODE (XEXP (x, 0)) == REG + if (REG_P (XEXP (x, 0)) && REGNO (XEXP (x, 0)) == PIC_OFFSET_TABLE_REGNUM) /* %ebx + GOT/GOTOFF */ ; @@ -7486,15 +7891,15 @@ ix86_delegitimize_address (rtx orig_x) { /* %ebx + %reg * scale + GOT/GOTOFF */ reg_addend = XEXP (x, 0); - if (GET_CODE (XEXP (reg_addend, 0)) == REG + if (REG_P (XEXP (reg_addend, 0)) && REGNO (XEXP (reg_addend, 0)) == PIC_OFFSET_TABLE_REGNUM) reg_addend = XEXP (reg_addend, 1); - else if (GET_CODE (XEXP (reg_addend, 1)) == REG + else if (REG_P (XEXP (reg_addend, 1)) && REGNO (XEXP (reg_addend, 1)) == PIC_OFFSET_TABLE_REGNUM) reg_addend = XEXP (reg_addend, 0); else return orig_x; - if (GET_CODE (reg_addend) != REG + if (!REG_P (reg_addend) && GET_CODE (reg_addend) != MULT && GET_CODE (reg_addend) != ASHIFT) return orig_x; @@ -7504,19 +7909,19 @@ ix86_delegitimize_address (rtx orig_x) x = XEXP (XEXP (x, 1), 0); if (GET_CODE (x) == PLUS - && GET_CODE (XEXP (x, 1)) == CONST_INT) + && CONST_INT_P (XEXP (x, 1))) { const_addend = XEXP (x, 1); x = XEXP (x, 0); } if (GET_CODE (x) == UNSPEC - && ((XINT (x, 1) == UNSPEC_GOT && GET_CODE (orig_x) == MEM) - || (XINT (x, 1) == UNSPEC_GOTOFF && GET_CODE (orig_x) != MEM))) + && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x)) + || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x)))) result = XVECEXP (x, 0, 0); if (TARGET_MACHO && darwin_local_data_pic (x) - && GET_CODE (orig_x) != MEM) + && !MEM_P (orig_x)) 
result = XEXP (x, 0); if (! result) @@ -7528,6 +7933,46 @@ ix86_delegitimize_address (rtx orig_x) result = gen_rtx_PLUS (Pmode, reg_addend, result); return result; } + +/* If X is a machine specific address (i.e. a symbol or label being + referenced as a displacement from the GOT implemented using an + UNSPEC), then return the base term. Otherwise return X. */ + +rtx +ix86_find_base_term (rtx x) +{ + rtx term; + + if (TARGET_64BIT) + { + if (GET_CODE (x) != CONST) + return x; + term = XEXP (x, 0); + if (GET_CODE (term) == PLUS + && (CONST_INT_P (XEXP (term, 1)) + || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE)) + term = XEXP (term, 0); + if (GET_CODE (term) != UNSPEC + || XINT (term, 1) != UNSPEC_GOTPCREL) + return x; + + term = XVECEXP (term, 0, 0); + + if (GET_CODE (term) != SYMBOL_REF + && GET_CODE (term) != LABEL_REF) + return x; + + return term; + } + + term = ix86_delegitimize_address (x); + + if (GET_CODE (term) != SYMBOL_REF + && GET_CODE (term) != LABEL_REF) + return x; + + return term; +} static void put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse, @@ -7729,26 +8174,10 @@ print_reg (rtx x, int code, FILE *file) so that we can print its name in some tls_local_dynamic_base pattern. */ -static const char * -get_some_local_dynamic_name (void) +static int +get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED) { - rtx insn; - - if (cfun->machine->some_ld_name) - return cfun->machine->some_ld_name; - - for (insn = get_insns (); insn ; insn = NEXT_INSN (insn)) - if (INSN_P (insn) - && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0)) - return cfun->machine->some_ld_name; - - gcc_unreachable (); -} - -static int -get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED) -{ - rtx x = *px; + rtx x = *px; if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC) @@ -7760,6 +8189,22 @@ get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED) return 0; } +static const char * +get_some_local_dynamic_name (void) +{ + rtx insn; + + if (cfun->machine->some_ld_name) + return cfun->machine->some_ld_name; + + for (insn = get_insns (); insn ; insn = NEXT_INSN (insn)) + if (INSN_P (insn) + && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0)) + return cfun->machine->some_ld_name; + + gcc_unreachable (); +} + /* Meaning of CODE: L,W,B,Q,S,T -- print the opcode suffix for specified size of operand. C -- print opcode suffix for set/cmov insn. @@ -7814,7 +8259,7 @@ print_operand (FILE *file, rtx x, int code) case ASM_INTEL: /* Intel syntax. For absolute addresses, registers should not be surrounded by braces. */ - if (GET_CODE (x) != REG) + if (!REG_P (x)) { putc ('[', file); PRINT_OPERAND (file, x, 0); @@ -7874,6 +8319,10 @@ print_operand (FILE *file, rtx x, int code) /* This is the size of op from size of operand. */ switch (GET_MODE_SIZE (GET_MODE (x))) { + case 1: + putc ('b', file); + return; + case 2: #ifdef HAVE_GAS_FILDS_FISTS putc ('s', file); @@ -7924,7 +8373,7 @@ print_operand (FILE *file, rtx x, int code) break; case 's': - if (GET_CODE (x) == CONST_INT || ! SHIFT_DOUBLE_OMITS_COUNT) + if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT) { PRINT_OPERAND (file, x, 0); putc (',', file); @@ -8062,10 +8511,10 @@ print_operand (FILE *file, rtx x, int code) } } - if (GET_CODE (x) == REG) + if (REG_P (x)) print_reg (x, code, file); - else if (GET_CODE (x) == MEM) + else if (MEM_P (x)) { /* No `byte ptr' prefix for call instructions. 
*/ if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P') @@ -8098,7 +8547,7 @@ print_operand (FILE *file, rtx x, int code) x = XEXP (x, 0); /* Avoid (%rip) for call operands. */ if (CONSTANT_ADDRESS_P (x) && code == 'P' - && GET_CODE (x) != CONST_INT) + && !CONST_INT_P (x)) output_addr_const (file, x); else if (this_is_asm_operands && ! address_operand (x, VOIDmode)) output_operand_lossage ("invalid constraints for operand"); @@ -8150,7 +8599,7 @@ print_operand (FILE *file, rtx x, int code) if (code != 'P') { - if (GET_CODE (x) == CONST_INT || GET_CODE (x) == CONST_DOUBLE) + if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE) { if (ASSEMBLER_DIALECT == ASM_ATT) putc ('$', file); @@ -8164,7 +8613,7 @@ print_operand (FILE *file, rtx x, int code) fputs ("OFFSET FLAT:", file); } } - if (GET_CODE (x) == CONST_INT) + if (CONST_INT_P (x)) fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); else if (flag_pic) output_pic_addr_const (file, x, code); @@ -8208,7 +8657,7 @@ print_operand_address (FILE *file, rtx addr) { /* Displacement only requires special attention. */ - if (GET_CODE (disp) == CONST_INT) + if (CONST_INT_P (disp)) { if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT) { @@ -8228,7 +8677,7 @@ print_operand_address (FILE *file, rtx addr) { if (GET_CODE (disp) == CONST && GET_CODE (XEXP (disp, 0)) == PLUS - && GET_CODE (XEXP (XEXP (disp, 0), 1)) == CONST_INT) + && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) disp = XEXP (XEXP (disp, 0), 0); if (GET_CODE (disp) == LABEL_REF || (GET_CODE (disp) == SYMBOL_REF @@ -8271,7 +8720,7 @@ print_operand_address (FILE *file, rtx addr) /* Pull out the offset of a symbol; print any symbol itself. */ if (GET_CODE (disp) == CONST && GET_CODE (XEXP (disp, 0)) == PLUS - && GET_CODE (XEXP (XEXP (disp, 0), 1)) == CONST_INT) + && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) { offset = XEXP (XEXP (disp, 0), 1); disp = gen_rtx_CONST (VOIDmode, @@ -8282,7 +8731,7 @@ print_operand_address (FILE *file, rtx addr) output_pic_addr_const (file, disp, 0); else if (GET_CODE (disp) == LABEL_REF) output_asm_label (disp); - else if (GET_CODE (disp) == CONST_INT) + else if (CONST_INT_P (disp)) offset = disp; else output_addr_const (file, disp); @@ -8381,7 +8830,7 @@ split_di (rtx operands[], int num, rtx lo_half[], rtx hi_half[]) /* simplify_subreg refuse to split volatile memory addresses, but we still have to handle it. */ - if (GET_CODE (op) == MEM) + if (MEM_P (op)) { lo_half[num] = adjust_address (op, SImode, 0); hi_half[num] = adjust_address (op, SImode, 4); @@ -8412,7 +8861,7 @@ split_ti (rtx operands[], int num, rtx lo_half[], rtx hi_half[]) /* simplify_subreg refuse to split volatile memory addresses, but we still have to handle it. */ - if (GET_CODE (op) == MEM) + if (MEM_P (op)) { lo_half[num] = adjust_address (op, DImode, 0); hi_half[num] = adjust_address (op, DImode, 8); @@ -8456,10 +8905,10 @@ output_387_binary_op (rtx insn, rtx *operands) if (STACK_REG_P (operands[0]) && ((REG_P (operands[1]) && REGNO (operands[0]) == REGNO (operands[1]) - && (STACK_REG_P (operands[2]) || GET_CODE (operands[2]) == MEM)) + && (STACK_REG_P (operands[2]) || MEM_P (operands[2]))) || (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]) - && (STACK_REG_P (operands[1]) || GET_CODE (operands[1]) == MEM))) + && (STACK_REG_P (operands[1]) || MEM_P (operands[1])))) && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2]))) ; /* ok */ else @@ -8532,7 +8981,7 @@ output_387_binary_op (rtx insn, rtx *operands) /* know operands[0] == operands[1]. 
*/ - if (GET_CODE (operands[2]) == MEM) + if (MEM_P (operands[2])) { p = "%z2\t%2"; break; @@ -8562,13 +9011,13 @@ output_387_binary_op (rtx insn, rtx *operands) case MINUS: case DIV: - if (GET_CODE (operands[1]) == MEM) + if (MEM_P (operands[1])) { p = "r%z1\t%1"; break; } - if (GET_CODE (operands[2]) == MEM) + if (MEM_P (operands[2])) { p = "%z2\t%2"; break; @@ -8807,7 +9256,8 @@ output_fix_trunc (rtx insn, rtx *operands, int fisttp) output_asm_insn ("fld\t%y1", operands); gcc_assert (STACK_TOP_P (operands[1])); - gcc_assert (GET_CODE (operands[0]) == MEM); + gcc_assert (MEM_P (operands[0])); + gcc_assert (GET_MODE (operands[1]) != TFmode); if (fisttp) output_asm_insn ("fisttp%z0\t%0", operands); @@ -8840,7 +9290,7 @@ output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno) { static char retval[] = ".word\t0xc_df"; int regno = REGNO (operands[opno]); - + gcc_assert (FP_REGNO_P (regno)); retval[9] = '0' + (regno - FIRST_STACK_REG); @@ -8990,9 +9440,18 @@ ix86_output_addr_vec_elt (FILE *file, int value) void ix86_output_addr_diff_elt (FILE *file, int value, int rel) { - if (TARGET_64BIT) + const char *directive = ASM_LONG; + +#ifdef ASM_QUAD + if (TARGET_64BIT && CASE_VECTOR_MODE == DImode) + directive = ASM_QUAD; +#else + gcc_assert (!TARGET_64BIT); +#endif + /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */ + if (TARGET_64BIT || TARGET_VXWORKS_RTP) fprintf (file, "%s%s%d-%s%d\n", - ASM_LONG, LPREFIX, value, LPREFIX, rel); + directive, LPREFIX, value, LPREFIX, rel); else if (HAVE_AS_GOTOFF_IN_DATA) fprintf (file, "%s%s%d@GOTOFF\n", ASM_LONG, LPREFIX, value); #if TARGET_MACHO @@ -9022,7 +9481,6 @@ ix86_expand_clear (rtx dest) /* Avoid HImode and its attendant prefix byte. */ if (GET_MODE_SIZE (GET_MODE (dest)) < 4) dest = gen_rtx_REG (SImode, REGNO (dest)); - tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx); /* This predicate should match that for movsi_xor and movdi_xor_rex64. */ @@ -9069,20 +9527,31 @@ ix86_expand_move (enum machine_mode mode, rtx operands[]) if (op1 == op0) return; } + else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES + && SYMBOL_REF_DLLIMPORT_P (op1)) + op1 = legitimize_dllimport_symbol (op1, false); } else if (GET_CODE (op1) == CONST && GET_CODE (XEXP (op1, 0)) == PLUS && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF) { - model = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (op1, 0), 0)); + rtx addend = XEXP (XEXP (op1, 0), 1); + rtx symbol = XEXP (XEXP (op1, 0), 0); + rtx tmp = NULL; + + model = SYMBOL_REF_TLS_MODEL (symbol); if (model) + tmp = legitimize_tls_address (symbol, model, true); + else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES + && SYMBOL_REF_DLLIMPORT_P (symbol)) + tmp = legitimize_dllimport_symbol (symbol, true); + + if (tmp) { - rtx addend = XEXP (XEXP (op1, 0), 1); - op1 = legitimize_tls_address (XEXP (XEXP (op1, 0), 0), model, true); - op1 = force_operand (op1, NULL); - op1 = expand_simple_binop (Pmode, PLUS, op1, addend, + tmp = force_operand (tmp, NULL); + tmp = expand_simple_binop (Pmode, PLUS, tmp, addend, op0, 1, OPTAB_DIRECT); - if (op1 == op0) + if (tmp == op0) return; } } @@ -9095,7 +9564,7 @@ ix86_expand_move (enum machine_mode mode, rtx operands[]) if (MACHOPIC_PURE) { rtx temp = ((reload_in_progress - || ((op0 && GET_CODE (op0) == REG) + || ((op0 && REG_P (op0)) && mode == Pmode)) ? 
op0 : gen_reg_rtx (Pmode));
	  op1 = machopic_indirect_data_reference (op1, temp);
@@ -9110,18 +9579,23 @@
	}
      else
	{
-	  if (GET_CODE (op0) == MEM)
+	  if (MEM_P (op0))
	    op1 = force_reg (Pmode, op1);
-	  else
-	    op1 = legitimize_address (op1, op1, Pmode);
+	  else if (!TARGET_64BIT || !x86_64_movabs_operand (op1, Pmode))
+	    {
+	      rtx reg = no_new_pseudos ? op0 : NULL_RTX;
+	      op1 = legitimize_pic_address (op1, reg);
+	      if (op0 == op1)
+		return;
+	    }
	}
    }
  else
    {
-      if (GET_CODE (op0) == MEM
+      if (MEM_P (op0)
	  && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
	      || !push_operand (op0, mode))
-	  && GET_CODE (op1) == MEM)
+	  && MEM_P (op1))
	op1 = force_reg (mode, op1);
 
      if (push_operand (op0, mode)
@@ -9191,6 +9665,55 @@ ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
 
 /* Implement the movmisalign patterns for SSE.  Non-SSE modes go
    straight to ix86_expand_vector_move.  */
+/* Code generation for scalar reg-reg moves of single and double precision data:
+     if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
+       movaps reg, reg
+     else
+       movss reg, reg
+     if (x86_sse_partial_reg_dependency == true)
+       movapd reg, reg
+     else
+       movsd reg, reg
+
+   Code generation for scalar loads of double precision data:
+     if (x86_sse_split_regs == true)
+       movlpd mem, reg      (gas syntax)
+     else
+       movsd mem, reg
+
+   Code generation for unaligned packed loads of single precision data
+   (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
+     if (x86_sse_unaligned_move_optimal)
+       movups mem, reg
+
+     if (x86_sse_partial_reg_dependency == true)
+       {
+         xorps  reg, reg
+         movlps mem, reg
+         movhps mem+8, reg
+       }
+     else
+       {
+         movlps mem, reg
+         movhps mem+8, reg
+       }
+
+   Code generation for unaligned packed loads of double precision data
+   (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
+     if (x86_sse_unaligned_move_optimal)
+       movupd mem, reg
+
+     if (x86_sse_split_regs == true)
+       {
+         movlpd mem, reg
+         movhpd mem+8, reg
+       }
+     else
+       {
+         movsd  mem, reg
+         movhpd mem+8, reg
+       }
+ */
 
 void
 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
@@ -9223,8 +9746,16 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
    }
 
  if (TARGET_SSE2 && mode == V2DFmode)
-    {
-      rtx zero;
+    {
+      rtx zero;
+
+      if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL)
+	{
+	  op0 = gen_lowpart (V2DFmode, op0);
+	  op1 = gen_lowpart (V2DFmode, op1);
+	  emit_insn (gen_sse2_movupd (op0, op1));
+	  return;
+	}
 
      /* When SSE registers are split into halves, we can avoid
	 writing to the top half twice.  */
@@ -9252,7 +9783,15 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
	  emit_insn (gen_sse2_loadhpd (op0, op0, m));
	}
      else
-	{
+	{
+	  if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL)
+	    {
+	      op0 = gen_lowpart (V4SFmode, op0);
+	      op1 = gen_lowpart (V4SFmode, op1);
+	      emit_insn (gen_sse_movups (op0, op1));
+	      return;
+	    }
+
	  if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
	    emit_move_insn (op0, CONST0_RTX (mode));
	  else
@@ -9328,6 +9867,43 @@ ix86_expand_push (enum machine_mode mode, rtx x)
  emit_move_insn (tmp, x);
 }
 
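The unaligned-load strategies enumerated in the comment above map directly onto SSE2 intrinsics. A sketch of the two V2DF variants, not part of the patch:

  /* Sketch: one movupd versus the movsd+movhpd split described above.  */
  #include <emmintrin.h>

  __m128d
  load_movupd (const double *p)		/* single unaligned load */
  {
    return _mm_loadu_pd (p);
  }

  __m128d
  load_split (const double *p)		/* for split-regs CPUs */
  {
    __m128d x = _mm_load_sd (p);	/* low half; upper half zeroed */
    return _mm_loadh_pd (x, p + 1);	/* high half */
  }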
*/ + if (GET_RTX_CLASS (code) != RTX_COMM_ARITH) + return false; + + /* Highest priority is that src1 should match dst. */ + if (rtx_equal_p (dst, src1)) + return false; + if (rtx_equal_p (dst, src2)) + return true; + + /* Next highest priority is that immediate constants come second. */ + if (immediate_operand (src2, mode)) + return false; + if (immediate_operand (src1, mode)) + return true; + + /* Lowest priority is that memory references should come second. */ + if (MEM_P (src2)) + return false; + if (MEM_P (src1)) + return true; + + return false; +} + + /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the destination to use for the operation. If different from the true destination in operands[0], a copy operation will be required. */ @@ -9336,55 +9912,46 @@ rtx ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode, rtx operands[]) { - int matching_memory; - rtx src1, src2, dst; - - dst = operands[0]; - src1 = operands[1]; - src2 = operands[2]; + rtx dst = operands[0]; + rtx src1 = operands[1]; + rtx src2 = operands[2]; - /* Recognize <var1> = <var2> <op> <var3> for commutative operators */ - if (GET_RTX_CLASS (code) == RTX_COMM_ARITH - && (rtx_equal_p (dst, src2) - || immediate_operand (src1, mode))) + /* Canonicalize operand order. */ + if (ix86_swap_binary_operands_p (code, mode, operands)) { rtx temp = src1; src1 = src2; src2 = temp; } - /* If the destination is memory, and we do not have matching source - operands, do things in registers. */ - matching_memory = 0; - if (GET_CODE (dst) == MEM) - { - if (rtx_equal_p (dst, src1)) - matching_memory = 1; - else if (GET_RTX_CLASS (code) == RTX_COMM_ARITH - && rtx_equal_p (dst, src2)) - matching_memory = 2; - else - dst = gen_reg_rtx (mode); - } - /* Both source operands cannot be in memory. */ - if (GET_CODE (src1) == MEM && GET_CODE (src2) == MEM) + if (MEM_P (src1) && MEM_P (src2)) { - if (matching_memory != 2) - src2 = force_reg (mode, src2); + /* Optimization: Only read from memory once. */ + if (rtx_equal_p (src1, src2)) + { + src2 = force_reg (mode, src2); + src1 = src2; + } else - src1 = force_reg (mode, src1); + src2 = force_reg (mode, src2); } - /* If the operation is not commutable, source 1 cannot be a constant - or non-matching memory. */ - if ((CONSTANT_P (src1) - || (!matching_memory && GET_CODE (src1) == MEM)) - && GET_RTX_CLASS (code) != RTX_COMM_ARITH) + /* If the destination is memory, and we do not have matching source + operands, do things in registers. */ + if (MEM_P (dst) && !rtx_equal_p (dst, src1)) + dst = gen_reg_rtx (mode); + + /* Source 1 cannot be a constant. */ + if (CONSTANT_P (src1)) src1 = force_reg (mode, src1); - src1 = operands[1] = src1; - src2 = operands[2] = src2; + /* Source 1 cannot be a non-matching memory. */ + if (MEM_P (src1) && !rtx_equal_p (dst, src1)) + src1 = force_reg (mode, src1); + + operands[1] = src1; + operands[2] = src2; return dst; } @@ -9438,28 +10005,37 @@ ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode, appropriate constraints. */ int -ix86_binary_operator_ok (enum rtx_code code, - enum machine_mode mode ATTRIBUTE_UNUSED, +ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode, rtx operands[3]) { + rtx dst = operands[0]; + rtx src1 = operands[1]; + rtx src2 = operands[2]; + /* Both source operands cannot be in memory. */ - if (GET_CODE (operands[1]) == MEM && GET_CODE (operands[2]) == MEM) - return 0; - /* If the operation is not commutable, source 1 cannot be a constant. 
*/ - if (CONSTANT_P (operands[1]) && GET_RTX_CLASS (code) != RTX_COMM_ARITH) + if (MEM_P (src1) && MEM_P (src2)) return 0; + + /* Canonicalize operand order for commutative operators. */ + if (ix86_swap_binary_operands_p (code, mode, operands)) + { + rtx temp = src1; + src1 = src2; + src2 = temp; + } + /* If the destination is memory, we must have a matching source operand. */ - if (GET_CODE (operands[0]) == MEM - && ! (rtx_equal_p (operands[0], operands[1]) - || (GET_RTX_CLASS (code) == RTX_COMM_ARITH - && rtx_equal_p (operands[0], operands[2])))) + if (MEM_P (dst) && !rtx_equal_p (dst, src1)) + return 0; + + /* Source 1 cannot be a constant. */ + if (CONSTANT_P (src1)) return 0; - /* If the operation is not commutable and the source 1 is memory, we must - have a matching destination. */ - if (GET_CODE (operands[1]) == MEM - && GET_RTX_CLASS (code) != RTX_COMM_ARITH - && ! rtx_equal_p (operands[0], operands[1])) + + /* Source 1 cannot be a non-matching memory. */ + if (MEM_P (src1) && !rtx_equal_p (dst, src1)) return 0; + return 1; } @@ -9522,13 +10098,246 @@ ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED, rtx operands[2] ATTRIBUTE_UNUSED) { /* If one of operands is memory, source and destination must match. */ - if ((GET_CODE (operands[0]) == MEM - || GET_CODE (operands[1]) == MEM) + if ((MEM_P (operands[0]) + || MEM_P (operands[1])) && ! rtx_equal_p (operands[0], operands[1])) return FALSE; return TRUE; } +/* Post-reload splitter for converting an SF or DFmode value in an + SSE register into an unsigned SImode. */ + +void +ix86_split_convert_uns_si_sse (rtx operands[]) +{ + enum machine_mode vecmode; + rtx value, large, zero_or_two31, input, two31, x; + + large = operands[1]; + zero_or_two31 = operands[2]; + input = operands[3]; + two31 = operands[4]; + vecmode = GET_MODE (large); + value = gen_rtx_REG (vecmode, REGNO (operands[0])); + + /* Load up the value into the low element. We must ensure that the other + elements are valid floats -- zero is the easiest such value. */ + if (MEM_P (input)) + { + if (vecmode == V4SFmode) + emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input)); + else + emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input)); + } + else + { + input = gen_rtx_REG (vecmode, REGNO (input)); + emit_move_insn (value, CONST0_RTX (vecmode)); + if (vecmode == V4SFmode) + emit_insn (gen_sse_movss (value, value, input)); + else + emit_insn (gen_sse2_movsd (value, value, input)); + } + + emit_move_insn (large, two31); + emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31); + + x = gen_rtx_fmt_ee (LE, vecmode, large, value); + emit_insn (gen_rtx_SET (VOIDmode, large, x)); + + x = gen_rtx_AND (vecmode, zero_or_two31, large); + emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x)); + + x = gen_rtx_MINUS (vecmode, value, zero_or_two31); + emit_insn (gen_rtx_SET (VOIDmode, value, x)); + + large = gen_rtx_REG (V4SImode, REGNO (large)); + emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31))); + + x = gen_rtx_REG (V4SImode, REGNO (value)); + if (vecmode == V4SFmode) + emit_insn (gen_sse2_cvttps2dq (x, value)); + else + emit_insn (gen_sse2_cvttpd2dq (x, value)); + value = x; + + emit_insn (gen_xorv4si3 (value, value, large)); +} + +/* Convert an unsigned DImode value into a DFmode, using only SSE. + Expects the 64-bit DImode to be supplied in a pair of integral + registers. Requires SSE2; will use SSE3 if available. For x86_32, + -mfpmath=sse, !optimize_size only. 
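+
+   A rough editorial sketch of the same bias trick in plain C (not part
+   of the patch; assumes IEEE-754 doubles and C99 hex-float literals):
+
+     #include <stdint.h>
+     #include <string.h>
+
+     double uns64_to_double (uint64_t x)
+     {
+       uint64_t lo_bits = 0x4330000000000000ULL | (x & 0xffffffffULL);
+       uint64_t hi_bits = 0x4530000000000000ULL | (x >> 32);
+       double lo, hi;
+       memcpy (&lo, &lo_bits, sizeof lo);  // value 0x1.0p52 + lo32
+       memcpy (&hi, &hi_bits, sizeof hi);  // value 0x1.0p84 + hi32 * 2^32
+       return (lo - 0x1.0p52) + (hi - 0x1.0p84);
+     }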
*/ + +void +ix86_expand_convert_uns_didf_sse (rtx target, rtx input) +{ + REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt; + rtx int_xmm, fp_xmm; + rtx biases, exponents; + rtx x; + + int_xmm = gen_reg_rtx (V4SImode); + if (TARGET_INTER_UNIT_MOVES) + emit_insn (gen_movdi_to_sse (int_xmm, input)); + else if (TARGET_SSE_SPLIT_REGS) + { + emit_insn (gen_rtx_CLOBBER (VOIDmode, int_xmm)); + emit_move_insn (gen_lowpart (DImode, int_xmm), input); + } + else + { + x = gen_reg_rtx (V2DImode); + ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0); + emit_move_insn (int_xmm, gen_lowpart (V4SImode, x)); + } + + x = gen_rtx_CONST_VECTOR (V4SImode, + gen_rtvec (4, GEN_INT (0x43300000UL), + GEN_INT (0x45300000UL), + const0_rtx, const0_rtx)); + exponents = validize_mem (force_const_mem (V4SImode, x)); + + /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */ + emit_insn (gen_sse2_punpckldq (int_xmm, int_xmm, exponents)); + + /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm) + yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)). + Similarly (0x45300000UL ## fp_value_hi_xmm) yields + (0x1.0p84 + double(fp_value_hi_xmm)). + Note these exponents differ by 32. */ + + fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm)); + + /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values + in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */ + real_ldexp (&bias_lo_rvt, &dconst1, 52); + real_ldexp (&bias_hi_rvt, &dconst1, 84); + biases = const_double_from_real_value (bias_lo_rvt, DFmode); + x = const_double_from_real_value (bias_hi_rvt, DFmode); + biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x)); + biases = validize_mem (force_const_mem (V2DFmode, biases)); + emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases)); + + /* Add the upper and lower DFmode values together. */ + if (TARGET_SSE3) + emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm)); + else + { + x = copy_to_mode_reg (V2DFmode, fp_xmm); + emit_insn (gen_sse2_unpckhpd (fp_xmm, fp_xmm, fp_xmm)); + emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x)); + } + + ix86_expand_vector_extract (false, target, fp_xmm, 0); +} + +/* Convert an unsigned SImode value into a DFmode. Only currently used + for SSE, but applicable anywhere. */ + +void +ix86_expand_convert_uns_sidf_sse (rtx target, rtx input) +{ + REAL_VALUE_TYPE TWO31r; + rtx x, fp; + + x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1), + NULL, 1, OPTAB_DIRECT); + + fp = gen_reg_rtx (DFmode); + emit_insn (gen_floatsidf2 (fp, x)); + + real_ldexp (&TWO31r, &dconst1, 31); + x = const_double_from_real_value (TWO31r, DFmode); + + x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT); + if (x != target) + emit_move_insn (target, x); +} + +/* Convert a signed DImode value into a DFmode. Only used for SSE in + 32-bit mode; otherwise we have a direct convert instruction. 
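+
+   Editorial illustration (not part of the patch) of the decomposition
+   used below, in plain C with <stdint.h> types; an arithmetic right
+   shift of the signed input is assumed:
+
+     double sign64_to_double (int64_t x)
+     {
+       double hi = (double) (int32_t) (x >> 32);  // signed high word
+       double lo = (double) (uint32_t) x;         // unsigned low word
+       return hi * 0x1.0p32 + lo;                 // hi * 2^32 + lo
+     }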
*/ + +void +ix86_expand_convert_sign_didf_sse (rtx target, rtx input) +{ + REAL_VALUE_TYPE TWO32r; + rtx fp_lo, fp_hi, x; + + fp_lo = gen_reg_rtx (DFmode); + fp_hi = gen_reg_rtx (DFmode); + + emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input))); + + real_ldexp (&TWO32r, &dconst1, 32); + x = const_double_from_real_value (TWO32r, DFmode); + fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT); + + ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input)); + + x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target, + 0, OPTAB_DIRECT); + if (x != target) + emit_move_insn (target, x); +} + +/* Convert an unsigned SImode value into a SFmode, using only SSE. + For x86_32, -mfpmath=sse, !optimize_size only. */ +void +ix86_expand_convert_uns_sisf_sse (rtx target, rtx input) +{ + REAL_VALUE_TYPE ONE16r; + rtx fp_hi, fp_lo, int_hi, int_lo, x; + + real_ldexp (&ONE16r, &dconst1, 16); + x = const_double_from_real_value (ONE16r, SFmode); + int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff), + NULL, 0, OPTAB_DIRECT); + int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16), + NULL, 0, OPTAB_DIRECT); + fp_hi = gen_reg_rtx (SFmode); + fp_lo = gen_reg_rtx (SFmode); + emit_insn (gen_floatsisf2 (fp_hi, int_hi)); + emit_insn (gen_floatsisf2 (fp_lo, int_lo)); + fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi, + 0, OPTAB_DIRECT); + fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target, + 0, OPTAB_DIRECT); + if (!rtx_equal_p (target, fp_hi)) + emit_move_insn (target, fp_hi); +} + +/* A subroutine of ix86_build_signbit_mask_vector. If VECT is true, + then replicate the value for all elements of the vector + register. */ + +rtx +ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value) +{ + rtvec v; + switch (mode) + { + case SFmode: + if (vect) + v = gen_rtvec (4, value, value, value, value); + else + v = gen_rtvec (4, value, CONST0_RTX (SFmode), + CONST0_RTX (SFmode), CONST0_RTX (SFmode)); + return gen_rtx_CONST_VECTOR (V4SFmode, v); + + case DFmode: + if (vect) + v = gen_rtvec (2, value, value); + else + v = gen_rtvec (2, value, CONST0_RTX (DFmode)); + return gen_rtx_CONST_VECTOR (V2DFmode, v); + + default: + gcc_unreachable (); + } +} + /* A subroutine of ix86_expand_fp_absneg_operator and copysign expanders. Create a mask for the sign bit in MODE for an SSE register. If VECT is true, then replicate the mask for all elements of the vector register. @@ -9540,7 +10349,7 @@ ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert) enum machine_mode vec_mode; HOST_WIDE_INT hi, lo; int shift = 63; - rtvec v; + rtx v; rtx mask; /* Find the sign bit, sign extended to 2*HWI. */ @@ -9558,25 +10367,9 @@ ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert) mask = immed_double_const (lo, hi, mode == SFmode ? SImode : DImode); mask = gen_lowpart (mode, mask); - if (mode == SFmode) - { - if (vect) - v = gen_rtvec (4, mask, mask, mask, mask); - else - v = gen_rtvec (4, mask, CONST0_RTX (SFmode), - CONST0_RTX (SFmode), CONST0_RTX (SFmode)); - vec_mode = V4SFmode; - } - else - { - if (vect) - v = gen_rtvec (2, mask, mask); - else - v = gen_rtvec (2, mask, CONST0_RTX (DFmode)); - vec_mode = V2DFmode; - } - - return force_reg (vec_mode, gen_rtx_CONST_VECTOR (vec_mode, v)); + v = ix86_build_const_vector (mode, vect, mask); + vec_mode = (mode == SFmode) ? V4SFmode : V2DFmode; + return force_reg (vec_mode, v); } /* Generate code for floating point ABS or NEG. 
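   (Editorial aside, not part of this change: with the mask built above,
   NEG becomes a single xorps/xorpd with the sign-bit mask and ABS a
   single andps/andpd with the inverted mask; on the bit pattern of a
   float this is simply

     bits ^= 0x80000000u;   // negate: flip the sign bit
     bits &= 0x7fffffffu;   // abs: clear the sign bit
   )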
*/ @@ -9886,8 +10679,14 @@ ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED) enum machine_mode ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1) { - if (SCALAR_FLOAT_MODE_P (GET_MODE (op0))) - return ix86_fp_compare_mode (code); + enum machine_mode mode = GET_MODE (op0); + + if (SCALAR_FLOAT_MODE_P (mode)) + { + gcc_assert (!DECIMAL_FLOAT_MODE_P (mode)); + return ix86_fp_compare_mode (code); + } + switch (code) { /* Only zero flag is needed. */ @@ -9986,153 +10785,28 @@ ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2) } } -/* Return true if we should use an FCOMI instruction for this fp comparison. */ +/* Split comparison code CODE into comparisons we can do using branch + instructions. BYPASS_CODE is comparison code for branch that will + branch around FIRST_CODE and SECOND_CODE. If some of branches + is not required, set value to UNKNOWN. + We never require more than two branches. */ -int -ix86_use_fcomi_compare (enum rtx_code code ATTRIBUTE_UNUSED) +void +ix86_fp_comparison_codes (enum rtx_code code, enum rtx_code *bypass_code, + enum rtx_code *first_code, + enum rtx_code *second_code) { - enum rtx_code swapped_code = swap_condition (code); - return ((ix86_fp_comparison_cost (code) == ix86_fp_comparison_fcomi_cost (code)) - || (ix86_fp_comparison_cost (swapped_code) - == ix86_fp_comparison_fcomi_cost (swapped_code))); -} + *first_code = code; + *bypass_code = UNKNOWN; + *second_code = UNKNOWN; -/* Swap, force into registers, or otherwise massage the two operands - to a fp comparison. The operands are updated in place; the new - comparison code is returned. */ + /* The fcomi comparison sets flags as follows: -static enum rtx_code -ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1) -{ - enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code); - rtx op0 = *pop0, op1 = *pop1; - enum machine_mode op_mode = GET_MODE (op0); - int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode); - - /* All of the unordered compare instructions only work on registers. - The same is true of the fcomi compare instructions. The XFmode - compare instructions require registers except when comparing - against zero or when converting operand 1 from fixed point to - floating point. */ - - if (!is_sse - && (fpcmp_mode == CCFPUmode - || (op_mode == XFmode - && ! (standard_80387_constant_p (op0) == 1 - || standard_80387_constant_p (op1) == 1) - && GET_CODE (op1) != FLOAT) - || ix86_use_fcomi_compare (code))) - { - op0 = force_reg (op_mode, op0); - op1 = force_reg (op_mode, op1); - } - else - { - /* %%% We only allow op1 in memory; op0 must be st(0). So swap - things around if they appear profitable, otherwise force op0 - into a register. */ - - if (standard_80387_constant_p (op0) == 0 - || (GET_CODE (op0) == MEM - && ! (standard_80387_constant_p (op1) == 0 - || GET_CODE (op1) == MEM))) - { - rtx tmp; - tmp = op0, op0 = op1, op1 = tmp; - code = swap_condition (code); - } - - if (GET_CODE (op0) != REG) - op0 = force_reg (op_mode, op0); - - if (CONSTANT_P (op1)) - { - int tmp = standard_80387_constant_p (op1); - if (tmp == 0) - op1 = validize_mem (force_const_mem (op_mode, op1)); - else if (tmp == 1) - { - if (TARGET_CMOVE) - op1 = force_reg (op_mode, op1); - } - else - op1 = force_reg (op_mode, op1); - } - } - - /* Try to rearrange the comparison to make it cheaper. 
*/ - if (ix86_fp_comparison_cost (code) - > ix86_fp_comparison_cost (swap_condition (code)) - && (GET_CODE (op1) == REG || !no_new_pseudos)) - { - rtx tmp; - tmp = op0, op0 = op1, op1 = tmp; - code = swap_condition (code); - if (GET_CODE (op0) != REG) - op0 = force_reg (op_mode, op0); - } - - *pop0 = op0; - *pop1 = op1; - return code; -} - -/* Convert comparison codes we use to represent FP comparison to integer - code that will result in proper branch. Return UNKNOWN if no such code - is available. */ - -enum rtx_code -ix86_fp_compare_code_to_integer (enum rtx_code code) -{ - switch (code) - { - case GT: - return GTU; - case GE: - return GEU; - case ORDERED: - case UNORDERED: - return code; - break; - case UNEQ: - return EQ; - break; - case UNLT: - return LTU; - break; - case UNLE: - return LEU; - break; - case LTGT: - return NE; - break; - default: - return UNKNOWN; - } -} - -/* Split comparison code CODE into comparisons we can do using branch - instructions. BYPASS_CODE is comparison code for branch that will - branch around FIRST_CODE and SECOND_CODE. If some of branches - is not required, set value to UNKNOWN. - We never require more than two branches. */ - -void -ix86_fp_comparison_codes (enum rtx_code code, enum rtx_code *bypass_code, - enum rtx_code *first_code, - enum rtx_code *second_code) -{ - *first_code = code; - *bypass_code = UNKNOWN; - *second_code = UNKNOWN; - - /* The fcomi comparison sets flags as follows: - - cmp ZF PF CF - > 0 0 0 - < 0 0 1 - = 1 0 0 - un 1 1 1 */ + cmp ZF PF CF + > 0 0 0 + < 0 0 1 + = 1 0 0 + un 1 1 1 */ switch (code) { @@ -10238,7 +10912,7 @@ ix86_fp_comparison_sahf_cost (enum rtx_code code) enum rtx_code bypass_code, first_code, second_code; /* Return arbitrarily high cost when instruction is not preferred - this avoids gcc from using it. */ - if (!TARGET_USE_SAHF && !optimize_size) + if (!(TARGET_SAHF && (TARGET_USE_SAHF || optimize_size))) return 1024; ix86_fp_comparison_codes (code, &bypass_code, &first_code, &second_code); return (bypass_code != UNKNOWN || second_code != UNKNOWN) + 3; @@ -10263,6 +10937,134 @@ ix86_fp_comparison_cost (enum rtx_code code) return min; } +/* Return true if we should use an FCOMI instruction for this + fp comparison. */ + +int +ix86_use_fcomi_compare (enum rtx_code code ATTRIBUTE_UNUSED) +{ + enum rtx_code swapped_code = swap_condition (code); + + return ((ix86_fp_comparison_cost (code) + == ix86_fp_comparison_fcomi_cost (code)) + || (ix86_fp_comparison_cost (swapped_code) + == ix86_fp_comparison_fcomi_cost (swapped_code))); +} + +/* Swap, force into registers, or otherwise massage the two operands + to a fp comparison. The operands are updated in place; the new + comparison code is returned. */ + +static enum rtx_code +ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1) +{ + enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code); + rtx op0 = *pop0, op1 = *pop1; + enum machine_mode op_mode = GET_MODE (op0); + int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode); + + /* All of the unordered compare instructions only work on registers. + The same is true of the fcomi compare instructions. The XFmode + compare instructions require registers except when comparing + against zero or when converting operand 1 from fixed point to + floating point. */ + + if (!is_sse + && (fpcmp_mode == CCFPUmode + || (op_mode == XFmode + && ! 
(standard_80387_constant_p (op0) == 1 + || standard_80387_constant_p (op1) == 1) + && GET_CODE (op1) != FLOAT) + || ix86_use_fcomi_compare (code))) + { + op0 = force_reg (op_mode, op0); + op1 = force_reg (op_mode, op1); + } + else + { + /* %%% We only allow op1 in memory; op0 must be st(0). So swap + things around if they appear profitable, otherwise force op0 + into a register. */ + + if (standard_80387_constant_p (op0) == 0 + || (MEM_P (op0) + && ! (standard_80387_constant_p (op1) == 0 + || MEM_P (op1)))) + { + rtx tmp; + tmp = op0, op0 = op1, op1 = tmp; + code = swap_condition (code); + } + + if (!REG_P (op0)) + op0 = force_reg (op_mode, op0); + + if (CONSTANT_P (op1)) + { + int tmp = standard_80387_constant_p (op1); + if (tmp == 0) + op1 = validize_mem (force_const_mem (op_mode, op1)); + else if (tmp == 1) + { + if (TARGET_CMOVE) + op1 = force_reg (op_mode, op1); + } + else + op1 = force_reg (op_mode, op1); + } + } + + /* Try to rearrange the comparison to make it cheaper. */ + if (ix86_fp_comparison_cost (code) + > ix86_fp_comparison_cost (swap_condition (code)) + && (REG_P (op1) || !no_new_pseudos)) + { + rtx tmp; + tmp = op0, op0 = op1, op1 = tmp; + code = swap_condition (code); + if (!REG_P (op0)) + op0 = force_reg (op_mode, op0); + } + + *pop0 = op0; + *pop1 = op1; + return code; +} + +/* Convert comparison codes we use to represent FP comparison to integer + code that will result in proper branch. Return UNKNOWN if no such code + is available. */ + +enum rtx_code +ix86_fp_compare_code_to_integer (enum rtx_code code) +{ + switch (code) + { + case GT: + return GTU; + case GE: + return GEU; + case ORDERED: + case UNORDERED: + return code; + break; + case UNEQ: + return EQ; + break; + case UNLT: + return LTU; + break; + case UNLE: + return LEU; + break; + case LTGT: + return NE; + break; + default: + return UNKNOWN; + } +} + /* Generate insn patterns to do a floating point compare of OPERANDS. */ static rtx @@ -10285,7 +11087,8 @@ ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch, ix86_fp_comparison_codes (code, &bypass_code, &first_code, &second_code); /* Do fcomi/sahf based test when profitable. */ - if ((bypass_code == UNKNOWN || bypass_test) + if ((TARGET_CMOVE || TARGET_SAHF) + && (bypass_code == UNKNOWN || bypass_test) && (second_code == UNKNOWN || second_test) && ix86_fp_comparison_arithmetics_cost (code) > cost) { @@ -10468,8 +11271,11 @@ ix86_expand_compare (enum rtx_code code, rtx *second_test, rtx *bypass_test) ix86_compare_emitted = NULL_RTX; } else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0))) - ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX, - second_test, bypass_test); + { + gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0))); + ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX, + second_test, bypass_test); + } else ret = ix86_expand_int_compare (code, op0, op1); @@ -10622,7 +11428,7 @@ ix86_expand_branch (enum rtx_code code, rtx label) op1 is a constant and the low word is zero, then we can just examine the high word. */ - if (GET_CODE (hi[1]) == CONST_INT && lo[1] == const0_rtx) + if (CONST_INT_P (hi[1]) && lo[1] == const0_rtx) switch (code) { case LT: case LTU: case GE: case GEU: @@ -10830,16 +11636,20 @@ ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop) enum machine_mode mode = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1); - /* Do not handle DImode compares that go through special path. Also we can't - deal with FP compares yet. This is possible to add. 
*/ + /* Do not handle DImode compares that go through special path. + Also we can't deal with FP compares yet. This is possible to add. */ if (mode == (TARGET_64BIT ? TImode : DImode)) return false; - if (FLOAT_MODE_P (mode)) + + if (SCALAR_FLOAT_MODE_P (mode)) { rtx second_test = NULL, bypass_test = NULL; rtx compare_op, compare_seq; - /* Shortcut: following common codes never translate into carry flag compares. */ + gcc_assert (!DECIMAL_FLOAT_MODE_P (mode)); + + /* Shortcut: following common codes never translate + into carry flag compares. */ if (code == EQ || code == NE || code == UNEQ || code == LTGT || code == ORDERED || code == UNORDERED) return false; @@ -10896,7 +11706,7 @@ ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop) /* Convert a>b into b=b-1. */ case GTU: case LEU: - if (GET_CODE (op1) == CONST_INT) + if (CONST_INT_P (op1)) { op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0)); /* Bail out on overflow. We still can swap operands but that @@ -10973,8 +11783,8 @@ ix86_expand_int_movcc (rtx operands[]) if ((mode != HImode || TARGET_FAST_PREFIX) && (mode != (TARGET_64BIT ? TImode : DImode)) - && GET_CODE (operands[2]) == CONST_INT - && GET_CODE (operands[3]) == CONST_INT) + && CONST_INT_P (operands[2]) + && CONST_INT_P (operands[3])) { rtx out = operands[0]; HOST_WIDE_INT ct = INTVAL (operands[2]); @@ -11128,11 +11938,16 @@ ix86_expand_int_movcc (rtx operands[]) if (diff < 0) { + enum machine_mode cmp_mode = GET_MODE (ix86_compare_op0); + HOST_WIDE_INT tmp; tmp = ct, ct = cf, cf = tmp; diff = -diff; - if (FLOAT_MODE_P (GET_MODE (ix86_compare_op0))) + + if (SCALAR_FLOAT_MODE_P (cmp_mode)) { + gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode)); + /* We may be reversing unordered compare to normal compare, that is not valid in general (we may convert non-trapping condition to trapping one), however on i386 we currently emit all @@ -11149,7 +11964,7 @@ ix86_expand_int_movcc (rtx operands[]) compare_code = UNKNOWN; if (GET_MODE_CLASS (GET_MODE (ix86_compare_op0)) == MODE_INT - && GET_CODE (ix86_compare_op1) == CONST_INT) + && CONST_INT_P (ix86_compare_op1)) { if (ix86_compare_op1 == const0_rtx && (code == LT || code == GE)) @@ -11281,14 +12096,21 @@ ix86_expand_int_movcc (rtx operands[]) { if (cf == 0) { + enum machine_mode cmp_mode = GET_MODE (ix86_compare_op0); + cf = ct; ct = 0; - if (FLOAT_MODE_P (GET_MODE (ix86_compare_op0))) - /* We may be reversing unordered compare to normal compare, - that is not valid in general (we may convert non-trapping - condition to trapping one), however on i386 we currently - emit all comparisons unordered. */ - code = reverse_condition_maybe_unordered (code); + + if (SCALAR_FLOAT_MODE_P (cmp_mode)) + { + gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode)); + + /* We may be reversing unordered compare to normal compare, + that is not valid in general (we may convert non-trapping + condition to trapping one), however on i386 we currently + emit all comparisons unordered. */ + code = reverse_condition_maybe_unordered (code); + } else { code = reverse_condition (code); @@ -11361,7 +12183,7 @@ ix86_expand_int_movcc (rtx operands[]) /* If one of the two operands is an interesting constant, load a constant with the above and mask it in with a logical operation. 
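   (Editorial sketch, not part of this change: the underlying branchless
   pattern used throughout this function is

     mask = -(int) cond;                // 0 or -1, e.g. via sbb
     result = cf + (mask & (ct - cf));  // cond ? ct : cf

   and in the case handled here one of ct/cf is the variable operand,
   so the remaining constant is loaded and masked in with AND or OR.)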
*/ - if (GET_CODE (operands[2]) == CONST_INT) + if (CONST_INT_P (operands[2])) { var = operands[3]; if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx) @@ -11371,7 +12193,7 @@ ix86_expand_int_movcc (rtx operands[]) else return 0; /* FAIL */ } - else if (GET_CODE (operands[3]) == CONST_INT) + else if (CONST_INT_P (operands[3])) { var = operands[2]; if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx) @@ -11886,11 +12708,11 @@ ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p) case V4SImode: if (high_p) unpack = gen_vec_interleave_highv4si; - else + else unpack = gen_vec_interleave_lowv4si; break; default: - gcc_unreachable (); + gcc_unreachable (); } dest = gen_lowpart (imode, operands[0]); @@ -12003,19 +12825,19 @@ ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode) else size = (GET_MODE_SIZE (mode) + 4) / 8; - gcc_assert (GET_CODE (operand) != REG || !MMX_REGNO_P (REGNO (operand))); + gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand))); gcc_assert (size >= 2 && size <= 3); /* Optimize constant pool reference to immediates. This is used by fp moves, that force all constants to memory to allow combining. */ - if (GET_CODE (operand) == MEM && MEM_READONLY_P (operand)) + if (MEM_P (operand) && MEM_READONLY_P (operand)) { rtx tmp = maybe_get_pool_constant (operand); if (tmp) operand = tmp; } - if (GET_CODE (operand) == MEM && !offsettable_memref_p (operand)) + if (MEM_P (operand) && !offsettable_memref_p (operand)) { /* The only non-offsetable memories we handle are pushes. */ int ok = push_operand (operand, VOIDmode); @@ -12164,7 +12986,7 @@ ix86_split_long_move (rtx operands[]) /* Optimize constant pool reference to immediates. This is used by fp moves, that force all constants to memory to allow combining. */ - if (GET_CODE (operands[1]) == MEM + if (MEM_P (operands[1]) && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0))) operands[1] = get_pool_constant (XEXP (operands[1], 0)); @@ -12184,14 +13006,14 @@ ix86_split_long_move (rtx operands[]) if (push_operand (operands[0], VOIDmode)) push = 1; else - gcc_assert (GET_CODE (operands[0]) != MEM + gcc_assert (!MEM_P (operands[0]) || offsettable_memref_p (operands[0])); nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0])); ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0])); /* When emitting push, take care for source operands on the stack. */ - if (push && GET_CODE (operands[1]) == MEM + if (push && MEM_P (operands[1]) && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1])) { if (nparts == 3) @@ -12203,7 +13025,7 @@ ix86_split_long_move (rtx operands[]) /* We need to do copy in the right order in case an address register of the source overlaps the destination. */ - if (REG_P (part[0][0]) && GET_CODE (part[1][0]) == MEM) + if (REG_P (part[0][0]) && MEM_P (part[1][0])) { if (reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))) collisions++; @@ -12338,25 +13160,25 @@ ix86_split_long_move (rtx operands[]) /* If optimizing for size, attempt to locally unCSE nonzero constants. 
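   (Editorial example, not part of this change: after a 64-bit move is
   split into word-sized parts, two identical immediate loads such as

     movl $42, %eax ; movl $42, %edx

   can reuse the register written by the first instead,

     movl $42, %eax ; movl %eax, %edx

   avoiding a second copy of the 4-byte immediate.)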
*/ if (optimize_size) { - if (GET_CODE (operands[5]) == CONST_INT + if (CONST_INT_P (operands[5]) && operands[5] != const0_rtx && REG_P (operands[2])) { - if (GET_CODE (operands[6]) == CONST_INT + if (CONST_INT_P (operands[6]) && INTVAL (operands[6]) == INTVAL (operands[5])) operands[6] = operands[2]; if (nparts == 3 - && GET_CODE (operands[7]) == CONST_INT + && CONST_INT_P (operands[7]) && INTVAL (operands[7]) == INTVAL (operands[5])) operands[7] = operands[2]; } if (nparts == 3 - && GET_CODE (operands[6]) == CONST_INT + && CONST_INT_P (operands[6]) && operands[6] != const0_rtx && REG_P (operands[3]) - && GET_CODE (operands[7]) == CONST_INT + && CONST_INT_P (operands[7]) && INTVAL (operands[7]) == INTVAL (operands[6])) operands[7] = operands[3]; } @@ -12406,7 +13228,7 @@ ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode) int count; const int single_width = mode == DImode ? 32 : 64; - if (GET_CODE (operands[2]) == CONST_INT + if (CONST_INT_P (operands[2])) { (mode == DImode ? split_di : split_ti) (operands, 2, low, high); count = INTVAL (operands[2]) & (single_width * 2 - 1); @@ -12533,7 +13355,7 @@ ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode) int count; const int single_width = mode == DImode ? 32 : 64; - if (GET_CODE (operands[2]) == CONST_INT + if (CONST_INT_P (operands[2])) { (mode == DImode ? split_di : split_ti) (operands, 2, low, high); count = INTVAL (operands[2]) & (single_width * 2 - 1); @@ -12612,7 +13434,7 @@ ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode) int count; const int single_width = mode == DImode ? 32 : 64; - if (GET_CODE (operands[2]) == CONST_INT + if (CONST_INT_P (operands[2])) { (mode == DImode ? split_di : split_ti) (operands, 2, low, high); count = INTVAL (operands[2]) & (single_width * 2 - 1); @@ -12668,10 +13490,22 @@ ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode) } } +/* Predict just emitted jump instruction to be taken with probability PROB. */ +static void +predict_jump (int prob) +{ + rtx insn = get_last_insn (); + gcc_assert (JUMP_P (insn)); + REG_NOTES (insn) + = gen_rtx_EXPR_LIST (REG_BR_PROB, + GEN_INT (prob), + REG_NOTES (insn)); +} + /* Helper function for the string operations below. Test VARIABLE whether it is aligned to VALUE bytes. If true, jump to the label. */ static rtx -ix86_expand_aligntest (rtx variable, int value) +ix86_expand_aligntest (rtx variable, int value, bool epilogue) { rtx label = gen_label_rtx (); rtx tmpcount = gen_reg_rtx (GET_MODE (variable)); @@ -12681,6 +13515,10 @@ ix86_expand_aligntest (rtx variable, int value) emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value))); emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable), 1, label); + if (epilogue) + predict_jump (REG_BR_PROB_BASE * 50 / 100); + else + predict_jump (REG_BR_PROB_BASE * 90 / 100); return label; } @@ -12708,653 +13546,1302 @@ ix86_zero_extend_to_Pmode (rtx exp) return r; } -/* Expand string move (memcpy) operation. Use i386 string operations when - profitable. expand_clrmem contains similar code. */ -int -ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp) +/* Divide COUNTREG by SCALE. 
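+   SCALE is a power of two, so the division is a logical shift right;
+   a byte count moved in SImode chunks, for instance, becomes
+   count >> 2.  C sketch of the intent (editorial, not part of the
+   patch):
+
+     unsigned long scale_counter_c (unsigned long count, int scale)
+     {
+       return scale == 1 ? count : count >> __builtin_ctz (scale);
+     }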
*/ +static rtx +scale_counter (rtx countreg, int scale) { - rtx srcreg, destreg, countreg, srcexp, destexp; - enum machine_mode counter_mode; - HOST_WIDE_INT align = 0; - unsigned HOST_WIDE_INT count = 0; + rtx sc; + rtx piece_size_mask; - if (GET_CODE (align_exp) == CONST_INT) - align = INTVAL (align_exp); + if (scale == 1) + return countreg; + if (CONST_INT_P (countreg)) + return GEN_INT (INTVAL (countreg) / scale); + gcc_assert (REG_P (countreg)); - /* Can't use any of this if the user has appropriated esi or edi. */ - if (global_regs[4] || global_regs[5]) - return 0; + piece_size_mask = GEN_INT (scale - 1); + sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg, + GEN_INT (exact_log2 (scale)), + NULL, 1, OPTAB_DIRECT); + return sc; +} - /* This simple hack avoids all inlining code and simplifies code below. */ - if (!TARGET_ALIGN_STRINGOPS) - align = 64; +/* Return mode for the memcpy/memset loop counter. Prefer SImode over + DImode for constant loop counts. */ + +static enum machine_mode +counter_mode (rtx count_exp) +{ + if (GET_MODE (count_exp) != VOIDmode) + return GET_MODE (count_exp); + if (GET_CODE (count_exp) != CONST_INT) + return Pmode; + if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff)) + return DImode; + return SImode; +} + +/* When SRCPTR is non-NULL, output simple loop to move memory + pointed to by SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times, + overall size is COUNT specified in bytes. When SRCPTR is NULL, output the + equivalent loop to set memory by VALUE (supposed to be in MODE). + + The size is rounded down to whole number of chunk size moved at once. + SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */ + + +static void +expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, rtx value, + rtx count, enum machine_mode mode, int unroll, + int expected_size) +{ + rtx out_label, top_label, iter, tmp; + enum machine_mode iter_mode = counter_mode (count); + rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll); + rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1)); + rtx size; + rtx x_addr; + rtx y_addr; + int i; + + top_label = gen_label_rtx (); + out_label = gen_label_rtx (); + iter = gen_reg_rtx (iter_mode); - if (GET_CODE (count_exp) == CONST_INT + size = expand_simple_binop (iter_mode, AND, count, piece_size_mask, + NULL, 1, OPTAB_DIRECT); + /* Those two should combine. */ + if (piece_size == const1_rtx) { - count = INTVAL (count_exp); - if (!TARGET_INLINE_ALL_STRINGOPS && count > 64) - return 0; + emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode, + true, out_label); + predict_jump (REG_BR_PROB_BASE * 10 / 100); } + emit_move_insn (iter, const0_rtx); - /* Figure out proper mode for counter. For 32bits it is always SImode, - for 64bits use SImode when possible, otherwise DImode. - Set count to number of bytes copied when known at compile time. 
*/ - if (!TARGET_64BIT - || GET_MODE (count_exp) == SImode - || x86_64_zext_immediate_operand (count_exp, VOIDmode)) - counter_mode = SImode; - else - counter_mode = DImode; + emit_label (top_label); - gcc_assert (counter_mode == SImode || counter_mode == DImode); + tmp = convert_modes (Pmode, iter_mode, iter, true); + x_addr = gen_rtx_PLUS (Pmode, destptr, tmp); + destmem = change_address (destmem, mode, x_addr); - destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0)); - if (destreg != XEXP (dst, 0)) - dst = replace_equiv_address_nv (dst, destreg); - srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0)); - if (srcreg != XEXP (src, 0)) - src = replace_equiv_address_nv (src, srcreg); - - /* When optimizing for size emit simple rep ; movsb instruction for - counts not divisible by 4, except when (movsl;)*(movsw;)?(movsb;)? - sequence is shorter than mov{b,l} $count, %{ecx,cl}; rep; movsb. - Sice of (movsl;)*(movsw;)?(movsb;)? sequence is - count / 4 + (count & 3), the other sequence is either 4 or 7 bytes, - but we don't know whether upper 24 (resp. 56) bits of %ecx will be - known to be zero or not. The rep; movsb sequence causes higher - register pressure though, so take that into account. */ - - if ((!optimize || optimize_size) - && (count == 0 - || ((count & 0x03) - && (!optimize_size - || count > 5 * 4 - || (count & 3) + count / 4 > 6)))) - { - emit_insn (gen_cld ()); - countreg = ix86_zero_extend_to_Pmode (count_exp); - destexp = gen_rtx_PLUS (Pmode, destreg, countreg); - srcexp = gen_rtx_PLUS (Pmode, srcreg, countreg); - emit_insn (gen_rep_mov (destreg, dst, srcreg, src, countreg, - destexp, srcexp)); - } - - /* For constant aligned (or small unaligned) copies use rep movsl - followed by code copying the rest. For PentiumPro ensure 8 byte - alignment to allow rep movsl acceleration. */ - - else if (count != 0 - && (align >= 8 - || (!TARGET_PENTIUMPRO && !TARGET_64BIT && align >= 4) - || optimize_size || count < (unsigned int) 64)) - { - unsigned HOST_WIDE_INT offset = 0; - int size = TARGET_64BIT && !optimize_size ? 8 : 4; - rtx srcmem, dstmem; - - emit_insn (gen_cld ()); - if (count & ~(size - 1)) - { - if ((TARGET_SINGLE_STRINGOP || optimize_size) && count < 5 * 4) - { - enum machine_mode movs_mode = size == 4 ? SImode : DImode; + if (srcmem) + { + y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp)); + srcmem = change_address (srcmem, mode, y_addr); - while (offset < (count & ~(size - 1))) + /* When unrolling for chips that reorder memory reads and writes, + we can save registers by using single temporary. + Also using 4 temporaries is overkill in 32bit mode. 
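+
+   Editorial illustration (not part of the patch) of the two body
+   shapes for unroll == 2, in C-like pseudocode:
+
+     // interleaved, single temporary    // load all, then store all
+     t = src[0]; dst[0] = t;             t0 = src[0]; t1 = src[1];
+     t = src[1]; dst[1] = t;             dst[0] = t0; dst[1] = t1;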
*/ + if (!TARGET_64BIT && 0) + { + for (i = 0; i < unroll; i++) + { + if (i) + { + destmem = + adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode)); + srcmem = + adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode)); + } + emit_move_insn (destmem, srcmem); + } + } + else + { + rtx tmpreg[4]; + gcc_assert (unroll <= 4); + for (i = 0; i < unroll; i++) + { + tmpreg[i] = gen_reg_rtx (mode); + if (i) + { + srcmem = + adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode)); + } + emit_move_insn (tmpreg[i], srcmem); + } + for (i = 0; i < unroll; i++) + { + if (i) { - srcmem = adjust_automodify_address_nv (src, movs_mode, - srcreg, offset); - dstmem = adjust_automodify_address_nv (dst, movs_mode, - destreg, offset); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); - offset += size; + destmem = + adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode)); } + emit_move_insn (destmem, tmpreg[i]); + } + } + } + else + for (i = 0; i < unroll; i++) + { + if (i) + destmem = + adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode)); + emit_move_insn (destmem, value); + } + + tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter, + true, OPTAB_LIB_WIDEN); + if (tmp != iter) + emit_move_insn (iter, tmp); + + emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode, + true, top_label); + if (expected_size != -1) + { + expected_size /= GET_MODE_SIZE (mode) * unroll; + if (expected_size == 0) + predict_jump (0); + else if (expected_size > REG_BR_PROB_BASE) + predict_jump (REG_BR_PROB_BASE - 1); + else + predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size); + } + else + predict_jump (REG_BR_PROB_BASE * 80 / 100); + iter = ix86_zero_extend_to_Pmode (iter); + tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr, + true, OPTAB_LIB_WIDEN); + if (tmp != destptr) + emit_move_insn (destptr, tmp); + if (srcptr) + { + tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr, + true, OPTAB_LIB_WIDEN); + if (tmp != srcptr) + emit_move_insn (srcptr, tmp); + } + emit_label (out_label); +} + +/* Output "rep; mov" instruction. + Arguments have same meaning as for previous function */ +static void +expand_movmem_via_rep_mov (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, + rtx count, + enum machine_mode mode) +{ + rtx destexp; + rtx srcexp; + rtx countreg; + + /* If the size is known, it is shorter to use rep movs. */ + if (mode == QImode && CONST_INT_P (count) + && !(INTVAL (count) & 3)) + mode = SImode; + + if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode) + destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0); + if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode) + srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0); + countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode))); + if (mode != QImode) + { + destexp = gen_rtx_ASHIFT (Pmode, countreg, + GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); + destexp = gen_rtx_PLUS (Pmode, destexp, destptr); + srcexp = gen_rtx_ASHIFT (Pmode, countreg, + GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); + srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr); + } + else + { + destexp = gen_rtx_PLUS (Pmode, destptr, countreg); + srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg); + } + emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg, + destexp, srcexp)); +} + +/* Output "rep; stos" instruction. 
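+   It stores the low part of VALUE COUNT times, advancing the
+   destination pointer by the operand size each iteration; in C terms
+   (editorial sketch, not part of the patch):
+
+     while (count--)
+       *dest++ = value;
+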
+ Arguments have same meaning as for previous function */ +static void +expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value, + rtx count, + enum machine_mode mode) +{ + rtx destexp; + rtx countreg; + + if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode) + destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0); + value = force_reg (mode, gen_lowpart (mode, value)); + countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode))); + if (mode != QImode) + { + destexp = gen_rtx_ASHIFT (Pmode, countreg, + GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); + destexp = gen_rtx_PLUS (Pmode, destexp, destptr); + } + else + destexp = gen_rtx_PLUS (Pmode, destptr, countreg); + emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp)); +} + +static void +emit_strmov (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, enum machine_mode mode, int offset) +{ + rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset); + rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset); + emit_insn (gen_strmov (destptr, dest, srcptr, src)); +} + +/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */ +static void +expand_movmem_epilogue (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, rtx count, int max_size) +{ + rtx src, dest; + if (CONST_INT_P (count)) + { + HOST_WIDE_INT countval = INTVAL (count); + int offset = 0; + + if ((countval & 0x10) && max_size > 16) + { + if (TARGET_64BIT) + { + emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset); + emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8); } else + gcc_unreachable (); + offset += 16; + } + if ((countval & 0x08) && max_size > 8) + { + if (TARGET_64BIT) + emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset); + else { - countreg = GEN_INT ((count >> (size == 4 ? 2 : 3)) - & (TARGET_64BIT ? -1 : 0x3fffffff)); - countreg = copy_to_mode_reg (counter_mode, countreg); - countreg = ix86_zero_extend_to_Pmode (countreg); - - destexp = gen_rtx_ASHIFT (Pmode, countreg, - GEN_INT (size == 4 ? 
2 : 3)); - srcexp = gen_rtx_PLUS (Pmode, destexp, srcreg); - destexp = gen_rtx_PLUS (Pmode, destexp, destreg); - - emit_insn (gen_rep_mov (destreg, dst, srcreg, src, - countreg, destexp, srcexp)); - offset = count & ~(size - 1); + emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset); + emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4); } + offset += 8; } - if (size == 8 && (count & 0x04)) + if ((countval & 0x04) && max_size > 4) { - srcmem = adjust_automodify_address_nv (src, SImode, srcreg, - offset); - dstmem = adjust_automodify_address_nv (dst, SImode, destreg, - offset); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); + emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset); offset += 4; } - if (count & 0x02) + if ((countval & 0x02) && max_size > 2) { - srcmem = adjust_automodify_address_nv (src, HImode, srcreg, - offset); - dstmem = adjust_automodify_address_nv (dst, HImode, destreg, - offset); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); + emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset); offset += 2; } - if (count & 0x01) + if ((countval & 0x01) && max_size > 1) { - srcmem = adjust_automodify_address_nv (src, QImode, srcreg, - offset); - dstmem = adjust_automodify_address_nv (dst, QImode, destreg, - offset); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); + emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset); + offset += 1; } + return; } - /* The generic code based on the glibc implementation: - - align destination to 4 bytes (8 byte alignment is used for PentiumPro - allowing accelerated copying there) - - copy the data using rep movsl - - copy the rest. */ - else + if (max_size > 8) { - rtx countreg2; - rtx label = NULL; - rtx srcmem, dstmem; - int desired_alignment = (TARGET_PENTIUMPRO - && (count == 0 || count >= (unsigned int) 260) - ? 8 : UNITS_PER_WORD); - /* Get rid of MEM_OFFSETs, they won't be accurate. */ - dst = change_address (dst, BLKmode, destreg); - src = change_address (src, BLKmode, srcreg); - - /* In case we don't know anything about the alignment, default to - library version, since it is usually equally fast and result in - shorter code. - - Also emit call when we know that the count is large and call overhead - will not be important. */ - if (!TARGET_INLINE_ALL_STRINGOPS - && (align < UNITS_PER_WORD || !TARGET_REP_MOVL_OPTIMAL)) - return 0; - - if (TARGET_SINGLE_STRINGOP) - emit_insn (gen_cld ()); - - countreg2 = gen_reg_rtx (Pmode); - countreg = copy_to_mode_reg (counter_mode, count_exp); - - /* We don't use loops to align destination and to copy parts smaller - than 4 bytes, because gcc is able to optimize such code better (in - the case the destination or the count really is aligned, gcc is often - able to predict the branches) and also it is friendlier to the - hardware branch prediction. - - Using loops is beneficial for generic case, because we can - handle small counts using the loops. Many CPUs (such as Athlon) - have large REP prefix setup costs. - - This is quite costly. Maybe we can revisit this decision later or - add some customizability to this code. */ + count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1), + count, 1, OPTAB_DIRECT); + expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL, + count, QImode, 1, 4); + return; + } - if (count == 0 && align < desired_alignment) + /* When there are stringops, we can cheaply increase dest and src pointers. 
+ Otherwise we save code size by maintaining offset (zero is readily + available from preceding rep operation) and using x86 addressing modes. + */ + if (TARGET_SINGLE_STRINGOP) + { + if (max_size > 4) { - label = gen_label_rtx (); - emit_cmp_and_jump_insns (countreg, GEN_INT (desired_alignment - 1), - LEU, 0, counter_mode, 1, label); + rtx label = ix86_expand_aligntest (count, 4, true); + src = change_address (srcmem, SImode, srcptr); + dest = change_address (destmem, SImode, destptr); + emit_insn (gen_strmov (destptr, dest, srcptr, src)); + emit_label (label); + LABEL_NUSES (label) = 1; } - if (align <= 1) + if (max_size > 2) { - rtx label = ix86_expand_aligntest (destreg, 1); - srcmem = change_address (src, QImode, srcreg); - dstmem = change_address (dst, QImode, destreg); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); - ix86_adjust_counter (countreg, 1); + rtx label = ix86_expand_aligntest (count, 2, true); + src = change_address (srcmem, HImode, srcptr); + dest = change_address (destmem, HImode, destptr); + emit_insn (gen_strmov (destptr, dest, srcptr, src)); emit_label (label); LABEL_NUSES (label) = 1; } - if (align <= 2) + if (max_size > 1) { - rtx label = ix86_expand_aligntest (destreg, 2); - srcmem = change_address (src, HImode, srcreg); - dstmem = change_address (dst, HImode, destreg); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); - ix86_adjust_counter (countreg, 2); + rtx label = ix86_expand_aligntest (count, 1, true); + src = change_address (srcmem, QImode, srcptr); + dest = change_address (destmem, QImode, destptr); + emit_insn (gen_strmov (destptr, dest, srcptr, src)); emit_label (label); LABEL_NUSES (label) = 1; } - if (align <= 4 && desired_alignment > 4) + } + else + { + rtx offset = force_reg (Pmode, const0_rtx); + rtx tmp; + + if (max_size > 4) { - rtx label = ix86_expand_aligntest (destreg, 4); - srcmem = change_address (src, SImode, srcreg); - dstmem = change_address (dst, SImode, destreg); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); - ix86_adjust_counter (countreg, 4); + rtx label = ix86_expand_aligntest (count, 4, true); + src = change_address (srcmem, SImode, srcptr); + dest = change_address (destmem, SImode, destptr); + emit_move_insn (dest, src); + tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL, + true, OPTAB_LIB_WIDEN); + if (tmp != offset) + emit_move_insn (offset, tmp); emit_label (label); LABEL_NUSES (label) = 1; } - - if (label && desired_alignment > 4 && !TARGET_64BIT) + if (max_size > 2) + { + rtx label = ix86_expand_aligntest (count, 2, true); + tmp = gen_rtx_PLUS (Pmode, srcptr, offset); + src = change_address (srcmem, HImode, tmp); + tmp = gen_rtx_PLUS (Pmode, destptr, offset); + dest = change_address (destmem, HImode, tmp); + emit_move_insn (dest, src); + tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp, + true, OPTAB_LIB_WIDEN); + if (tmp != offset) + emit_move_insn (offset, tmp); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 1) { + rtx label = ix86_expand_aligntest (count, 1, true); + tmp = gen_rtx_PLUS (Pmode, srcptr, offset); + src = change_address (srcmem, QImode, tmp); + tmp = gen_rtx_PLUS (Pmode, destptr, offset); + dest = change_address (destmem, QImode, tmp); + emit_move_insn (dest, src); emit_label (label); LABEL_NUSES (label) = 1; - label = NULL_RTX; } - if (!TARGET_SINGLE_STRINGOP) - emit_insn (gen_cld ()); - if (TARGET_64BIT) + } +} + +/* Output code to set at most count & (max_size - 1) bytes starting by DEST. 
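+   Editorial example (not part of the patch): for count == 23 and
+   max_size == 16 only the low four bits of the count matter, and the
+   remaining 7 bytes are peeled one power of two at a time, roughly:
+
+     if (count & 4) { *(uint32_t *) d = v; d += 4; }
+     if (count & 2) { *(uint16_t *) d = v; d += 2; }
+     if (count & 1) { *(uint8_t *) d = v; d += 1; }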
*/ +static void +expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value, + rtx count, int max_size) +{ + count = + expand_simple_binop (counter_mode (count), AND, count, + GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT); + expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL, + gen_lowpart (QImode, value), count, QImode, + 1, max_size / 2); +} + +/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */ +static void +expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size) +{ + rtx dest; + + if (CONST_INT_P (count)) + { + HOST_WIDE_INT countval = INTVAL (count); + int offset = 0; + + if ((countval & 0x10) && max_size > 16) { - emit_insn (gen_lshrdi3 (countreg2, ix86_zero_extend_to_Pmode (countreg), - GEN_INT (3))); - destexp = gen_rtx_ASHIFT (Pmode, countreg2, GEN_INT (3)); + if (TARGET_64BIT) + { + dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset); + emit_insn (gen_strset (destptr, dest, value)); + dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8); + emit_insn (gen_strset (destptr, dest, value)); + } + else + gcc_unreachable (); + offset += 16; } - else + if ((countval & 0x08) && max_size > 8) { - emit_insn (gen_lshrsi3 (countreg2, countreg, const2_rtx)); - destexp = gen_rtx_ASHIFT (Pmode, countreg2, const2_rtx); + if (TARGET_64BIT) + { + dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset); + emit_insn (gen_strset (destptr, dest, value)); + } + else + { + dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset); + emit_insn (gen_strset (destptr, dest, value)); + dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4); + emit_insn (gen_strset (destptr, dest, value)); + } + offset += 8; } - srcexp = gen_rtx_PLUS (Pmode, destexp, srcreg); - destexp = gen_rtx_PLUS (Pmode, destexp, destreg); - emit_insn (gen_rep_mov (destreg, dst, srcreg, src, - countreg2, destexp, srcexp)); - - if (label) + if ((countval & 0x04) && max_size > 4) { - emit_label (label); - LABEL_NUSES (label) = 1; + dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset); + emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value))); + offset += 4; } - if (TARGET_64BIT && align > 4 && count != 0 && (count & 4)) + if ((countval & 0x02) && max_size > 2) { - srcmem = change_address (src, SImode, srcreg); - dstmem = change_address (dst, SImode, destreg); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); + dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset); + emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value))); + offset += 2; } - if ((align <= 4 || count == 0) && TARGET_64BIT) + if ((countval & 0x01) && max_size > 1) { - rtx label = ix86_expand_aligntest (countreg, 4); - srcmem = change_address (src, SImode, srcreg); - dstmem = change_address (dst, SImode, destreg); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); - emit_label (label); - LABEL_NUSES (label) = 1; + dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset); + emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value))); + offset += 1; } - if (align > 2 && count != 0 && (count & 2)) + return; + } + if (max_size > 32) + { + expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size); + return; + } + if (max_size > 16) + { + rtx label = ix86_expand_aligntest (count, 16, true); + if (TARGET_64BIT) { - srcmem = change_address (src, HImode, srcreg); - dstmem = change_address (dst, HImode, 
destreg); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); + dest = change_address (destmem, DImode, destptr); + emit_insn (gen_strset (destptr, dest, value)); + emit_insn (gen_strset (destptr, dest, value)); } - if (align <= 2 || count == 0) + else { - rtx label = ix86_expand_aligntest (countreg, 2); - srcmem = change_address (src, HImode, srcreg); - dstmem = change_address (dst, HImode, destreg); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); - emit_label (label); - LABEL_NUSES (label) = 1; + dest = change_address (destmem, SImode, destptr); + emit_insn (gen_strset (destptr, dest, value)); + emit_insn (gen_strset (destptr, dest, value)); + emit_insn (gen_strset (destptr, dest, value)); + emit_insn (gen_strset (destptr, dest, value)); } - if (align > 1 && count != 0 && (count & 1)) + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 8) + { + rtx label = ix86_expand_aligntest (count, 8, true); + if (TARGET_64BIT) { - srcmem = change_address (src, QImode, srcreg); - dstmem = change_address (dst, QImode, destreg); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); + dest = change_address (destmem, DImode, destptr); + emit_insn (gen_strset (destptr, dest, value)); } - if (align <= 1 || count == 0) + else { - rtx label = ix86_expand_aligntest (countreg, 1); - srcmem = change_address (src, QImode, srcreg); - dstmem = change_address (dst, QImode, destreg); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); - emit_label (label); - LABEL_NUSES (label) = 1; + dest = change_address (destmem, SImode, destptr); + emit_insn (gen_strset (destptr, dest, value)); + emit_insn (gen_strset (destptr, dest, value)); } + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 4) + { + rtx label = ix86_expand_aligntest (count, 4, true); + dest = change_address (destmem, SImode, destptr); + emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 2) + { + rtx label = ix86_expand_aligntest (count, 2, true); + dest = change_address (destmem, HImode, destptr); + emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 1) + { + rtx label = ix86_expand_aligntest (count, 1, true); + dest = change_address (destmem, QImode, destptr); + emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value))); + emit_label (label); + LABEL_NUSES (label) = 1; + } +} - return 1; +/* Copy enough from SRC to DEST to align DEST known to be aligned by ALIGN to + DESIRED_ALIGNMENT. 
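+   Editorial sketch (not part of the patch) of the peeling pattern for
+   desired_alignment == 8, with d the destination pointer:
+
+     if (align <= 1 && ((uintptr_t) d & 1)) { copy 1 byte;  count -= 1; }
+     if (align <= 2 && ((uintptr_t) d & 2)) { copy 2 bytes; count -= 2; }
+     if (align <= 4 && ((uintptr_t) d & 4)) { copy 4 bytes; count -= 4; }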
+
+/* Copy enough bytes from SRC to DEST to align DEST, known to be aligned
+   by ALIGN, to DESIRED_ALIGNMENT.  */
+static void
+expand_movmem_prologue (rtx destmem, rtx srcmem,
+			rtx destptr, rtx srcptr, rtx count,
+			int align, int desired_alignment)
+{
+  if (align <= 1 && desired_alignment > 1)
+    {
+      rtx label = ix86_expand_aligntest (destptr, 1, false);
+      srcmem = change_address (srcmem, QImode, srcptr);
+      destmem = change_address (destmem, QImode, destptr);
+      emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
+      ix86_adjust_counter (count, 1);
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+  if (align <= 2 && desired_alignment > 2)
+    {
+      rtx label = ix86_expand_aligntest (destptr, 2, false);
+      srcmem = change_address (srcmem, HImode, srcptr);
+      destmem = change_address (destmem, HImode, destptr);
+      emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
+      ix86_adjust_counter (count, 2);
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+  if (align <= 4 && desired_alignment > 4)
+    {
+      rtx label = ix86_expand_aligntest (destptr, 4, false);
+      srcmem = change_address (srcmem, SImode, srcptr);
+      destmem = change_address (destmem, SImode, destptr);
+      emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
+      ix86_adjust_counter (count, 4);
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+  gcc_assert (desired_alignment <= 8);
 }

-/* Expand string clear operation (bzero).  Use i386 string operations when
-   profitable.  expand_movmem contains similar code.  */
+/* Set enough bytes at DEST to align DEST, known to be aligned by ALIGN,
+   to DESIRED_ALIGNMENT.  */
+static void
+expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
+			int align, int desired_alignment)
+{
+  if (align <= 1 && desired_alignment > 1)
+    {
+      rtx label = ix86_expand_aligntest (destptr, 1, false);
+      destmem = change_address (destmem, QImode, destptr);
+      emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
+      ix86_adjust_counter (count, 1);
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+  if (align <= 2 && desired_alignment > 2)
+    {
+      rtx label = ix86_expand_aligntest (destptr, 2, false);
+      destmem = change_address (destmem, HImode, destptr);
+      emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
+      ix86_adjust_counter (count, 2);
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+  if (align <= 4 && desired_alignment > 4)
+    {
+      rtx label = ix86_expand_aligntest (destptr, 4, false);
+      destmem = change_address (destmem, SImode, destptr);
+      emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
+      ix86_adjust_counter (count, 4);
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+  gcc_assert (desired_alignment <= 8);
+}
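
[Editorial sketch, not part of the patch: both prologues above do the same job for their respective operations.  Roughly, in C, with a plain byte loop standing in for the guarded 1/2/4-byte moves; the name is an assumption.]

#include <stddef.h>
#include <stdint.h>

/* Illustrative only: copy single bytes until dst reaches desired_align,
   decrementing the remaining count as ix86_adjust_counter does.  The
   step 1 guard emitted later is what makes the decrement safe.  */
static void
align_dst_prologue (unsigned char **dst, const unsigned char **src,
                    size_t *count, size_t desired_align)
{
  while (((uintptr_t) *dst & (desired_align - 1)) != 0)
    {
      *(*dst)++ = *(*src)++;
      --*count;
    }
}
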
+
+/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
+static enum stringop_alg
+decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
+	    int *dynamic_check)
+{
+  const struct stringop_algs * algs;
+
+  *dynamic_check = -1;
+  if (memset)
+    algs = &ix86_cost->memset[TARGET_64BIT != 0];
+  else
+    algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
+  if (stringop_alg != no_stringop)
+    return stringop_alg;
+  /* rep; movq or rep; movl is the smallest variant.  */
+  else if (optimize_size)
+    {
+      if (!count || (count & 3))
+	return rep_prefix_1_byte;
+      else
+	return rep_prefix_4_byte;
+    }
+  /* Very tiny blocks are best handled via the loop; REP is expensive to
+     set up.  */
+  else if (expected_size != -1 && expected_size < 4)
+    return loop_1_byte;
+  else if (expected_size != -1)
+    {
+      unsigned int i;
+      enum stringop_alg alg = libcall;
+      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
+	{
+	  gcc_assert (algs->size[i].max);
+	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
+	    {
+	      if (algs->size[i].alg != libcall)
+		alg = algs->size[i].alg;
+	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
+		 last non-libcall inline algorithm.  */
+	      if (TARGET_INLINE_ALL_STRINGOPS)
+		{
+		  /* When the current size is best to be copied by a libcall,
+		     but we are still forced to inline, run the heuristic below
+		     that will pick code for medium sized blocks.  */
+		  if (alg != libcall)
+		    return alg;
+		  break;
+		}
+	      else
+		return algs->size[i].alg;
+	    }
+	}
+      gcc_assert (TARGET_INLINE_ALL_STRINGOPS);
+    }
+  /* When asked to inline the call anyway, try to pick a meaningful choice.
+     We look for the maximal size of a block that is faster to copy by hand
+     and take blocks of at most that size, guessing that the average size
+     will be roughly half of the block.
+
+     If this turns out to be bad, we might simply specify the preferred
+     choice in ix86_costs.  */
+  if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
+      && algs->unknown_size == libcall)
+    {
+      int max = -1;
+      enum stringop_alg alg;
+      int i;
+
+      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
+	if (algs->size[i].alg != libcall && algs->size[i].alg)
+	  max = algs->size[i].max;
+      if (max == -1)
+	max = 4096;
+      alg = decide_alg (count, max / 2, memset, dynamic_check);
+      gcc_assert (*dynamic_check == -1);
+      gcc_assert (alg != libcall);
+      if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
+	*dynamic_check = max;
+      return alg;
+    }
+  return algs->unknown_size;
+}
+
+/* Decide on alignment.  We know that the operand is already aligned to ALIGN
+   (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
+static int
+decide_alignment (int align,
+		  enum stringop_alg alg,
+		  int expected_size)
+{
+  int desired_align = 0;
+  switch (alg)
+    {
+    case no_stringop:
+      gcc_unreachable ();
+    case loop:
+    case unrolled_loop:
+      desired_align = GET_MODE_SIZE (Pmode);
+      break;
+    case rep_prefix_8_byte:
+      desired_align = 8;
+      break;
+    case rep_prefix_4_byte:
+      /* PentiumPro has special logic triggering for 8 byte aligned blocks,
+	 copying whole cacheline at once.  */
+      if (TARGET_PENTIUMPRO)
+	desired_align = 8;
+      else
+	desired_align = 4;
+      break;
+    case rep_prefix_1_byte:
+      /* PentiumPro has special logic triggering for 8 byte aligned blocks,
+	 copying whole cacheline at once.  */
+      if (TARGET_PENTIUMPRO)
+	desired_align = 8;
+      else
+	desired_align = 1;
+      break;
+    case loop_1_byte:
+      desired_align = 1;
+      break;
+    case libcall:
+      return 0;
+    }
+
+  if (optimize_size)
+    desired_align = 1;
+  if (desired_align < align)
+    desired_align = align;
+  if (expected_size != -1 && expected_size < 4)
+    desired_align = align;
+  return desired_align;
+}
+
+/* Return the smallest power of 2 greater than VAL.  */
+static int
+smallest_pow2_greater_than (int val)
+{
+  int ret = 1;
+  while (ret <= val)
+    ret <<= 1;
+  return ret;
+}
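
[Editorial sketch, not part of the patch: the size tables decide_alg walks are the per-CPU arrays added to processor_costs earlier in this change; each entry pairs an upper size bound with an algorithm, and -1 terminates the list.  A self-contained C model of that lookup — the struct and enum names are simplified stand-ins for stringop_algs, not GCC's own.]

enum alg { alg_libcall, alg_loop, alg_unrolled_loop, alg_rep_1, alg_rep_4 };
struct size_alg { long max; enum alg a; };   /* max == -1: catch-all */

/* Illustrative only: the first entry whose bound covers the expected
   size wins, exactly like the loop over algs->size[] above.  */
static enum alg
pick_alg (const struct size_alg *table, long expected_size)
{
  for (;; table++)
    if (table->max == -1 || table->max >= expected_size)
      return table->a;
}

[With the K8 32-bit memcpy table {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}, an expected size of 10 would pick the unrolled loop.]
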
+
+/* Expand string move (memcpy) operation.  Use i386 string operations when
+   profitable.  ix86_expand_setmem contains similar code.  The code depends
+   upon architecture, block size and alignment, but always has the same
+   overall structure:
+
+   1) Prologue guard: Conditional that jumps up to epilogues for small
+      blocks that can be handled by epilogue alone.  This is faster, but
+      also needed for correctness, since the prologue assumes the block
+      is larger than the desired alignment.
+
+      An optional dynamic size check and a libcall for large
+      blocks are emitted here too, with -minline-stringops-dynamically.
+
+   2) Prologue: copy first few bytes in order to get destination aligned
+      to DESIRED_ALIGN.  It is emitted only when ALIGN is less than
+      DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be copied.
+      We emit either a jump tree on power of two sized blocks, or a byte loop.
+
+   3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
+      with the specified algorithm.
+
+   4) Epilogue: code copying the tail of the block that is too small to be
+      handled by main body (or up to size guarded by prologue guard).  */
+
 int
-ix86_expand_clrmem (rtx dst, rtx count_exp, rtx align_exp)
+ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
+		    rtx expected_align_exp, rtx expected_size_exp)
 {
-  rtx destreg, zeroreg, countreg, destexp;
-  enum machine_mode counter_mode;
-  HOST_WIDE_INT align = 0;
+  rtx destreg;
+  rtx srcreg;
+  rtx label = NULL;
+  rtx tmp;
+  rtx jump_around_label = NULL;
+  HOST_WIDE_INT align = 1;
   unsigned HOST_WIDE_INT count = 0;
+  HOST_WIDE_INT expected_size = -1;
+  int size_needed = 0, epilogue_size_needed;
+  int desired_align = 0;
+  enum stringop_alg alg;
+  int dynamic_check;

-  if (GET_CODE (align_exp) == CONST_INT)
+  if (CONST_INT_P (align_exp))
     align = INTVAL (align_exp);
+  /* i386 can do misaligned access at a reasonably increased cost.  */
+  if (CONST_INT_P (expected_align_exp)
+      && INTVAL (expected_align_exp) > align)
+    align = INTVAL (expected_align_exp);
+  if (CONST_INT_P (count_exp))
+    count = expected_size = INTVAL (count_exp);
+  if (CONST_INT_P (expected_size_exp) && count == 0)
+    expected_size = INTVAL (expected_size_exp);

-  /* Can't use any of this if the user has appropriated esi.  */
-  if (global_regs[4])
-    return 0;
+  /* Step 0: Decide on preferred algorithm, desired alignment and
+     size of chunks to be copied by main loop.  */
+
+  alg = decide_alg (count, expected_size, false, &dynamic_check);
+  desired_align = decide_alignment (align, alg, expected_size);

-  /* This simple hack avoids all inlining code and simplifies code below.  */
   if (!TARGET_ALIGN_STRINGOPS)
-    align = 32;
+    align = desired_align;
+
+  if (alg == libcall)
+    return 0;
+  gcc_assert (alg != no_stringop);
+  if (!count)
+    count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
+  destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
+  srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
+  switch (alg)
+    {
+    case libcall:
+    case no_stringop:
+      gcc_unreachable ();
+    case loop:
+      size_needed = GET_MODE_SIZE (Pmode);
+      break;
+    case unrolled_loop:
+      size_needed = GET_MODE_SIZE (Pmode) * (TARGET_64BIT ? 4 : 2);
+      break;
+    case rep_prefix_8_byte:
+      size_needed = 8;
+      break;
+    case rep_prefix_4_byte:
+      size_needed = 4;
+      break;
+    case rep_prefix_1_byte:
+    case loop_1_byte:
+      size_needed = 1;
+      break;
+    }
+
+  epilogue_size_needed = size_needed;
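
[Editorial sketch, not part of the patch: before the individual steps are emitted below, this is the overall shape the expander produces, as plain C.  The chunk size of 8 and the function name are assumptions for the sketch, not what any particular CPU tuning actually chooses.]

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define CHUNK 8   /* stands in for SIZE_NEEDED chosen in step 0 */

/* Illustrative only: guard, alignment prologue, main body, epilogue.  */
static void
movmem_shape (unsigned char *dst, const unsigned char *src, size_t n)
{
  if (n >= CHUNK)                               /* 1) prologue guard */
    {
      while ((uintptr_t) dst & (CHUNK - 1))     /* 2) alignment prologue */
        {
          *dst++ = *src++;
          n--;
        }
      for (; n >= CHUNK; n -= CHUNK, dst += CHUNK, src += CHUNK)
        memcpy (dst, src, CHUNK);               /* 3) main body */
    }
  while (n--)                                   /* 4) epilogue */
    *dst++ = *src++;
}
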
+
+  /* Step 1: Prologue guard.  */
+
+  /* Alignment code needs count to be in register.  */
+  if (CONST_INT_P (count_exp) && desired_align > align)
+    {
+      enum machine_mode mode = SImode;
+      if (TARGET_64BIT && (count & ~0xffffffff))
+	mode = DImode;
+      count_exp = force_reg (mode, count_exp);
+    }
+  gcc_assert (desired_align >= 1 && align >= 1);
+
+  /* Ensure that alignment prologue won't copy past end of block.  */
+  if (size_needed > 1 || (desired_align > 1 && desired_align > align))
+    {
+      epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
+      /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
+	 Make sure it is power of 2.  */
+      epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
+
+      label = gen_label_rtx ();
+      emit_cmp_and_jump_insns (count_exp,
+			       GEN_INT (epilogue_size_needed),
+			       LTU, 0, counter_mode (count_exp), 1, label);
+      if (GET_CODE (count_exp) == CONST_INT)
+	;
+      else if (expected_size == -1 || expected_size < epilogue_size_needed)
+	predict_jump (REG_BR_PROB_BASE * 60 / 100);
+      else
+	predict_jump (REG_BR_PROB_BASE * 20 / 100);
+    }
+  /* Emit code to decide at runtime whether a library call or inline code
+     should be used.  */
+  if (dynamic_check != -1)
+    {
+      rtx hot_label = gen_label_rtx ();
+      jump_around_label = gen_label_rtx ();
+      emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
+			       LEU, 0, GET_MODE (count_exp), 1, hot_label);
+      predict_jump (REG_BR_PROB_BASE * 90 / 100);
+      emit_block_move_via_libcall (dst, src, count_exp, false);
+      emit_jump (jump_around_label);
+      emit_label (hot_label);
+    }

-  if (GET_CODE (count_exp) == CONST_INT)
+  /* Step 2: Alignment prologue.  */
+
+  if (desired_align > align)
+    {
+      /* Except for the first move in epilogue, we no longer know
+	 constant offset in aliasing info.  It doesn't seem worth
+	 the pain to maintain it for the first move, so throw away
+	 the info early.  */
+      src = change_address (src, BLKmode, srcreg);
+      dst = change_address (dst, BLKmode, destreg);
+      expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
+			      desired_align);
+    }
+  if (label && size_needed == 1)
+    {
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+      label = NULL;
+    }
+
+  /* Step 3: Main loop.  */
+
+  switch (alg)
+    {
+    case libcall:
+    case no_stringop:
+      gcc_unreachable ();
+    case loop_1_byte:
+      expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
+				     count_exp, QImode, 1, expected_size);
+      break;
+    case loop:
+      expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
+				     count_exp, Pmode, 1, expected_size);
+      break;
+    case unrolled_loop:
+      /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
+	 registers for 4 temporaries anyway.  */
+      expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
+				     count_exp, Pmode, TARGET_64BIT ? 4 : 2,
+				     expected_size);
+      break;
+    case rep_prefix_8_byte:
+      expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
+				 DImode);
+      break;
+    case rep_prefix_4_byte:
+      expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
+				 SImode);
+      break;
+    case rep_prefix_1_byte:
+      expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
+				 QImode);
+      break;
+    }
+  /* Adjust properly the offset of src and dest memory for aliasing.  */
+  if (CONST_INT_P (count_exp))
     {
-      count = INTVAL (count_exp);
-      if (!TARGET_INLINE_ALL_STRINGOPS && count > 64)
-	return 0;
+      src = adjust_automodify_address_nv (src, BLKmode, srcreg,
+					  (count / size_needed) * size_needed);
+      dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
+					  (count / size_needed) * size_needed);
     }
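
[Editorial sketch, not part of the patch: for the unrolled_loop case above, the expander asks for four Pmode-sized moves per iteration in 64-bit mode and two in 32-bit mode.  A C rendition of the 64-bit shape, with memcpy standing in for the word moves; the name is an assumption.]

#include <stddef.h>
#include <string.h>

/* Illustrative only: 4 x 8-byte moves per iteration; the n % 32 tail is
   left for the epilogue, as in step 4 above.  */
static void
copy_unrolled_64 (unsigned char *dst, const unsigned char *src, size_t n)
{
  size_t i;
  for (i = 0; i + 32 <= n; i += 32)
    {
      unsigned long long w0, w1, w2, w3;
      memcpy (&w0, src + i, 8);
      memcpy (&w1, src + i + 8, 8);
      memcpy (&w2, src + i + 16, 8);
      memcpy (&w3, src + i + 24, 8);
      memcpy (dst + i, &w0, 8);
      memcpy (dst + i + 8, &w1, 8);
      memcpy (dst + i + 16, &w2, 8);
      memcpy (dst + i + 24, &w3, 8);
    }
}
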
-  /* Figure out proper mode for counter.  For 32bits it is always SImode,
-     for 64bits use SImode when possible, otherwise DImode.
-     Set count to number of bytes copied when known at compile time.  */
-  if (!TARGET_64BIT
-      || GET_MODE (count_exp) == SImode
-      || x86_64_zext_immediate_operand (count_exp, VOIDmode))
-    counter_mode = SImode;
   else
-    counter_mode = DImode;
-
-  destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
-  if (destreg != XEXP (dst, 0))
-    dst = replace_equiv_address_nv (dst, destreg);
-
+    {
+      src = change_address (src, BLKmode, srcreg);
+      dst = change_address (dst, BLKmode, destreg);
+    }

-  /* When optimizing for size emit simple rep ; movsb instruction for
-     counts not divisible by 4.  The movl $N, %ecx; rep; stosb
-     sequence is 7 bytes long, so if optimizing for size and count is
-     small enough that some stosl, stosw and stosb instructions without
-     rep are shorter, fall back into the next if.  */
+  /* Step 4: Epilogue to copy the remaining bytes.  */

-  if ((!optimize || optimize_size)
-      && (count == 0
-	  || ((count & 0x03)
-	      && (!optimize_size || (count & 0x03) + (count >> 2) > 7))))
+  if (label)
     {
-      emit_insn (gen_cld ());
+      /* When the main loop is done, COUNT_EXP might hold original count,
+	 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
+	 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
+	 bytes.  Compensate if needed.  */

-      countreg = ix86_zero_extend_to_Pmode (count_exp);
-      zeroreg = copy_to_mode_reg (QImode, const0_rtx);
-      destexp = gen_rtx_PLUS (Pmode, destreg, countreg);
-      emit_insn (gen_rep_stos (destreg, countreg, dst, zeroreg, destexp));
+      if (size_needed < epilogue_size_needed)
+	{
+	  tmp =
+	    expand_simple_binop (counter_mode (count_exp), AND, count_exp,
+				 GEN_INT (size_needed - 1), count_exp, 1,
+				 OPTAB_DIRECT);
+	  if (tmp != count_exp)
+	    emit_move_insn (count_exp, tmp);
+	}
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
     }
-  else if (count != 0
-	   && (align >= 8
-	       || (!TARGET_PENTIUMPRO && !TARGET_64BIT && align >= 4)
-	       || optimize_size || count < (unsigned int) 64))
-    {
-      int size = TARGET_64BIT && !optimize_size ? 8 : 4;
-      unsigned HOST_WIDE_INT offset = 0;
-      emit_insn (gen_cld ());
+  if (count_exp != const0_rtx && epilogue_size_needed > 1)
+    expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
+			    epilogue_size_needed);
+  if (jump_around_label)
+    emit_label (jump_around_label);
+  return 1;
+}

-      zeroreg = copy_to_mode_reg (size == 4 ? SImode : DImode, const0_rtx);
-      if (count & ~(size - 1))
-	{
-	  unsigned HOST_WIDE_INT repcount;
-	  unsigned int max_nonrep;
+/* Helper function for memset.  For QImode value 0xXY produce
+   0xXYXYXYXY of the width specified by MODE.  This is essentially
+   a * 0x10101010, but we can do slightly better than
+   synth_mult by unwinding the sequence by hand on CPUs with
+   slow multiply.  */
+static rtx
+promote_duplicated_reg (enum machine_mode mode, rtx val)
+{
+  enum machine_mode valmode = GET_MODE (val);
+  rtx tmp;
+  int nops = mode == DImode ? 3 : 2;

-	  repcount = count >> (size == 4 ? 2 : 3);
-	  if (!TARGET_64BIT)
-	    repcount &= 0x3fffffff;
+  gcc_assert (mode == SImode || mode == DImode);
+  if (val == const0_rtx)
+    return copy_to_mode_reg (mode, const0_rtx);
+  if (CONST_INT_P (val))
+    {
+      HOST_WIDE_INT v = INTVAL (val) & 255;

-	  /* movl $N, %ecx; rep; stosl is 7 bytes, while N x stosl is N bytes.
-	     movl $N, %ecx; rep; stosq is 8 bytes, while N x stosq is 2xN
-	     bytes.  In both cases the latter seems to be faster for small
-	     values of N.  */
-	  max_nonrep = size == 4 ?
7 : 4; - if (!optimize_size) - switch (ix86_tune) - { - case PROCESSOR_PENTIUM4: - case PROCESSOR_NOCONA: - max_nonrep = 3; - break; - default: - break; - } + v |= v << 8; + v |= v << 16; + if (mode == DImode) + v |= (v << 16) << 16; + return copy_to_mode_reg (mode, gen_int_mode (v, mode)); + } - if (repcount <= max_nonrep) - while (repcount-- > 0) - { - rtx mem = adjust_automodify_address_nv (dst, - GET_MODE (zeroreg), - destreg, offset); - emit_insn (gen_strset (destreg, mem, zeroreg)); - offset += size; - } - else - { - countreg = copy_to_mode_reg (counter_mode, GEN_INT (repcount)); - countreg = ix86_zero_extend_to_Pmode (countreg); - destexp = gen_rtx_ASHIFT (Pmode, countreg, - GEN_INT (size == 4 ? 2 : 3)); - destexp = gen_rtx_PLUS (Pmode, destexp, destreg); - emit_insn (gen_rep_stos (destreg, countreg, dst, zeroreg, - destexp)); - offset = count & ~(size - 1); - } - } - if (size == 8 && (count & 0x04)) - { - rtx mem = adjust_automodify_address_nv (dst, SImode, destreg, - offset); - emit_insn (gen_strset (destreg, mem, - gen_rtx_SUBREG (SImode, zeroreg, 0))); - offset += 4; - } - if (count & 0x02) - { - rtx mem = adjust_automodify_address_nv (dst, HImode, destreg, - offset); - emit_insn (gen_strset (destreg, mem, - gen_rtx_SUBREG (HImode, zeroreg, 0))); - offset += 2; - } - if (count & 0x01) - { - rtx mem = adjust_automodify_address_nv (dst, QImode, destreg, - offset); - emit_insn (gen_strset (destreg, mem, - gen_rtx_SUBREG (QImode, zeroreg, 0))); - } + if (valmode == VOIDmode) + valmode = QImode; + if (valmode != QImode) + val = gen_lowpart (QImode, val); + if (mode == QImode) + return val; + if (!TARGET_PARTIAL_REG_STALL) + nops--; + if (ix86_cost->mult_init[mode == DImode ? 3 : 2] + + ix86_cost->mult_bit * (mode == DImode ? 8 : 4) + <= (ix86_cost->shift_const + ix86_cost->add) * nops + + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0))) + { + rtx reg = convert_modes (mode, QImode, val, true); + tmp = promote_duplicated_reg (mode, const1_rtx); + return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1, + OPTAB_DIRECT); } else { - rtx countreg2; - rtx label = NULL; - /* Compute desired alignment of the string operation. */ - int desired_alignment = (TARGET_PENTIUMPRO - && (count == 0 || count >= (unsigned int) 260) - ? 8 : UNITS_PER_WORD); - - /* In case we don't know anything about the alignment, default to - library version, since it is usually equally fast and result in - shorter code. - - Also emit call when we know that the count is large and call overhead - will not be important. */ - if (!TARGET_INLINE_ALL_STRINGOPS - && (align < UNITS_PER_WORD || !TARGET_REP_MOVL_OPTIMAL)) - return 0; - - if (TARGET_SINGLE_STRINGOP) - emit_insn (gen_cld ()); - - countreg2 = gen_reg_rtx (Pmode); - countreg = copy_to_mode_reg (counter_mode, count_exp); - zeroreg = copy_to_mode_reg (Pmode, const0_rtx); - /* Get rid of MEM_OFFSET, it won't be accurate. 
*/ - dst = change_address (dst, BLKmode, destreg); - - if (count == 0 && align < desired_alignment) - { - label = gen_label_rtx (); - emit_cmp_and_jump_insns (countreg, GEN_INT (desired_alignment - 1), - LEU, 0, counter_mode, 1, label); - } - if (align <= 1) - { - rtx label = ix86_expand_aligntest (destreg, 1); - emit_insn (gen_strset (destreg, dst, - gen_rtx_SUBREG (QImode, zeroreg, 0))); - ix86_adjust_counter (countreg, 1); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (align <= 2) - { - rtx label = ix86_expand_aligntest (destreg, 2); - emit_insn (gen_strset (destreg, dst, - gen_rtx_SUBREG (HImode, zeroreg, 0))); - ix86_adjust_counter (countreg, 2); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (align <= 4 && desired_alignment > 4) - { - rtx label = ix86_expand_aligntest (destreg, 4); - emit_insn (gen_strset (destreg, dst, - (TARGET_64BIT - ? gen_rtx_SUBREG (SImode, zeroreg, 0) - : zeroreg))); - ix86_adjust_counter (countreg, 4); - emit_label (label); - LABEL_NUSES (label) = 1; - } - - if (label && desired_alignment > 4 && !TARGET_64BIT) - { - emit_label (label); - LABEL_NUSES (label) = 1; - label = NULL_RTX; - } + rtx reg = convert_modes (mode, QImode, val, true); - if (!TARGET_SINGLE_STRINGOP) - emit_insn (gen_cld ()); - if (TARGET_64BIT) - { - emit_insn (gen_lshrdi3 (countreg2, ix86_zero_extend_to_Pmode (countreg), - GEN_INT (3))); - destexp = gen_rtx_ASHIFT (Pmode, countreg2, GEN_INT (3)); - } + if (!TARGET_PARTIAL_REG_STALL) + if (mode == SImode) + emit_insn (gen_movsi_insv_1 (reg, reg)); + else + emit_insn (gen_movdi_insv_1_rex64 (reg, reg)); else { - emit_insn (gen_lshrsi3 (countreg2, countreg, const2_rtx)); - destexp = gen_rtx_ASHIFT (Pmode, countreg2, const2_rtx); + tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8), + NULL, 1, OPTAB_DIRECT); + reg = + expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); } - destexp = gen_rtx_PLUS (Pmode, destexp, destreg); - emit_insn (gen_rep_stos (destreg, countreg2, dst, zeroreg, destexp)); + tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16), + NULL, 1, OPTAB_DIRECT); + reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); + if (mode == SImode) + return reg; + tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32), + NULL, 1, OPTAB_DIRECT); + reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); + return reg; + } +} - if (label) - { - emit_label (label); - LABEL_NUSES (label) = 1; - } +/* Duplicate value VAL using promote_duplicated_reg into maximal size that will + be needed by main loop copying SIZE_NEEDED chunks and prologue getting + alignment from ALIGN to DESIRED_ALIGN. 
*/ +static rtx +promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align) +{ + rtx promoted_val; - if (TARGET_64BIT && align > 4 && count != 0 && (count & 4)) - emit_insn (gen_strset (destreg, dst, - gen_rtx_SUBREG (SImode, zeroreg, 0))); - if (TARGET_64BIT && (align <= 4 || count == 0)) - { - rtx label = ix86_expand_aligntest (countreg, 4); - emit_insn (gen_strset (destreg, dst, - gen_rtx_SUBREG (SImode, zeroreg, 0))); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (align > 2 && count != 0 && (count & 2)) - emit_insn (gen_strset (destreg, dst, - gen_rtx_SUBREG (HImode, zeroreg, 0))); - if (align <= 2 || count == 0) - { - rtx label = ix86_expand_aligntest (countreg, 2); - emit_insn (gen_strset (destreg, dst, - gen_rtx_SUBREG (HImode, zeroreg, 0))); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (align > 1 && count != 0 && (count & 1)) - emit_insn (gen_strset (destreg, dst, - gen_rtx_SUBREG (QImode, zeroreg, 0))); - if (align <= 1 || count == 0) - { - rtx label = ix86_expand_aligntest (countreg, 1); - emit_insn (gen_strset (destreg, dst, - gen_rtx_SUBREG (QImode, zeroreg, 0))); - emit_label (label); - LABEL_NUSES (label) = 1; - } - } - return 1; + if (TARGET_64BIT + && (size_needed > 4 || (desired_align > align && desired_align > 4))) + promoted_val = promote_duplicated_reg (DImode, val); + else if (size_needed > 2 || (desired_align > align && desired_align > 2)) + promoted_val = promote_duplicated_reg (SImode, val); + else if (size_needed > 1 || (desired_align > align && desired_align > 1)) + promoted_val = promote_duplicated_reg (HImode, val); + else + promoted_val = val; + + return promoted_val; } -/* Expand strlen. */ +/* Expand string clear operation (bzero). Use i386 string operations when + profitable. See expand_movmem comment for explanation of individual + steps performed. */ int -ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align) +ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp, + rtx expected_align_exp, rtx expected_size_exp) { - rtx addr, scratch1, scratch2, scratch3, scratch4; + rtx destreg; + rtx label = NULL; + rtx tmp; + rtx jump_around_label = NULL; + HOST_WIDE_INT align = 1; + unsigned HOST_WIDE_INT count = 0; + HOST_WIDE_INT expected_size = -1; + int size_needed = 0, epilogue_size_needed; + int desired_align = 0; + enum stringop_alg alg; + rtx promoted_val = NULL; + bool force_loopy_epilogue = false; + int dynamic_check; + + if (CONST_INT_P (align_exp)) + align = INTVAL (align_exp); + /* i386 can do misaligned access on reasonably increased cost. */ + if (CONST_INT_P (expected_align_exp) + && INTVAL (expected_align_exp) > align) + align = INTVAL (expected_align_exp); + if (CONST_INT_P (count_exp)) + count = expected_size = INTVAL (count_exp); + if (CONST_INT_P (expected_size_exp) && count == 0) + expected_size = INTVAL (expected_size_exp); - /* The generic case of strlen expander is long. Avoid it's - expanding unless TARGET_INLINE_ALL_STRINGOPS. */ + /* Step 0: Decide on preferred algorithm, desired alignment and + size of chunks to be copied by main loop. 
+  */
-  if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
-      && !TARGET_INLINE_ALL_STRINGOPS
-      && !optimize_size
-      && (GET_CODE (align) != CONST_INT || INTVAL (align) < 4))
-    return 0;
+  alg = decide_alg (count, expected_size, true, &dynamic_check);
+  desired_align = decide_alignment (align, alg, expected_size);

-  addr = force_reg (Pmode, XEXP (src, 0));
-  scratch1 = gen_reg_rtx (Pmode);
+  if (!TARGET_ALIGN_STRINGOPS)
+    align = desired_align;

-  if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
-      && !optimize_size)
+  if (alg == libcall)
+    return 0;
+  gcc_assert (alg != no_stringop);
+  if (!count)
+    count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
+  destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
+  switch (alg)
     {
-      /* Well it seems that some optimizer does not combine a call like
-	 foo(strlen(bar), strlen(bar));
-	 when the move and the subtraction is done here.  It does calculate
-	 the length just once when these instructions are done inside of
-	 output_strlen_unroll().  But I think since &bar[strlen(bar)] is
-	 often used and I use one fewer register for the lifetime of
-	 output_strlen_unroll() this is better.  */
+    case libcall:
+    case no_stringop:
+      gcc_unreachable ();
+    case loop:
+      size_needed = GET_MODE_SIZE (Pmode);
+      break;
+    case unrolled_loop:
+      size_needed = GET_MODE_SIZE (Pmode) * 4;
+      break;
+    case rep_prefix_8_byte:
+      size_needed = 8;
+      break;
+    case rep_prefix_4_byte:
+      size_needed = 4;
+      break;
+    case rep_prefix_1_byte:
+    case loop_1_byte:
+      size_needed = 1;
+      break;
+    }
+  epilogue_size_needed = size_needed;
+
+  /* Step 1: Prologue guard.  */
+
+  /* Alignment code needs count to be in register.  */
+  if (CONST_INT_P (count_exp) && desired_align > align)
+    {
+      enum machine_mode mode = SImode;
+      if (TARGET_64BIT && (count & ~0xffffffff))
+	mode = DImode;
+      count_exp = force_reg (mode, count_exp);
+    }
+  /* Do the cheap promotion to allow better CSE across the
+     main loop and epilogue (i.e., one load of the big constant in
+     front of all the code).  */
+  if (CONST_INT_P (val_exp))
+    promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
+						   desired_align, align);
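
[Editorial sketch, not part of the patch: the promotion just performed replicates the constant fill byte across the whole word; for a non-constant value promote_duplicated_reg emits the equivalent shift-and-or sequence.  In C the idea is simply:]

/* Illustrative only: 0xXY -> 0xXYXYXYXYXYXYXYXY, the DImode variant of
   the v |= v << 8 / << 16 / << 32 chain used by promote_duplicated_reg.  */
static unsigned long long
promote_byte (unsigned char val)
{
  unsigned long long v = val;
  v |= v << 8;
  v |= v << 16;
  v |= v << 32;
  return v;
}
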
+  /* Ensure that alignment prologue won't copy past end of block.  */
+  if (size_needed > 1 || (desired_align > 1 && desired_align > align))
+    {
+      epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
+      /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
+	 Make sure it is power of 2.  */
+      epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
+
+      /* To improve performance of small blocks, we jump around the VAL
+	 promoting code.  This means that if the promoted VAL is not constant,
+	 we might not use it in the epilogue and have to use the byte
+	 loop variant.  */
+      if (epilogue_size_needed > 2 && !promoted_val)
+	force_loopy_epilogue = true;
+      label = gen_label_rtx ();
+      emit_cmp_and_jump_insns (count_exp,
+			       GEN_INT (epilogue_size_needed),
+			       LTU, 0, counter_mode (count_exp), 1, label);
+      if (GET_CODE (count_exp) == CONST_INT)
+	;
+      else if (expected_size == -1 || expected_size <= epilogue_size_needed)
+	predict_jump (REG_BR_PROB_BASE * 60 / 100);
+      else
+	predict_jump (REG_BR_PROB_BASE * 20 / 100);
+    }
+  if (dynamic_check != -1)
+    {
+      rtx hot_label = gen_label_rtx ();
+      jump_around_label = gen_label_rtx ();
+      emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
+			       LEU, 0, counter_mode (count_exp), 1, hot_label);
+      predict_jump (REG_BR_PROB_BASE * 90 / 100);
+      set_storage_via_libcall (dst, count_exp, val_exp, false);
+      emit_jump (jump_around_label);
+      emit_label (hot_label);
+    }

-  emit_move_insn (out, addr);
+  /* Step 2: Alignment prologue.  */

-  ix86_expand_strlensi_unroll_1 (out, src, align);
+  /* Do the expensive promotion once we branched off the small blocks.  */
+  if (!promoted_val)
+    promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
+						   desired_align, align);
+  gcc_assert (desired_align >= 1 && align >= 1);

-  /* strlensi_unroll_1 returns the address of the zero at the end of
-     the string, like memchr(), so compute the length by subtracting
-     the start address.  */
-  if (TARGET_64BIT)
-    emit_insn (gen_subdi3 (out, out, addr));
-  else
-    emit_insn (gen_subsi3 (out, out, addr));
+  if (desired_align > align)
+    {
+      /* Except for the first move in epilogue, we no longer know
+	 constant offset in aliasing info.  It doesn't seem worth
+	 the pain to maintain it for the first move, so throw away
+	 the info early.  */
+      dst = change_address (dst, BLKmode, destreg);
+      expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
+			      desired_align);
     }
-  else
+  if (label && size_needed == 1)
     {
-      rtx unspec;
-      scratch2 = gen_reg_rtx (Pmode);
-      scratch3 = gen_reg_rtx (Pmode);
-      scratch4 = force_reg (Pmode, constm1_rtx);
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+      label = NULL;
+    }

-      emit_move_insn (scratch3, addr);
-      eoschar = force_reg (QImode, eoschar);
+  /* Step 3: Main loop.  */

-      emit_insn (gen_cld ());
-      src = replace_equiv_address_nv (src, scratch3);
+  switch (alg)
+    {
+    case libcall:
+    case no_stringop:
+      gcc_unreachable ();
+    case loop_1_byte:
+      expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
+				     count_exp, QImode, 1, expected_size);
+      break;
+    case loop:
+      expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
+				     count_exp, Pmode, 1, expected_size);
+      break;
+    case unrolled_loop:
+      expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
+				     count_exp, Pmode, 4, expected_size);
+      break;
+    case rep_prefix_8_byte:
+      expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
+				  DImode);
+      break;
+    case rep_prefix_4_byte:
+      expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
+				  SImode);
+      break;
+    case rep_prefix_1_byte:
+      expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
+				  QImode);
+      break;
+    }
+  /* Adjust properly the offset of src and dest memory for aliasing.  */
+  if (CONST_INT_P (count_exp))
+    dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
+					(count / size_needed) * size_needed);
+  else
+    dst = change_address (dst, BLKmode, destreg);
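
[Editorial sketch, not part of the patch: the dynamic_check path above is what -minline-stringops-dynamically buys — blocks below the threshold stay inline, larger ones fall back to the library.  A hedged C model; the threshold value here is a made-up example, the real one comes from decide_alg and the cost tables.]

#include <stddef.h>
#include <string.h>

/* Illustrative only: runtime split between inline expansion and libcall.  */
static void
setmem_dynamic (unsigned char *dst, unsigned char value, size_t n)
{
  const size_t threshold = 4096;   /* assumed; really from decide_alg */

  if (n <= threshold - 1)          /* predicted taken 90% of the time */
    {
      size_t i;                    /* stands in for the inline code */
      for (i = 0; i < n; i++)
        dst[i] = value;
    }
  else
    memset (dst, value, n);        /* set_storage_via_libcall */
}
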

-      /* If .md starts supporting :P, this can be done in .md.  */
-      unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
-						 scratch4), UNSPEC_SCAS);
-      emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
-      if (TARGET_64BIT)
+  /* Step 4: Epilogue to copy the remaining bytes.  */
+
+  if (label)
+    {
+      /* When the main loop is done, COUNT_EXP might hold original count,
+	 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
+	 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
+	 bytes.  Compensate if needed.  */
+
+      if (size_needed < desired_align - align)
 	{
-	  emit_insn (gen_one_cmpldi2 (scratch2, scratch1));
-	  emit_insn (gen_adddi3 (out, scratch2, constm1_rtx));
+	  tmp =
+	    expand_simple_binop (counter_mode (count_exp), AND, count_exp,
+				 GEN_INT (size_needed - 1), count_exp, 1,
+				 OPTAB_DIRECT);
+	  size_needed = desired_align - align + 1;
+	  if (tmp != count_exp)
+	    emit_move_insn (count_exp, tmp);
 	}
+      emit_label (label);
+      LABEL_NUSES (label) = 1;
+    }
+  if (count_exp != const0_rtx && epilogue_size_needed > 1)
+    {
+      if (force_loopy_epilogue)
+	expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
+					 size_needed);
       else
-	{
-	  emit_insn (gen_one_cmplsi2 (scratch2, scratch1));
-	  emit_insn (gen_addsi3 (out, scratch2, constm1_rtx));
-	}
+	expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
+				size_needed);
     }
+  if (jump_around_label)
+    emit_label (jump_around_label);
   return 1;
 }

@@ -13384,7 +14871,7 @@ ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
   rtx cmp;

   align = 0;
-  if (GET_CODE (align_rtx) == CONST_INT)
+  if (CONST_INT_P (align_rtx))
     align = INTVAL (align_rtx);

   /* Loop to check 1..3 bytes for null to get an aligned pointer.  */
@@ -13546,7 +15033,95 @@ ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
   else
     emit_insn (gen_subsi3_carry (out, out, GEN_INT (3), cmp));

-  emit_label (end_0_label);
+  emit_label (end_0_label);
+}
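
[Editorial sketch, not part of the patch: the UNSPEC_SCAS path in the strlen expander below boils down to the classic repne scasb idiom.  A rendition as GNU inline asm for x86-64 — illustrative only, and it assumes the direction flag is clear, as the psABI guarantees at function entry.]

#include <stddef.h>

/* Illustrative only: the count register starts at -1 (scratch4 =
   constm1_rtx); the length is recovered by one's complement minus one,
   matching the gen_one_cmpl*2 / add -1 pair in the code below.  */
static size_t
strlen_scasb (const char *s)
{
  size_t count = (size_t) -1;
  __asm__ ("repnz scasb"
	   : "+D" (s), "+c" (count)
	   : "a" (0)
	   : "cc", "memory");
  return ~count - 1;
}
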
+
+/* Expand strlen.  */
+
+int
+ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
+{
+  rtx addr, scratch1, scratch2, scratch3, scratch4;
+
+  /* The generic case of the strlen expander is long.  Avoid expanding it
+     unless TARGET_INLINE_ALL_STRINGOPS.  */
+
+  if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
+      && !TARGET_INLINE_ALL_STRINGOPS
+      && !optimize_size
+      && (!CONST_INT_P (align) || INTVAL (align) < 4))
+    return 0;
+
+  addr = force_reg (Pmode, XEXP (src, 0));
+  scratch1 = gen_reg_rtx (Pmode);
+
+  if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
+      && !optimize_size)
+    {
+      /* Well it seems that some optimizer does not combine a call like
+	 foo(strlen(bar), strlen(bar));
+	 when the move and the subtraction is done here.  It does calculate
+	 the length just once when these instructions are done inside of
+	 output_strlen_unroll().  But I think since &bar[strlen(bar)] is
+	 often used and I use one fewer register for the lifetime of
+	 output_strlen_unroll() this is better.  */
+
+      emit_move_insn (out, addr);
+
+      ix86_expand_strlensi_unroll_1 (out, src, align);
+
+      /* strlensi_unroll_1 returns the address of the zero at the end of
+	 the string, like memchr(), so compute the length by subtracting
+	 the start address.  */
+      if (TARGET_64BIT)
+	emit_insn (gen_subdi3 (out, out, addr));
+      else
+	emit_insn (gen_subsi3 (out, out, addr));
+    }
+  else
+    {
+      rtx unspec;
+      scratch2 = gen_reg_rtx (Pmode);
+      scratch3 = gen_reg_rtx (Pmode);
+      scratch4 = force_reg (Pmode, constm1_rtx);
+
+      emit_move_insn (scratch3, addr);
+      eoschar = force_reg (QImode, eoschar);
+
+      src = replace_equiv_address_nv (src, scratch3);
+
+      /* If .md starts supporting :P, this can be done in .md.  */
+      unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
+						 scratch4), UNSPEC_SCAS);
+      emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
+      if (TARGET_64BIT)
+	{
+	  emit_insn (gen_one_cmpldi2 (scratch2, scratch1));
+	  emit_insn (gen_adddi3 (out, scratch2, constm1_rtx));
+	}
+      else
+	{
+	  emit_insn (gen_one_cmplsi2 (scratch2, scratch1));
+	  emit_insn (gen_addsi3 (out, scratch2, constm1_rtx));
+	}
+    }
+  return 1;
+}
+
+/* For a given symbol (function), construct code to compute the address
+   of its PLT entry in the large x86-64 PIC model.  */
+rtx
+construct_plt_address (rtx symbol)
+{
+  rtx tmp = gen_reg_rtx (Pmode);
+  rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
+
+  gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
+  gcc_assert (ix86_cmodel == CM_LARGE_PIC);
+
+  emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
+  emit_insn (gen_adddi3 (tmp, tmp, pic_offset_table_rtx));
+  return tmp;
 }

 void
@@ -13570,7 +15145,7 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
   else
     {
       /* Static functions and indirect calls don't need the pic register.  */
-      if (! TARGET_64BIT && flag_pic
+      if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
	  && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
	  && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
	use_reg (&use, pic_offset_table_rtx);
@@ -13583,7 +15158,12 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
       use_reg (&use, al);
     }

-  if (! call_insn_operand (XEXP (fnaddr, 0), Pmode))
+  if (ix86_cmodel == CM_LARGE_PIC
+      && GET_CODE (fnaddr) == MEM
+      && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
+      && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
+    fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
+  else if (!
call_insn_operand (XEXP (fnaddr, 0), Pmode)) { fnaddr = copy_to_mode_reg (Pmode, XEXP (fnaddr, 0)); fnaddr = gen_rtx_MEM (QImode, fnaddr); @@ -13593,7 +15173,7 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, { rtx addr; addr = copy_to_mode_reg (Pmode, XEXP (fnaddr, 0)); - fnaddr = gen_rtx_REG (Pmode, FIRST_REX_INT_REG + 3 /* R11 */); + fnaddr = gen_rtx_REG (Pmode, R11_REG); emit_move_insn (fnaddr, addr); fnaddr = gen_rtx_MEM (QImode, fnaddr); } @@ -13829,7 +15409,7 @@ ix86_attr_length_address_default (rtx insn) extract_insn_cached (insn); for (i = recog_data.n_operands - 1; i >= 0; --i) - if (GET_CODE (recog_data.operand[i]) == MEM) + if (MEM_P (recog_data.operand[i])) { return memory_address_length (XEXP (recog_data.operand[i], 0)); break; @@ -13852,11 +15432,15 @@ ix86_issue_rate (void) case PROCESSOR_PENTIUM4: case PROCESSOR_ATHLON: case PROCESSOR_K8: + case PROCESSOR_AMDFAM10: case PROCESSOR_NOCONA: case PROCESSOR_GENERIC32: case PROCESSOR_GENERIC64: return 3; + case PROCESSOR_CORE2: + return 4; + default: return 1; } @@ -13893,7 +15477,7 @@ ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type) else return 0; - if (GET_CODE (set) != REG || REGNO (set) != FLAGS_REG) + if (!REG_P (set) || REGNO (set) != FLAGS_REG) return 0; /* This test is true if the dependent insn reads the flags but @@ -13932,7 +15516,7 @@ ix86_agi_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type) int i; extract_insn_cached (insn); for (i = recog_data.n_operands - 1; i >= 0; --i) - if (GET_CODE (recog_data.operand[i]) == MEM) + if (MEM_P (recog_data.operand[i])) { addr = XEXP (recog_data.operand[i], 0); goto found; @@ -13995,7 +15579,7 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) && (set = single_set (dep_insn)) != NULL_RTX && (set2 = single_set (insn)) != NULL_RTX && rtx_equal_p (SET_DEST (set), SET_SRC (set2)) - && GET_CODE (SET_DEST (set2)) == MEM) + && MEM_P (SET_DEST (set2))) cost += 1; /* Show ability of reorder buffer to hide latency of load by executing @@ -14047,6 +15631,7 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) case PROCESSOR_ATHLON: case PROCESSOR_K8: + case PROCESSOR_AMDFAM10: case PROCESSOR_GENERIC32: case PROCESSOR_GENERIC64: memory = get_attr_memory (insn); @@ -14133,7 +15718,7 @@ ix86_constant_alignment (tree exp, int align) int ix86_data_alignment (tree type, int align) { - int max_align = optimize_size ? BITS_PER_WORD : 256; + int max_align = optimize_size ? 
BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT); if (AGGREGATE_TYPE_P (type) && TYPE_SIZE (type) @@ -14542,7 +16127,6 @@ enum ix86_builtins IX86_BUILTIN_CMPNGEPD, IX86_BUILTIN_CMPORDPD, IX86_BUILTIN_CMPUNORDPD, - IX86_BUILTIN_CMPNEPD, IX86_BUILTIN_CMPEQSD, IX86_BUILTIN_CMPLTSD, IX86_BUILTIN_CMPLESD, @@ -14551,7 +16135,6 @@ enum ix86_builtins IX86_BUILTIN_CMPNLESD, IX86_BUILTIN_CMPORDSD, IX86_BUILTIN_CMPUNORDSD, - IX86_BUILTIN_CMPNESD, IX86_BUILTIN_COMIEQSD, IX86_BUILTIN_COMILTSD, @@ -14678,14 +16261,6 @@ enum ix86_builtins IX86_BUILTIN_PSHUFLW, IX86_BUILTIN_PSHUFD, - IX86_BUILTIN_PSLLW128, - IX86_BUILTIN_PSLLD128, - IX86_BUILTIN_PSLLQ128, - IX86_BUILTIN_PSRAW128, - IX86_BUILTIN_PSRAD128, - IX86_BUILTIN_PSRLW128, - IX86_BUILTIN_PSRLD128, - IX86_BUILTIN_PSRLQ128, IX86_BUILTIN_PSLLDQI128, IX86_BUILTIN_PSLLWI128, IX86_BUILTIN_PSLLDI128, @@ -14697,6 +16272,16 @@ enum ix86_builtins IX86_BUILTIN_PSRLDI128, IX86_BUILTIN_PSRLQI128, + IX86_BUILTIN_PSLLDQ128, + IX86_BUILTIN_PSLLW128, + IX86_BUILTIN_PSLLD128, + IX86_BUILTIN_PSLLQ128, + IX86_BUILTIN_PSRAW128, + IX86_BUILTIN_PSRAD128, + IX86_BUILTIN_PSRLW128, + IX86_BUILTIN_PSRLD128, + IX86_BUILTIN_PSRLQ128, + IX86_BUILTIN_PUNPCKHBW128, IX86_BUILTIN_PUNPCKHWD128, IX86_BUILTIN_PUNPCKHDQ128, @@ -14759,6 +16344,14 @@ enum ix86_builtins IX86_BUILTIN_PABSW128, IX86_BUILTIN_PABSD128, + /* AMDFAM10 - SSE4A New Instructions. */ + IX86_BUILTIN_MOVNTSD, + IX86_BUILTIN_MOVNTSS, + IX86_BUILTIN_EXTRQI, + IX86_BUILTIN_EXTRQ, + IX86_BUILTIN_INSERTQI, + IX86_BUILTIN_INSERTQ, + IX86_BUILTIN_VEC_INIT_V2SI, IX86_BUILTIN_VEC_INIT_V4HI, IX86_BUILTIN_VEC_INIT_V8QI, @@ -14775,13 +16368,41 @@ enum ix86_builtins IX86_BUILTIN_MAX }; -#define def_builtin(MASK, NAME, TYPE, CODE) \ -do { \ - if ((MASK) & target_flags \ - && (!((MASK) & MASK_64BIT) || TARGET_64BIT)) \ - add_builtin_function ((NAME), (TYPE), (CODE), BUILT_IN_MD, \ - NULL, NULL_TREE); \ -} while (0) +/* Table for the ix86 builtin decls. */ +static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX]; + +/* Add a ix86 target builtin function with CODE, NAME and TYPE. Do so, + * if the target_flags include one of MASK. Stores the function decl + * in the ix86_builtins array. + * Returns the function decl or NULL_TREE, if the builtin was not added. */ + +static inline tree +def_builtin (int mask, const char *name, tree type, enum ix86_builtins code) +{ + tree decl = NULL_TREE; + + if (mask & target_flags + && (!(mask & MASK_64BIT) || TARGET_64BIT)) + { + decl = add_builtin_function (name, type, code, BUILT_IN_MD, + NULL, NULL_TREE); + ix86_builtins[(int) code] = decl; + } + + return decl; +} + +/* Like def_builtin, but also marks the function decl "const". */ + +static inline tree +def_builtin_const (int mask, const char *name, tree type, + enum ix86_builtins code) +{ + tree decl = def_builtin (mask, name, type, code); + if (decl) + TREE_READONLY (decl) = 1; + return decl; +} /* Bits for builtin_description.flag. 
*/ @@ -15170,8 +16791,8 @@ static const struct builtin_description bdesc_1arg[] = { MASK_SSE2, CODE_FOR_sse2_cvttps2dq, 0, IX86_BUILTIN_CVTTPS2DQ, 0, 0 }, /* SSE3 */ - { MASK_SSE3, CODE_FOR_sse3_movshdup, 0, IX86_BUILTIN_MOVSHDUP, 0, 0 }, - { MASK_SSE3, CODE_FOR_sse3_movsldup, 0, IX86_BUILTIN_MOVSLDUP, 0, 0 }, + { MASK_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, 0, 0 }, + { MASK_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, 0, 0 }, /* SSSE3 */ { MASK_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, 0, 0 }, @@ -15182,13 +16803,6 @@ static const struct builtin_description bdesc_1arg[] = { MASK_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, 0, 0 }, }; -static void -ix86_init_builtins (void) -{ - if (TARGET_MMX) - ix86_init_mmx_sse_builtins (); -} - /* Set up all the MMX/SSE builtins. This is not called if TARGET_MMX is zero. Otherwise, if TARGET_SSE is not set, only expand the MMX builtins. */ @@ -15198,7 +16812,7 @@ ix86_init_mmx_sse_builtins (void) const struct builtin_description * d; size_t i; - tree V16QI_type_node = build_vector_type_for_mode (intQI_type_node, V16QImode); + tree V16QI_type_node = build_vector_type_for_mode (char_type_node, V16QImode); tree V2SI_type_node = build_vector_type_for_mode (intSI_type_node, V2SImode); tree V2SF_type_node = build_vector_type_for_mode (float_type_node, V2SFmode); tree V2DI_type_node @@ -15207,7 +16821,7 @@ ix86_init_mmx_sse_builtins (void) tree V4SF_type_node = build_vector_type_for_mode (float_type_node, V4SFmode); tree V4SI_type_node = build_vector_type_for_mode (intSI_type_node, V4SImode); tree V4HI_type_node = build_vector_type_for_mode (intHI_type_node, V4HImode); - tree V8QI_type_node = build_vector_type_for_mode (intQI_type_node, V8QImode); + tree V8QI_type_node = build_vector_type_for_mode (char_type_node, V8QImode); tree V8HI_type_node = build_vector_type_for_mode (intHI_type_node, V8HImode); tree pchar_type_node = build_pointer_type (char_type_node); @@ -15459,12 +17073,6 @@ ix86_init_mmx_sse_builtins (void) tree v8hi_ftype_v8hi_int = build_function_type_list (V8HI_type_node, V8HI_type_node, integer_type_node, NULL_TREE); - tree v8hi_ftype_v8hi_v2di - = build_function_type_list (V8HI_type_node, - V8HI_type_node, V2DI_type_node, NULL_TREE); - tree v4si_ftype_v4si_v2di - = build_function_type_list (V4SI_type_node, - V4SI_type_node, V2DI_type_node, NULL_TREE); tree v4si_ftype_v8hi_v8hi = build_function_type_list (V4SI_type_node, V8HI_type_node, V8HI_type_node, NULL_TREE); @@ -15488,6 +17096,18 @@ ix86_init_mmx_sse_builtins (void) = build_function_type_list (void_type_node, pchar_type_node, V16QI_type_node, NULL_TREE); + tree v2di_ftype_v2di_unsigned_unsigned + = build_function_type_list (V2DI_type_node, V2DI_type_node, + unsigned_type_node, unsigned_type_node, + NULL_TREE); + tree v2di_ftype_v2di_v2di_unsigned_unsigned + = build_function_type_list (V2DI_type_node, V2DI_type_node, V2DI_type_node, + unsigned_type_node, unsigned_type_node, + NULL_TREE); + tree v2di_ftype_v2di_v16qi + = build_function_type_list (V2DI_type_node, V2DI_type_node, V16QI_type_node, + NULL_TREE); + tree float80_type; tree float128_type; tree ftype; @@ -15648,15 +17268,15 @@ ix86_init_mmx_sse_builtins (void) def_builtin (MASK_SSE, "__builtin_ia32_ldmxcsr", void_ftype_unsigned, IX86_BUILTIN_LDMXCSR); def_builtin (MASK_SSE, "__builtin_ia32_stmxcsr", unsigned_ftype_void, IX86_BUILTIN_STMXCSR); - def_builtin (MASK_SSE, "__builtin_ia32_cvtpi2ps", 
v4sf_ftype_v4sf_v2si, IX86_BUILTIN_CVTPI2PS); - def_builtin (MASK_SSE, "__builtin_ia32_cvtps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTPS2PI); - def_builtin (MASK_SSE, "__builtin_ia32_cvtsi2ss", v4sf_ftype_v4sf_int, IX86_BUILTIN_CVTSI2SS); - def_builtin (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvtsi642ss", v4sf_ftype_v4sf_int64, IX86_BUILTIN_CVTSI642SS); - def_builtin (MASK_SSE, "__builtin_ia32_cvtss2si", int_ftype_v4sf, IX86_BUILTIN_CVTSS2SI); - def_builtin (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvtss2si64", int64_ftype_v4sf, IX86_BUILTIN_CVTSS2SI64); - def_builtin (MASK_SSE, "__builtin_ia32_cvttps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTTPS2PI); - def_builtin (MASK_SSE, "__builtin_ia32_cvttss2si", int_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI); - def_builtin (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvttss2si64", int64_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI64); + def_builtin_const (MASK_SSE, "__builtin_ia32_cvtpi2ps", v4sf_ftype_v4sf_v2si, IX86_BUILTIN_CVTPI2PS); + def_builtin_const (MASK_SSE, "__builtin_ia32_cvtps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTPS2PI); + def_builtin_const (MASK_SSE, "__builtin_ia32_cvtsi2ss", v4sf_ftype_v4sf_int, IX86_BUILTIN_CVTSI2SS); + def_builtin_const (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvtsi642ss", v4sf_ftype_v4sf_int64, IX86_BUILTIN_CVTSI642SS); + def_builtin_const (MASK_SSE, "__builtin_ia32_cvtss2si", int_ftype_v4sf, IX86_BUILTIN_CVTSS2SI); + def_builtin_const (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvtss2si64", int64_ftype_v4sf, IX86_BUILTIN_CVTSS2SI64); + def_builtin_const (MASK_SSE, "__builtin_ia32_cvttps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTTPS2PI); + def_builtin_const (MASK_SSE, "__builtin_ia32_cvttss2si", int_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI); + def_builtin_const (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvttss2si64", int64_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI64); def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_maskmovq", void_ftype_v8qi_v8qi_pchar, IX86_BUILTIN_MASKMOVQ); @@ -15681,8 +17301,8 @@ ix86_init_mmx_sse_builtins (void) def_builtin (MASK_SSE, "__builtin_ia32_rcpss", v4sf_ftype_v4sf, IX86_BUILTIN_RCPSS); def_builtin (MASK_SSE, "__builtin_ia32_rsqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTPS); def_builtin (MASK_SSE, "__builtin_ia32_rsqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTSS); - def_builtin (MASK_SSE, "__builtin_ia32_sqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTPS); - def_builtin (MASK_SSE, "__builtin_ia32_sqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTSS); + def_builtin_const (MASK_SSE, "__builtin_ia32_sqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTPS); + def_builtin_const (MASK_SSE, "__builtin_ia32_sqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTSS); def_builtin (MASK_SSE, "__builtin_ia32_shufps", v4sf_ftype_v4sf_v4sf_int, IX86_BUILTIN_SHUFPS); @@ -15736,35 +17356,35 @@ ix86_init_mmx_sse_builtins (void) def_builtin (MASK_SSE2, "__builtin_ia32_pshufhw", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSHUFHW); def_builtin (MASK_SSE2, "__builtin_ia32_psadbw128", v2di_ftype_v16qi_v16qi, IX86_BUILTIN_PSADBW128); - def_builtin (MASK_SSE2, "__builtin_ia32_sqrtpd", v2df_ftype_v2df, IX86_BUILTIN_SQRTPD); - def_builtin (MASK_SSE2, "__builtin_ia32_sqrtsd", v2df_ftype_v2df, IX86_BUILTIN_SQRTSD); + def_builtin_const (MASK_SSE2, "__builtin_ia32_sqrtpd", v2df_ftype_v2df, IX86_BUILTIN_SQRTPD); + def_builtin_const (MASK_SSE2, "__builtin_ia32_sqrtsd", v2df_ftype_v2df, IX86_BUILTIN_SQRTSD); def_builtin (MASK_SSE2, "__builtin_ia32_shufpd", v2df_ftype_v2df_v2df_int, IX86_BUILTIN_SHUFPD); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtdq2pd", v2df_ftype_v4si, IX86_BUILTIN_CVTDQ2PD); - def_builtin 
(MASK_SSE2, "__builtin_ia32_cvtdq2ps", v4sf_ftype_v4si, IX86_BUILTIN_CVTDQ2PS); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtdq2pd", v2df_ftype_v4si, IX86_BUILTIN_CVTDQ2PD); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtdq2ps", v4sf_ftype_v4si, IX86_BUILTIN_CVTDQ2PS); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtpd2dq", v4si_ftype_v2df, IX86_BUILTIN_CVTPD2DQ); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtpd2pi", v2si_ftype_v2df, IX86_BUILTIN_CVTPD2PI); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtpd2ps", v4sf_ftype_v2df, IX86_BUILTIN_CVTPD2PS); - def_builtin (MASK_SSE2, "__builtin_ia32_cvttpd2dq", v4si_ftype_v2df, IX86_BUILTIN_CVTTPD2DQ); - def_builtin (MASK_SSE2, "__builtin_ia32_cvttpd2pi", v2si_ftype_v2df, IX86_BUILTIN_CVTTPD2PI); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpd2dq", v4si_ftype_v2df, IX86_BUILTIN_CVTPD2DQ); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpd2pi", v2si_ftype_v2df, IX86_BUILTIN_CVTPD2PI); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpd2ps", v4sf_ftype_v2df, IX86_BUILTIN_CVTPD2PS); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttpd2dq", v4si_ftype_v2df, IX86_BUILTIN_CVTTPD2DQ); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttpd2pi", v2si_ftype_v2df, IX86_BUILTIN_CVTTPD2PI); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtpi2pd", v2df_ftype_v2si, IX86_BUILTIN_CVTPI2PD); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpi2pd", v2df_ftype_v2si, IX86_BUILTIN_CVTPI2PD); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtsd2si", int_ftype_v2df, IX86_BUILTIN_CVTSD2SI); - def_builtin (MASK_SSE2, "__builtin_ia32_cvttsd2si", int_ftype_v2df, IX86_BUILTIN_CVTTSD2SI); - def_builtin (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvtsd2si64", int64_ftype_v2df, IX86_BUILTIN_CVTSD2SI64); - def_builtin (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvttsd2si64", int64_ftype_v2df, IX86_BUILTIN_CVTTSD2SI64); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtsd2si", int_ftype_v2df, IX86_BUILTIN_CVTSD2SI); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttsd2si", int_ftype_v2df, IX86_BUILTIN_CVTTSD2SI); + def_builtin_const (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvtsd2si64", int64_ftype_v2df, IX86_BUILTIN_CVTSD2SI64); + def_builtin_const (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvttsd2si64", int64_ftype_v2df, IX86_BUILTIN_CVTTSD2SI64); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtps2dq", v4si_ftype_v4sf, IX86_BUILTIN_CVTPS2DQ); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtps2pd", v2df_ftype_v4sf, IX86_BUILTIN_CVTPS2PD); - def_builtin (MASK_SSE2, "__builtin_ia32_cvttps2dq", v4si_ftype_v4sf, IX86_BUILTIN_CVTTPS2DQ); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtps2dq", v4si_ftype_v4sf, IX86_BUILTIN_CVTPS2DQ); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtps2pd", v2df_ftype_v4sf, IX86_BUILTIN_CVTPS2PD); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttps2dq", v4si_ftype_v4sf, IX86_BUILTIN_CVTTPS2DQ); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtsi2sd", v2df_ftype_v2df_int, IX86_BUILTIN_CVTSI2SD); - def_builtin (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvtsi642sd", v2df_ftype_v2df_int64, IX86_BUILTIN_CVTSI642SD); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtsd2ss", v4sf_ftype_v4sf_v2df, IX86_BUILTIN_CVTSD2SS); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtss2sd", v2df_ftype_v2df_v4sf, IX86_BUILTIN_CVTSS2SD); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtsi2sd", v2df_ftype_v2df_int, IX86_BUILTIN_CVTSI2SD); + def_builtin_const (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvtsi642sd", v2df_ftype_v2df_int64, IX86_BUILTIN_CVTSI642SD); + def_builtin_const 
(MASK_SSE2, "__builtin_ia32_cvtsd2ss", v4sf_ftype_v4sf_v2df, IX86_BUILTIN_CVTSD2SS); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtss2sd", v2df_ftype_v2df_v4sf, IX86_BUILTIN_CVTSS2SD); def_builtin (MASK_SSE2, "__builtin_ia32_clflush", void_ftype_pcvoid, IX86_BUILTIN_CLFLUSH); def_builtin (MASK_SSE2, "__builtin_ia32_lfence", void_ftype_void, IX86_BUILTIN_LFENCE); @@ -15776,29 +17396,26 @@ ix86_init_mmx_sse_builtins (void) def_builtin (MASK_SSE2, "__builtin_ia32_pmuludq", di_ftype_v2si_v2si, IX86_BUILTIN_PMULUDQ); def_builtin (MASK_SSE2, "__builtin_ia32_pmuludq128", v2di_ftype_v4si_v4si, IX86_BUILTIN_PMULUDQ128); - def_builtin (MASK_SSE2, "__builtin_ia32_psllw128", v8hi_ftype_v8hi_v2di, IX86_BUILTIN_PSLLW128); - def_builtin (MASK_SSE2, "__builtin_ia32_pslld128", v4si_ftype_v4si_v2di, IX86_BUILTIN_PSLLD128); - def_builtin (MASK_SSE2, "__builtin_ia32_psllq128", v2di_ftype_v2di_v2di, IX86_BUILTIN_PSLLQ128); - - def_builtin (MASK_SSE2, "__builtin_ia32_psrlw128", v8hi_ftype_v8hi_v2di, IX86_BUILTIN_PSRLW128); - def_builtin (MASK_SSE2, "__builtin_ia32_psrld128", v4si_ftype_v4si_v2di, IX86_BUILTIN_PSRLD128); - def_builtin (MASK_SSE2, "__builtin_ia32_psrlq128", v2di_ftype_v2di_v2di, IX86_BUILTIN_PSRLQ128); - - def_builtin (MASK_SSE2, "__builtin_ia32_psraw128", v8hi_ftype_v8hi_v2di, IX86_BUILTIN_PSRAW128); - def_builtin (MASK_SSE2, "__builtin_ia32_psrad128", v4si_ftype_v4si_v2di, IX86_BUILTIN_PSRAD128); - def_builtin (MASK_SSE2, "__builtin_ia32_pslldqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSLLDQI128); def_builtin (MASK_SSE2, "__builtin_ia32_psllwi128", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSLLWI128); def_builtin (MASK_SSE2, "__builtin_ia32_pslldi128", v4si_ftype_v4si_int, IX86_BUILTIN_PSLLDI128); def_builtin (MASK_SSE2, "__builtin_ia32_psllqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSLLQI128); + def_builtin (MASK_SSE2, "__builtin_ia32_psllw128", v8hi_ftype_v8hi_v8hi, IX86_BUILTIN_PSLLW128); + def_builtin (MASK_SSE2, "__builtin_ia32_pslld128", v4si_ftype_v4si_v4si, IX86_BUILTIN_PSLLD128); + def_builtin (MASK_SSE2, "__builtin_ia32_psllq128", v2di_ftype_v2di_v2di, IX86_BUILTIN_PSLLQ128); def_builtin (MASK_SSE2, "__builtin_ia32_psrldqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSRLDQI128); def_builtin (MASK_SSE2, "__builtin_ia32_psrlwi128", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSRLWI128); def_builtin (MASK_SSE2, "__builtin_ia32_psrldi128", v4si_ftype_v4si_int, IX86_BUILTIN_PSRLDI128); def_builtin (MASK_SSE2, "__builtin_ia32_psrlqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSRLQI128); + def_builtin (MASK_SSE2, "__builtin_ia32_psrlw128", v8hi_ftype_v8hi_v8hi, IX86_BUILTIN_PSRLW128); + def_builtin (MASK_SSE2, "__builtin_ia32_psrld128", v4si_ftype_v4si_v4si, IX86_BUILTIN_PSRLD128); + def_builtin (MASK_SSE2, "__builtin_ia32_psrlq128", v2di_ftype_v2di_v2di, IX86_BUILTIN_PSRLQ128); def_builtin (MASK_SSE2, "__builtin_ia32_psrawi128", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSRAWI128); def_builtin (MASK_SSE2, "__builtin_ia32_psradi128", v4si_ftype_v4si_int, IX86_BUILTIN_PSRADI128); + def_builtin (MASK_SSE2, "__builtin_ia32_psraw128", v8hi_ftype_v8hi_v8hi, IX86_BUILTIN_PSRAW128); + def_builtin (MASK_SSE2, "__builtin_ia32_psrad128", v4si_ftype_v4si_v4si, IX86_BUILTIN_PSRAD128); def_builtin (MASK_SSE2, "__builtin_ia32_pmaddwd128", v4si_ftype_v8hi_v8hi, IX86_BUILTIN_PMADDWD128); @@ -15809,12 +17426,6 @@ ix86_init_mmx_sse_builtins (void) def_builtin (MASK_SSE3, "__builtin_ia32_mwait", void_ftype_unsigned_unsigned, IX86_BUILTIN_MWAIT); - def_builtin (MASK_SSE3, "__builtin_ia32_movshdup", - v4sf_ftype_v4sf, - IX86_BUILTIN_MOVSHDUP); 
- def_builtin (MASK_SSE3, "__builtin_ia32_movsldup", - v4sf_ftype_v4sf, - IX86_BUILTIN_MOVSLDUP); def_builtin (MASK_SSE3, "__builtin_ia32_lddqu", v16qi_ftype_pcchar, IX86_BUILTIN_LDDQU); @@ -15824,6 +17435,20 @@ ix86_init_mmx_sse_builtins (void) def_builtin (MASK_SSSE3, "__builtin_ia32_palignr", di_ftype_di_di_int, IX86_BUILTIN_PALIGNR); + /* AMDFAM10 SSE4A New built-ins */ + def_builtin (MASK_SSE4A, "__builtin_ia32_movntsd", + void_ftype_pdouble_v2df, IX86_BUILTIN_MOVNTSD); + def_builtin (MASK_SSE4A, "__builtin_ia32_movntss", + void_ftype_pfloat_v4sf, IX86_BUILTIN_MOVNTSS); + def_builtin (MASK_SSE4A, "__builtin_ia32_extrqi", + v2di_ftype_v2di_unsigned_unsigned, IX86_BUILTIN_EXTRQI); + def_builtin (MASK_SSE4A, "__builtin_ia32_extrq", + v2di_ftype_v2di_v16qi, IX86_BUILTIN_EXTRQ); + def_builtin (MASK_SSE4A, "__builtin_ia32_insertqi", + v2di_ftype_v2di_v2di_unsigned_unsigned, IX86_BUILTIN_INSERTQI); + def_builtin (MASK_SSE4A, "__builtin_ia32_insertq", + v2di_ftype_v2di_v2di, IX86_BUILTIN_INSERTQ); + /* Access to the vec_init patterns. */ ftype = build_function_type_list (V2SI_type_node, integer_type_node, integer_type_node, NULL_TREE); @@ -15896,6 +17521,13 @@ ix86_init_mmx_sse_builtins (void) ftype, IX86_BUILTIN_VEC_SET_V4HI); } +static void +ix86_init_builtins (void) +{ + if (TARGET_MMX) + ix86_init_mmx_sse_builtins (); +} + /* Errors in the source file can cause expand_expr to return const0_rtx where we expect a vector. To avoid crashing, use one of the vector clear instructions. */ @@ -15910,11 +17542,11 @@ safe_vector_operand (rtx x, enum machine_mode mode) /* Subroutine of ix86_expand_builtin to take care of binop insns. */ static rtx -ix86_expand_binop_builtin (enum insn_code icode, tree arglist, rtx target) +ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target) { rtx pat, xops[3]; - tree arg0 = TREE_VALUE (arglist); - tree arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); rtx op0 = expand_normal (arg0); rtx op1 = expand_normal (arg1); enum machine_mode tmode = insn_data[icode].operand[0].mode; @@ -15978,11 +17610,11 @@ ix86_expand_binop_builtin (enum insn_code icode, tree arglist, rtx target) /* Subroutine of ix86_expand_builtin to take care of stores. */ static rtx -ix86_expand_store_builtin (enum insn_code icode, tree arglist) +ix86_expand_store_builtin (enum insn_code icode, tree exp) { rtx pat; - tree arg0 = TREE_VALUE (arglist); - tree arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); rtx op0 = expand_normal (arg0); rtx op1 = expand_normal (arg1); enum machine_mode mode0 = insn_data[icode].operand[0].mode; @@ -16003,11 +17635,11 @@ ix86_expand_store_builtin (enum insn_code icode, tree arglist) /* Subroutine of ix86_expand_builtin to take care of unop insns. */ static rtx -ix86_expand_unop_builtin (enum insn_code icode, tree arglist, +ix86_expand_unop_builtin (enum insn_code icode, tree exp, rtx target, int do_load) { rtx pat; - tree arg0 = TREE_VALUE (arglist); + tree arg0 = CALL_EXPR_ARG (exp, 0); rtx op0 = expand_normal (arg0); enum machine_mode tmode = insn_data[icode].operand[0].mode; enum machine_mode mode0 = insn_data[icode].operand[1].mode; @@ -16039,10 +17671,10 @@ ix86_expand_unop_builtin (enum insn_code icode, tree arglist, sqrtss, rsqrtss, rcpss. 
@@ -16039,10 +17671,10 @@ ix86_expand_unop_builtin (enum insn_code icode, tree exp, sqrtss, rsqrtss, rcpss. */ static rtx -ix86_expand_unop1_builtin (enum insn_code icode, tree arglist, rtx target) +ix86_expand_unop1_builtin (enum insn_code icode, tree exp, rtx target) { rtx pat; - tree arg0 = TREE_VALUE (arglist); + tree arg0 = CALL_EXPR_ARG (exp, 0); rtx op1, op0 = expand_normal (arg0); enum machine_mode tmode = insn_data[icode].operand[0].mode; enum machine_mode mode0 = insn_data[icode].operand[1].mode; @@ -16073,12 +17705,12 @@ ix86_expand_unop1_builtin (enum insn_code icode, tree arglist, rtx target) /* Subroutine of ix86_expand_builtin to take care of comparison insns. */ static rtx -ix86_expand_sse_compare (const struct builtin_description *d, tree arglist, +ix86_expand_sse_compare (const struct builtin_description *d, tree exp, rtx target) { rtx pat; - tree arg0 = TREE_VALUE (arglist); - tree arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); rtx op0 = expand_normal (arg0); rtx op1 = expand_normal (arg1); rtx op2; @@ -16125,12 +17757,12 @@ ix86_expand_sse_compare (const struct builtin_description *d, tree arglist, /* Subroutine of ix86_expand_builtin to take care of comi insns. */ static rtx -ix86_expand_sse_comi (const struct builtin_description *d, tree arglist, +ix86_expand_sse_comi (const struct builtin_description *d, tree exp, rtx target) { rtx pat; - tree arg0 = TREE_VALUE (arglist); - tree arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); rtx op0 = expand_normal (arg0); rtx op1 = expand_normal (arg1); rtx op2; @@ -16205,7 +17837,7 @@ get_element_number (tree vec_type, tree arg) these sorts of instructions. */ static rtx -ix86_expand_vec_init_builtin (tree type, tree arglist, rtx target) +ix86_expand_vec_init_builtin (tree type, tree exp, rtx target) { enum machine_mode tmode = TYPE_MODE (type); enum machine_mode inner_mode = GET_MODE_INNER (tmode); @@ -16213,15 +17845,14 @@ rtvec v = rtvec_alloc (n_elt); gcc_assert (VECTOR_MODE_P (tmode)); + gcc_assert (call_expr_nargs (exp) == n_elt); - for (i = 0; i < n_elt; ++i, arglist = TREE_CHAIN (arglist)) + for (i = 0; i < n_elt; ++i) { - rtx x = expand_normal (TREE_VALUE (arglist)); + rtx x = expand_normal (CALL_EXPR_ARG (exp, i)); RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x); } - gcc_assert (arglist == NULL); - if (!target || !register_operand (target, tmode)) target = gen_reg_rtx (tmode); @@ -16234,15 +17865,15 @@ had a language-level syntax for referencing vector elements. */ static rtx -ix86_expand_vec_ext_builtin (tree arglist, rtx target) +ix86_expand_vec_ext_builtin (tree exp, rtx target) { enum machine_mode tmode, mode0; tree arg0, arg1; int elt; rtx op0; - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); op0 = expand_normal (arg0); elt = get_element_number (TREE_TYPE (arg0), arg1); @@ -16266,16 +17897,16 @@ ix86_expand_vec_ext_builtin (tree arglist, rtx target) a language-level syntax for referencing vector elements.
*/ static rtx -ix86_expand_vec_set_builtin (tree arglist) +ix86_expand_vec_set_builtin (tree exp) { enum machine_mode tmode, mode1; tree arg0, arg1, arg2; int elt; - rtx op0, op1; + rtx op0, op1, target; - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); - arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist))); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); tmode = TYPE_MODE (TREE_TYPE (arg0)); mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); @@ -16291,9 +17922,13 @@ ix86_expand_vec_set_builtin (tree arglist) op0 = force_reg (tmode, op0); op1 = force_reg (mode1, op1); - ix86_expand_vector_set (true, op0, op1, elt); + /* OP0 is the source of these builtin functions and shouldn't be + modified.  Create a copy, use the copy, and return it as the target.  */ + target = gen_reg_rtx (tmode); + emit_move_insn (target, op0); + ix86_expand_vector_set (true, target, op1, elt); - return op0; + return target; } /* Expand an expression EXP that calls a built-in function, @@ -16310,11 +17945,10 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, const struct builtin_description *d; size_t i; enum insn_code icode; - tree fndecl = TREE_OPERAND (TREE_OPERAND (exp, 0), 0); - tree arglist = TREE_OPERAND (exp, 1); - tree arg0, arg1, arg2; - rtx op0, op1, op2, pat; - enum machine_mode tmode, mode0, mode1, mode2, mode3; + tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); + tree arg0, arg1, arg2, arg3; + rtx op0, op1, op2, op3, pat; + enum machine_mode tmode, mode0, mode1, mode2, mode3, mode4; unsigned int fcode = DECL_FUNCTION_CODE (fndecl); switch (fcode) @@ -16333,9 +17967,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, ? CODE_FOR_mmx_maskmovq : CODE_FOR_sse2_maskmovdqu); /* Note the arg order is different from the operand order. */ - arg1 = TREE_VALUE (arglist); - arg2 = TREE_VALUE (TREE_CHAIN (arglist)); - arg0 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist))); + arg1 = CALL_EXPR_ARG (exp, 0); + arg2 = CALL_EXPR_ARG (exp, 1); + arg0 = CALL_EXPR_ARG (exp, 2); op0 = expand_normal (arg0); op1 = expand_normal (arg1); op2 = expand_normal (arg2); @@ -16359,17 +17993,17 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, return 0; case IX86_BUILTIN_SQRTSS: - return ix86_expand_unop1_builtin (CODE_FOR_sse_vmsqrtv4sf2, arglist, target); + return ix86_expand_unop1_builtin (CODE_FOR_sse_vmsqrtv4sf2, exp, target); case IX86_BUILTIN_RSQRTSS: - return ix86_expand_unop1_builtin (CODE_FOR_sse_vmrsqrtv4sf2, arglist, target); + return ix86_expand_unop1_builtin (CODE_FOR_sse_vmrsqrtv4sf2, exp, target); case IX86_BUILTIN_RCPSS: - return ix86_expand_unop1_builtin (CODE_FOR_sse_vmrcpv4sf2, arglist, target); + return ix86_expand_unop1_builtin (CODE_FOR_sse_vmrcpv4sf2, exp, target); case IX86_BUILTIN_LOADUPS: - return ix86_expand_unop_builtin (CODE_FOR_sse_movups, arglist, target, 1); + return ix86_expand_unop_builtin (CODE_FOR_sse_movups, exp, target, 1); case IX86_BUILTIN_STOREUPS: - return ix86_expand_store_builtin (CODE_FOR_sse_movups, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse_movups, exp); case IX86_BUILTIN_LOADHPS: case IX86_BUILTIN_LOADLPS: @@ -16379,8 +18013,8 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, : fcode == IX86_BUILTIN_LOADLPS ? CODE_FOR_sse_loadlps : fcode == IX86_BUILTIN_LOADHPD ?
CODE_FOR_sse2_loadhpd : CODE_FOR_sse2_loadlpd); - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); op0 = expand_normal (arg0); op1 = expand_normal (arg1); tmode = insn_data[icode].operand[0].mode; @@ -16403,8 +18037,8 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, case IX86_BUILTIN_STORELPS: icode = (fcode == IX86_BUILTIN_STOREHPS ? CODE_FOR_sse_storehps : CODE_FOR_sse_storelps); - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); op0 = expand_normal (arg0); op1 = expand_normal (arg1); mode0 = insn_data[icode].operand[0].mode; @@ -16420,12 +18054,12 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, return const0_rtx; case IX86_BUILTIN_MOVNTPS: - return ix86_expand_store_builtin (CODE_FOR_sse_movntv4sf, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse_movntv4sf, exp); case IX86_BUILTIN_MOVNTQ: - return ix86_expand_store_builtin (CODE_FOR_sse_movntdi, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse_movntdi, exp); case IX86_BUILTIN_LDMXCSR: - op0 = expand_normal (TREE_VALUE (arglist)); + op0 = expand_normal (CALL_EXPR_ARG (exp, 0)); target = assign_386_stack_local (SImode, SLOT_TEMP); emit_move_insn (target, op0); emit_insn (gen_sse_ldmxcsr (target)); @@ -16441,9 +18075,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, icode = (fcode == IX86_BUILTIN_SHUFPS ? CODE_FOR_sse_shufps : CODE_FOR_sse2_shufpd); - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); - arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist))); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); op0 = expand_normal (arg0); op1 = expand_normal (arg1); op2 = expand_normal (arg2); @@ -16481,8 +18115,8 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, : fcode == IX86_BUILTIN_PSHUFLW ? CODE_FOR_sse2_pshuflw : fcode == IX86_BUILTIN_PSHUFD ? 
CODE_FOR_sse2_pshufd : CODE_FOR_mmx_pshufw); - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); op0 = expand_normal (arg0); op1 = expand_normal (arg1); tmode = insn_data[icode].operand[0].mode; @@ -16507,12 +18141,109 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, emit_insn (pat); return target; + case IX86_BUILTIN_PSLLWI128: + icode = CODE_FOR_ashlv8hi3; + goto do_pshifti; + case IX86_BUILTIN_PSLLDI128: + icode = CODE_FOR_ashlv4si3; + goto do_pshifti; + case IX86_BUILTIN_PSLLQI128: + icode = CODE_FOR_ashlv2di3; + goto do_pshifti; + case IX86_BUILTIN_PSRAWI128: + icode = CODE_FOR_ashrv8hi3; + goto do_pshifti; + case IX86_BUILTIN_PSRADI128: + icode = CODE_FOR_ashrv4si3; + goto do_pshifti; + case IX86_BUILTIN_PSRLWI128: + icode = CODE_FOR_lshrv8hi3; + goto do_pshifti; + case IX86_BUILTIN_PSRLDI128: + icode = CODE_FOR_lshrv4si3; + goto do_pshifti; + case IX86_BUILTIN_PSRLQI128: + icode = CODE_FOR_lshrv2di3; + goto do_pshifti; + do_pshifti: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + + if (!CONST_INT_P (op1)) + { + error ("shift must be an immediate"); + return const0_rtx; + } + if (INTVAL (op1) < 0 || INTVAL (op1) > 255) + op1 = GEN_INT (255); + + tmode = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + if (! (*insn_data[icode].operand[1].predicate) (op0, mode1)) + op0 = copy_to_reg (op0); + + target = gen_reg_rtx (tmode); + pat = GEN_FCN (icode) (target, op0, op1); + if (!pat) + return 0; + emit_insn (pat); + return target; + + case IX86_BUILTIN_PSLLW128: + icode = CODE_FOR_ashlv8hi3; + goto do_pshift; + case IX86_BUILTIN_PSLLD128: + icode = CODE_FOR_ashlv4si3; + goto do_pshift; + case IX86_BUILTIN_PSLLQ128: + icode = CODE_FOR_ashlv2di3; + goto do_pshift; + case IX86_BUILTIN_PSRAW128: + icode = CODE_FOR_ashrv8hi3; + goto do_pshift; + case IX86_BUILTIN_PSRAD128: + icode = CODE_FOR_ashrv4si3; + goto do_pshift; + case IX86_BUILTIN_PSRLW128: + icode = CODE_FOR_lshrv8hi3; + goto do_pshift; + case IX86_BUILTIN_PSRLD128: + icode = CODE_FOR_lshrv4si3; + goto do_pshift; + case IX86_BUILTIN_PSRLQ128: + icode = CODE_FOR_lshrv2di3; + goto do_pshift; + do_pshift: + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + + tmode = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + + if (! (*insn_data[icode].operand[1].predicate) (op0, mode1)) + op0 = copy_to_reg (op0); + + op1 = simplify_gen_subreg (TImode, op1, GET_MODE (op1), 0); + if (! (*insn_data[icode].operand[2].predicate) (op1, TImode)) + op1 = copy_to_reg (op1); + + target = gen_reg_rtx (tmode); + pat = GEN_FCN (icode) (target, op0, op1); + if (!pat) + return 0; + emit_insn (pat); + return target; +
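The two label chains above split the 128-bit shifts into separate expansion paths: do_pshifti serves the *i128 builtins, which require a CONST_INT count (counts outside 0..255 are clamped to 255, preserving the all-bits-shifted-out behavior of an over-large count), while do_pshift serves the register-count forms whose v8hi/v4si/v2di signatures were re-registered earlier in ix86_init_mmx_sse_builtins. A usage sketch of the user-visible contract, assuming SSE2; the function name is invented:

#include <emmintrin.h>

__m128i
shift_both_ways (__m128i x, __m128i count)
{
  /* Immediate form: the count must be a compile-time constant, or
     the expander reports "shift must be an immediate".  */
  __m128i by_three = (__m128i) __builtin_ia32_psllwi128 ((__v8hi) x, 3);

  /* Register form: the count is read from an XMM register at run
     time, matching the new v8hi_ftype_v8hi_v8hi signature.  */
  __m128i by_count = (__m128i) __builtin_ia32_psllw128 ((__v8hi) x,
                                                        (__v8hi) count);
  return _mm_add_epi16 (by_three, by_count);
}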
case IX86_BUILTIN_PSLLDQI128: case IX86_BUILTIN_PSRLDQI128: - icode = ( fcode == IX86_BUILTIN_PSLLDQI128 ? CODE_FOR_sse2_ashlti3 + icode = (fcode == IX86_BUILTIN_PSLLDQI128 ? CODE_FOR_sse2_ashlti3 : CODE_FOR_sse2_lshrti3); - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); op0 = expand_normal (arg0); op1 = expand_normal (arg1); tmode = insn_data[icode].operand[0].mode; @@ -16530,7 +18261,8 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, return const0_rtx; } target = gen_reg_rtx (V2DImode); - pat = GEN_FCN (icode) (simplify_gen_subreg (tmode, target, V2DImode, 0), op0, op1); + pat = GEN_FCN (icode) (simplify_gen_subreg (tmode, target, V2DImode, 0), + op0, op1); if (! pat) return 0; emit_insn (pat); @@ -16541,86 +18273,86 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, return NULL_RTX; case IX86_BUILTIN_PAVGUSB: - return ix86_expand_binop_builtin (CODE_FOR_mmx_uavgv8qi3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_uavgv8qi3, exp, target); case IX86_BUILTIN_PF2ID: - return ix86_expand_unop_builtin (CODE_FOR_mmx_pf2id, arglist, target, 0); + return ix86_expand_unop_builtin (CODE_FOR_mmx_pf2id, exp, target, 0); case IX86_BUILTIN_PFACC: - return ix86_expand_binop_builtin (CODE_FOR_mmx_haddv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_haddv2sf3, exp, target); case IX86_BUILTIN_PFADD: - return ix86_expand_binop_builtin (CODE_FOR_mmx_addv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_addv2sf3, exp, target); case IX86_BUILTIN_PFCMPEQ: - return ix86_expand_binop_builtin (CODE_FOR_mmx_eqv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_eqv2sf3, exp, target); case IX86_BUILTIN_PFCMPGE: - return ix86_expand_binop_builtin (CODE_FOR_mmx_gev2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_gev2sf3, exp, target); case IX86_BUILTIN_PFCMPGT: - return ix86_expand_binop_builtin (CODE_FOR_mmx_gtv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_gtv2sf3, exp, target); case IX86_BUILTIN_PFMAX: - return ix86_expand_binop_builtin (CODE_FOR_mmx_smaxv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_smaxv2sf3, exp, target); case IX86_BUILTIN_PFMIN: - return ix86_expand_binop_builtin (CODE_FOR_mmx_sminv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_sminv2sf3, exp, target); case IX86_BUILTIN_PFMUL: - return ix86_expand_binop_builtin (CODE_FOR_mmx_mulv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_mulv2sf3, exp, target); case IX86_BUILTIN_PFRCP: - return ix86_expand_unop_builtin (CODE_FOR_mmx_rcpv2sf2, arglist, target, 0); + return ix86_expand_unop_builtin (CODE_FOR_mmx_rcpv2sf2, exp, target, 0); case IX86_BUILTIN_PFRCPIT1: - return ix86_expand_binop_builtin (CODE_FOR_mmx_rcpit1v2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_rcpit1v2sf3, exp, target); case IX86_BUILTIN_PFRCPIT2: - return ix86_expand_binop_builtin (CODE_FOR_mmx_rcpit2v2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_rcpit2v2sf3, exp, target); case IX86_BUILTIN_PFRSQIT1: - return ix86_expand_binop_builtin (CODE_FOR_mmx_rsqit1v2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_rsqit1v2sf3, exp, target); case IX86_BUILTIN_PFRSQRT: - return ix86_expand_unop_builtin (CODE_FOR_mmx_rsqrtv2sf2, arglist, target, 0); + return ix86_expand_unop_builtin (CODE_FOR_mmx_rsqrtv2sf2, exp, target, 0); case IX86_BUILTIN_PFSUB: - return ix86_expand_binop_builtin (CODE_FOR_mmx_subv2sf3, arglist, target); + return
ix86_expand_binop_builtin (CODE_FOR_mmx_subv2sf3, exp, target); case IX86_BUILTIN_PFSUBR: - return ix86_expand_binop_builtin (CODE_FOR_mmx_subrv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_subrv2sf3, exp, target); case IX86_BUILTIN_PI2FD: - return ix86_expand_unop_builtin (CODE_FOR_mmx_floatv2si2, arglist, target, 0); + return ix86_expand_unop_builtin (CODE_FOR_mmx_floatv2si2, exp, target, 0); case IX86_BUILTIN_PMULHRW: - return ix86_expand_binop_builtin (CODE_FOR_mmx_pmulhrwv4hi3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_pmulhrwv4hi3, exp, target); case IX86_BUILTIN_PF2IW: - return ix86_expand_unop_builtin (CODE_FOR_mmx_pf2iw, arglist, target, 0); + return ix86_expand_unop_builtin (CODE_FOR_mmx_pf2iw, exp, target, 0); case IX86_BUILTIN_PFNACC: - return ix86_expand_binop_builtin (CODE_FOR_mmx_hsubv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_hsubv2sf3, exp, target); case IX86_BUILTIN_PFPNACC: - return ix86_expand_binop_builtin (CODE_FOR_mmx_addsubv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_addsubv2sf3, exp, target); case IX86_BUILTIN_PI2FW: - return ix86_expand_unop_builtin (CODE_FOR_mmx_pi2fw, arglist, target, 0); + return ix86_expand_unop_builtin (CODE_FOR_mmx_pi2fw, exp, target, 0); case IX86_BUILTIN_PSWAPDSI: - return ix86_expand_unop_builtin (CODE_FOR_mmx_pswapdv2si2, arglist, target, 0); + return ix86_expand_unop_builtin (CODE_FOR_mmx_pswapdv2si2, exp, target, 0); case IX86_BUILTIN_PSWAPDSF: - return ix86_expand_unop_builtin (CODE_FOR_mmx_pswapdv2sf2, arglist, target, 0); + return ix86_expand_unop_builtin (CODE_FOR_mmx_pswapdv2sf2, exp, target, 0); case IX86_BUILTIN_SQRTSD: - return ix86_expand_unop1_builtin (CODE_FOR_sse2_vmsqrtv2df2, arglist, target); + return ix86_expand_unop1_builtin (CODE_FOR_sse2_vmsqrtv2df2, exp, target); case IX86_BUILTIN_LOADUPD: - return ix86_expand_unop_builtin (CODE_FOR_sse2_movupd, arglist, target, 1); + return ix86_expand_unop_builtin (CODE_FOR_sse2_movupd, exp, target, 1); case IX86_BUILTIN_STOREUPD: - return ix86_expand_store_builtin (CODE_FOR_sse2_movupd, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse2_movupd, exp); case IX86_BUILTIN_MFENCE: emit_insn (gen_sse2_mfence ()); @@ -16630,7 +18362,7 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, return 0; case IX86_BUILTIN_CLFLUSH: - arg0 = TREE_VALUE (arglist); + arg0 = CALL_EXPR_ARG (exp, 0); op0 = expand_normal (arg0); icode = CODE_FOR_sse2_clflush; if (! 
(*insn_data[icode].operand[0].predicate) (op0, Pmode)) @@ -16640,21 +18372,21 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, return 0; case IX86_BUILTIN_MOVNTPD: - return ix86_expand_store_builtin (CODE_FOR_sse2_movntv2df, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse2_movntv2df, exp); case IX86_BUILTIN_MOVNTDQ: - return ix86_expand_store_builtin (CODE_FOR_sse2_movntv2di, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse2_movntv2di, exp); case IX86_BUILTIN_MOVNTI: - return ix86_expand_store_builtin (CODE_FOR_sse2_movntsi, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse2_movntsi, exp); case IX86_BUILTIN_LOADDQU: - return ix86_expand_unop_builtin (CODE_FOR_sse2_movdqu, arglist, target, 1); + return ix86_expand_unop_builtin (CODE_FOR_sse2_movdqu, exp, target, 1); case IX86_BUILTIN_STOREDQU: - return ix86_expand_store_builtin (CODE_FOR_sse2_movdqu, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse2_movdqu, exp); case IX86_BUILTIN_MONITOR: - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); - arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist))); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); op0 = expand_normal (arg0); op1 = expand_normal (arg1); op2 = expand_normal (arg2); @@ -16671,8 +18403,8 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, return 0; case IX86_BUILTIN_MWAIT: - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); op0 = expand_normal (arg0); op1 = expand_normal (arg1); if (!REG_P (op0)) @@ -16683,7 +18415,7 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, return 0; case IX86_BUILTIN_LDDQU: - return ix86_expand_unop_builtin (CODE_FOR_sse3_lddqu, arglist, + return ix86_expand_unop_builtin (CODE_FOR_sse3_lddqu, exp, target, 1); case IX86_BUILTIN_PALIGNR: @@ -16698,9 +18430,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, icode = CODE_FOR_ssse3_palignrti; mode = V2DImode; } - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); - arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist))); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); op0 = expand_expr (arg0, NULL_RTX, VOIDmode, 0); op1 = expand_expr (arg1, NULL_RTX, VOIDmode, 0); op2 = expand_expr (arg2, NULL_RTX, VOIDmode, 0); @@ -16732,10 +18464,118 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, emit_insn (pat); return target; + case IX86_BUILTIN_MOVNTSD: + return ix86_expand_store_builtin (CODE_FOR_sse4a_vmmovntv2df, exp); + + case IX86_BUILTIN_MOVNTSS: + return ix86_expand_store_builtin (CODE_FOR_sse4a_vmmovntv4sf, exp); + + case IX86_BUILTIN_INSERTQ: + case IX86_BUILTIN_EXTRQ: + icode = (fcode == IX86_BUILTIN_EXTRQ + ? CODE_FOR_sse4a_extrq + : CODE_FOR_sse4a_insertq); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + tmode = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[2].mode; + if (! (*insn_data[icode].operand[1].predicate) (op0, mode1)) + op0 = copy_to_mode_reg (mode1, op0); + if (! (*insn_data[icode].operand[2].predicate) (op1, mode2)) + op1 = copy_to_mode_reg (mode2, op1); + if (optimize || target == 0 + || GET_MODE (target) != tmode + || ! 
(*insn_data[icode].operand[0].predicate) (target, tmode)) + target = gen_reg_rtx (tmode); + pat = GEN_FCN (icode) (target, op0, op1); + if (! pat) + return NULL_RTX; + emit_insn (pat); + return target; + + case IX86_BUILTIN_EXTRQI: + icode = CODE_FOR_sse4a_extrqi; + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + tmode = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[2].mode; + mode3 = insn_data[icode].operand[3].mode; + if (! (*insn_data[icode].operand[1].predicate) (op0, mode1)) + op0 = copy_to_mode_reg (mode1, op0); + if (! (*insn_data[icode].operand[2].predicate) (op1, mode2)) + { + error ("index mask must be an immediate"); + return gen_reg_rtx (tmode); + } + if (! (*insn_data[icode].operand[3].predicate) (op2, mode3)) + { + error ("length mask must be an immediate"); + return gen_reg_rtx (tmode); + } + if (optimize || target == 0 + || GET_MODE (target) != tmode + || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) + target = gen_reg_rtx (tmode); + pat = GEN_FCN (icode) (target, op0, op1, op2); + if (! pat) + return NULL_RTX; + emit_insn (pat); + return target; + + case IX86_BUILTIN_INSERTQI: + icode = CODE_FOR_sse4a_insertqi; + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + arg3 = CALL_EXPR_ARG (exp, 3); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + op3 = expand_normal (arg3); + tmode = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[2].mode; + mode3 = insn_data[icode].operand[3].mode; + mode4 = insn_data[icode].operand[4].mode; + + if (! (*insn_data[icode].operand[1].predicate) (op0, mode1)) + op0 = copy_to_mode_reg (mode1, op0); + + if (! (*insn_data[icode].operand[2].predicate) (op1, mode2)) + op1 = copy_to_mode_reg (mode2, op1); + + if (! (*insn_data[icode].operand[3].predicate) (op2, mode3)) + { + error ("index mask must be an immediate"); + return gen_reg_rtx (tmode); + } + if (! (*insn_data[icode].operand[4].predicate) (op3, mode4)) + { + error ("length mask must be an immediate"); + return gen_reg_rtx (tmode); + } + if (optimize || target == 0 + || GET_MODE (target) != tmode + || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) + target = gen_reg_rtx (tmode); + pat = GEN_FCN (icode) (target, op0, op1, op2, op3); + if (! pat) + return NULL_RTX; + emit_insn (pat); + return target; +
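The immediate SSE4A cases above mirror the instruction encoding: EXTRQI and INSERTQI carry their two masks in 8-bit immediate fields, so non-constant masks are rejected at compile time rather than spilled to a register. A hedged sketch of what that means at the source level, built on the __builtin_ia32_extrqi registration earlier in the patch (compile with -msse4a; the wrapper macro is invented, the argument naming simply follows the diagnostics above, which tie the first mask operand to the index and the second to the length, and GCC's shipped SSE4A intrinsics in ammintrin.h wrap these same builtins):

#include <emmintrin.h>

/* Both mask arguments must be integer constant expressions;
   otherwise expansion fails with "index mask must be an immediate"
   or "length mask must be an immediate".  */
#define extract_quad_field(x, index, length) \
  ((__m128i) __builtin_ia32_extrqi ((__v2di) (x), (index), (length)))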
case IX86_BUILTIN_VEC_INIT_V2SI: case IX86_BUILTIN_VEC_INIT_V4HI: case IX86_BUILTIN_VEC_INIT_V8QI: - return ix86_expand_vec_init_builtin (TREE_TYPE (exp), arglist, target); + return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target); case IX86_BUILTIN_VEC_EXT_V2DF: case IX86_BUILTIN_VEC_EXT_V2DI: @@ -16744,11 +18584,11 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, case IX86_BUILTIN_VEC_EXT_V4SF: case IX86_BUILTIN_VEC_EXT_V8HI: case IX86_BUILTIN_VEC_EXT_V2SI: case IX86_BUILTIN_VEC_EXT_V4HI: - return ix86_expand_vec_ext_builtin (arglist, target); + return ix86_expand_vec_ext_builtin (exp, target); case IX86_BUILTIN_VEC_SET_V8HI: case IX86_BUILTIN_VEC_SET_V4HI: - return ix86_expand_vec_set_builtin (arglist); + return ix86_expand_vec_set_builtin (exp); default: break; @@ -16762,22 +18602,103 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, || d->icode == CODE_FOR_sse_vmmaskcmpv4sf3 || d->icode == CODE_FOR_sse2_maskcmpv2df3 || d->icode == CODE_FOR_sse2_vmmaskcmpv2df3) - return ix86_expand_sse_compare (d, arglist, target); + return ix86_expand_sse_compare (d, exp, target); - return ix86_expand_binop_builtin (d->icode, arglist, target); + return ix86_expand_binop_builtin (d->icode, exp, target); } for (i = 0, d = bdesc_1arg; i < ARRAY_SIZE (bdesc_1arg); i++, d++) if (d->code == fcode) - return ix86_expand_unop_builtin (d->icode, arglist, target, 0); + return ix86_expand_unop_builtin (d->icode, exp, target, 0); for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++) if (d->code == fcode) - return ix86_expand_sse_comi (d, arglist, target); + return ix86_expand_sse_comi (d, exp, target); gcc_unreachable (); } +/* Returns a function decl for a vectorized version of the builtin function + with builtin function code FN, result vector type TYPE_OUT and argument + vector type TYPE_IN, or NULL_TREE if it is not available. */ + +static tree +ix86_builtin_vectorized_function (enum built_in_function fn, tree type_out, + tree type_in) +{ + enum machine_mode in_mode, out_mode; + int in_n, out_n; + + if (TREE_CODE (type_out) != VECTOR_TYPE + || TREE_CODE (type_in) != VECTOR_TYPE) + return NULL_TREE; + + out_mode = TYPE_MODE (TREE_TYPE (type_out)); + out_n = TYPE_VECTOR_SUBPARTS (type_out); + in_mode = TYPE_MODE (TREE_TYPE (type_in)); + in_n = TYPE_VECTOR_SUBPARTS (type_in); + + switch (fn) + { + case BUILT_IN_SQRT: + if (out_mode == DFmode && out_n == 2 + && in_mode == DFmode && in_n == 2) + return ix86_builtins[IX86_BUILTIN_SQRTPD]; + return NULL_TREE; + + case BUILT_IN_SQRTF: + if (out_mode == SFmode && out_n == 4 + && in_mode == SFmode && in_n == 4) + return ix86_builtins[IX86_BUILTIN_SQRTPS]; + return NULL_TREE; + + case BUILT_IN_LRINTF: + if (out_mode == SImode && out_n == 4 + && in_mode == SFmode && in_n == 4) + return ix86_builtins[IX86_BUILTIN_CVTPS2DQ]; + return NULL_TREE; + + default: + ; + } + + return NULL_TREE; +} + +/* Returns a decl of a function that implements conversion of the + input vector of type TYPE, or NULL_TREE if it is not available.
*/ + +static tree +ix86_builtin_conversion (enum tree_code code, tree type) +{ + if (TREE_CODE (type) != VECTOR_TYPE) + return NULL_TREE; + + switch (code) + { + case FLOAT_EXPR: + switch (TYPE_MODE (type)) + { + case V4SImode: + return ix86_builtins[IX86_BUILTIN_CVTDQ2PS]; + default: + return NULL_TREE; + } + + case FIX_TRUNC_EXPR: + switch (TYPE_MODE (type)) + { + case V4SFmode: + return ix86_builtins[IX86_BUILTIN_CVTTPS2DQ]; + default: + return NULL_TREE; + } + default: + return NULL_TREE; + + } +} + /* Store OPERAND to the memory after reload is completed. This means that we can't easily use assign_stack_local. */ rtx @@ -16967,7 +18888,7 @@ ix86_preferred_output_reload_class (rtx x, enum reg_class class) if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode)) return MAYBE_SSE_CLASS_P (class) ? SSE_REGS : NO_REGS; - if (TARGET_80387 && SCALAR_FLOAT_MODE_P (mode)) + if (X87_FLOAT_MODE_P (mode)) { if (class == FP_TOP_SSE_REGS) return FP_TOP_REG; @@ -17023,18 +18944,12 @@ ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2, /* If the target says that inter-unit moves are more expensive than moving through memory, then don't generate them. */ - if (!TARGET_INTER_UNIT_MOVES && !optimize_size) + if (!TARGET_INTER_UNIT_MOVES) return true; /* Between SSE and general, we have moves no larger than word size. */ if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) return true; - - /* ??? For the cost of one register reformat penalty, we could use - the same instructions to move SFmode and DFmode data, but the - relevant move patterns don't support those alternatives. */ - if (mode == SFmode || mode == DFmode) - return true; } return false; @@ -17235,15 +19150,17 @@ ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2) /* If MODE2 is only appropriate for an SSE register, then tie with any other mode acceptable to SSE registers. */ - if (GET_MODE_SIZE (mode2) >= 8 + if (GET_MODE_SIZE (mode2) == 16 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2)) - return ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1); + return (GET_MODE_SIZE (mode1) == 16 + && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1)); - /* If MODE2 is appropriate for an MMX (or SSE) register, then tie + /* If MODE2 is appropriate for an MMX register, then tie with any other mode acceptable to MMX registers. 
*/ if (GET_MODE_SIZE (mode2) == 8 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2)) - return ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1); + return (GET_MODE_SIZE (mode1) == 8 + && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1)); return false; } @@ -17407,7 +19324,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) return false; case ASHIFT: - if (GET_CODE (XEXP (x, 1)) == CONST_INT + if (CONST_INT_P (XEXP (x, 1)) && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT)) { HOST_WIDE_INT value = INTVAL (XEXP (x, 1)); @@ -17431,7 +19348,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) case ROTATERT: if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode) { - if (GET_CODE (XEXP (x, 1)) == CONST_INT) + if (CONST_INT_P (XEXP (x, 1))) { if (INTVAL (XEXP (x, 1)) > 32) *total = ix86_cost->shift_const + COSTS_N_INSNS (2); @@ -17448,7 +19365,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) } else { - if (GET_CODE (XEXP (x, 1)) == CONST_INT) + if (CONST_INT_P (XEXP (x, 1))) *total = ix86_cost->shift_const; else *total = ix86_cost->shift_var; @@ -17456,8 +19373,20 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) return false; case MULT: - if (FLOAT_MODE_P (mode)) + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + { + /* ??? SSE scalar cost should be used here. */ + *total = ix86_cost->fmul; + return false; + } + else if (X87_FLOAT_MODE_P (mode)) + { + *total = ix86_cost->fmul; + return false; + } + else if (FLOAT_MODE_P (mode)) { + /* ??? SSE vector cost should be used here. */ *total = ix86_cost->fmul; return false; } @@ -17466,7 +19395,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) rtx op0 = XEXP (x, 0); rtx op1 = XEXP (x, 1); int nbits; - if (GET_CODE (XEXP (x, 1)) == CONST_INT) + if (CONST_INT_P (XEXP (x, 1))) { unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1)); for (nbits = 0; value != 0; value &= value - 1) @@ -17486,7 +19415,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) if (GET_CODE (op0) == GET_CODE (op1)) is_mulwiden = 1, op1 = XEXP (op1, 0); - else if (GET_CODE (op1) == CONST_INT) + else if (CONST_INT_P (op1)) { if (GET_CODE (op0) == SIGN_EXTEND) is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode) @@ -17510,21 +19439,25 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) case UDIV: case MOD: case UMOD: - if (FLOAT_MODE_P (mode)) + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + /* ??? SSE cost should be used here. */ + *total = ix86_cost->fdiv; + else if (X87_FLOAT_MODE_P (mode)) + *total = ix86_cost->fdiv; + else if (FLOAT_MODE_P (mode)) + /* ??? SSE vector cost should be used here. 
*/ *total = ix86_cost->fdiv; else *total = ix86_cost->divide[MODE_INDEX (mode)]; return false; case PLUS: - if (FLOAT_MODE_P (mode)) - *total = ix86_cost->fadd; - else if (GET_MODE_CLASS (mode) == MODE_INT + if (GET_MODE_CLASS (mode) == MODE_INT && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode)) { if (GET_CODE (XEXP (x, 0)) == PLUS && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT - && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == CONST_INT + && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1)) && CONSTANT_P (XEXP (x, 1))) { HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1)); @@ -17539,7 +19472,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) } } else if (GET_CODE (XEXP (x, 0)) == MULT - && GET_CODE (XEXP (XEXP (x, 0), 1)) == CONST_INT) + && CONST_INT_P (XEXP (XEXP (x, 0), 1))) { HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1)); if (val == 2 || val == 4 || val == 8) @@ -17562,11 +19495,23 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) /* FALLTHRU */ case MINUS: - if (FLOAT_MODE_P (mode)) + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + { + /* ??? SSE cost should be used here. */ + *total = ix86_cost->fadd; + return false; + } + else if (X87_FLOAT_MODE_P (mode)) { *total = ix86_cost->fadd; return false; } + else if (FLOAT_MODE_P (mode)) + { + /* ??? SSE vector cost should be used here. */ + *total = ix86_cost->fadd; + return false; + } /* FALLTHRU */ case AND: @@ -17584,8 +19529,20 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) /* FALLTHRU */ case NEG: - if (FLOAT_MODE_P (mode)) + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + { + /* ??? SSE cost should be used here. */ + *total = ix86_cost->fchs; + return false; + } + else if (X87_FLOAT_MODE_P (mode)) + { + *total = ix86_cost->fchs; + return false; + } + else if (FLOAT_MODE_P (mode)) { + /* ??? SSE vector cost should be used here. */ *total = ix86_cost->fchs; return false; } @@ -17601,7 +19558,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) case COMPARE: if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT && XEXP (XEXP (x, 0), 1) == const1_rtx - && GET_CODE (XEXP (XEXP (x, 0), 2)) == CONST_INT + && CONST_INT_P (XEXP (XEXP (x, 0), 2)) && XEXP (x, 1) == const0_rtx) { /* This kind of construct is implemented using test[bwl]. @@ -17614,34 +19571,29 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) return false; case FLOAT_EXTEND: - if (!TARGET_SSE_MATH - || mode == XFmode - || (mode == DFmode && !TARGET_SSE2)) - /* For standard 80387 constants, raise the cost to prevent - compress_float_constant() to generate load from memory. */ - switch (standard_80387_constant_p (XEXP (x, 0))) - { - case -1: - case 0: - *total = 0; - break; - case 1: /* 0.0 */ - *total = 1; - break; - default: - *total = (x86_ext_80387_constants & TUNEMASK - || optimize_size - ? 1 : 0); - } + if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)) + *total = 0; return false; case ABS: - if (FLOAT_MODE_P (mode)) + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + /* ??? SSE cost should be used here. */ + *total = ix86_cost->fabs; + else if (X87_FLOAT_MODE_P (mode)) + *total = ix86_cost->fabs; + else if (FLOAT_MODE_P (mode)) + /* ??? SSE vector cost should be used here. */ *total = ix86_cost->fabs; return false; case SQRT: - if (FLOAT_MODE_P (mode)) + if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH) + /* ??? SSE cost should be used here. */ + *total = ix86_cost->fsqrt; + else if (X87_FLOAT_MODE_P (mode)) + *total = ix86_cost->fsqrt; + else if (FLOAT_MODE_P (mode)) + /* ??? 
SSE vector cost should be used here. */ *total = ix86_cost->fsqrt; return false; @@ -17824,37 +19776,29 @@ static rtx x86_this_parameter (tree function) { tree type = TREE_TYPE (function); + bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0; if (TARGET_64BIT) { - int n = aggregate_value_p (TREE_TYPE (type), type) != 0; - return gen_rtx_REG (DImode, x86_64_int_parameter_registers[n]); + const int *parm_regs; + + if (TARGET_64BIT_MS_ABI) + parm_regs = x86_64_ms_abi_int_parameter_registers; + else + parm_regs = x86_64_int_parameter_registers; + return gen_rtx_REG (DImode, parm_regs[aggr]); } - if (ix86_function_regparm (type, function) > 0) + if (ix86_function_regparm (type, function) > 0 + && !type_has_variadic_args_p (type)) { - tree parm; - - parm = TYPE_ARG_TYPES (type); - /* Figure out whether or not the function has a variable number of - arguments. */ - for (; parm; parm = TREE_CHAIN (parm)) - if (TREE_VALUE (parm) == void_type_node) - break; - /* If not, the this parameter is in the first argument. */ - if (parm) - { - int regno = 0; - if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type))) - regno = 2; - return gen_rtx_REG (SImode, regno); - } + int regno = 0; + if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type))) + regno = 2; + return gen_rtx_REG (SImode, regno); } - if (aggregate_value_p (TREE_TYPE (type), type)) - return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, 8)); - else - return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, 4)); + return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, aggr ? 8 : 4)); } /* Determine whether x86_output_mi_thunk can succeed. */ @@ -17922,7 +19866,7 @@ x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED, { if (!x86_64_general_operand (xops[0], DImode)) { - tmp = gen_rtx_REG (DImode, FIRST_REX_INT_REG + 2 /* R10 */); + tmp = gen_rtx_REG (DImode, R10_REG); xops[1] = tmp; output_asm_insn ("mov{q}\t{%1, %0|%0, %1}", xops); xops[0] = tmp; @@ -17938,12 +19882,12 @@ x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED, if (vcall_offset) { if (TARGET_64BIT) - tmp = gen_rtx_REG (DImode, FIRST_REX_INT_REG + 2 /* R10 */); + tmp = gen_rtx_REG (DImode, R10_REG); else { int tmp_regno = 2 /* ECX */; if (lookup_attribute ("fastcall", - TYPE_ATTRIBUTES (TREE_TYPE (function)))) + TYPE_ATTRIBUTES (TREE_TYPE (function)))) tmp_regno = 0 /* EAX */; tmp = gen_rtx_REG (SImode, tmp_regno); } @@ -17959,7 +19903,7 @@ x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED, xops[0] = gen_rtx_MEM (Pmode, plus_constant (tmp, vcall_offset)); if (TARGET_64BIT && !memory_operand (xops[0], Pmode)) { - rtx tmp2 = gen_rtx_REG (DImode, FIRST_REX_INT_REG + 3 /* R11 */); + rtx tmp2 = gen_rtx_REG (DImode, R11_REG); xops[0] = GEN_INT (vcall_offset); xops[1] = tmp2; output_asm_insn ("mov{q}\t{%0, %1|%1, %0}", xops); @@ -17985,6 +19929,10 @@ x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED, { if (!flag_pic || (*targetm.binds_local_p) (function)) output_asm_insn ("jmp\t%P0", xops); + /* All thunks should be in the same object as their target, + and thus binds_local_p should be true. 
*/ + else if (TARGET_64BIT_MS_ABI) + gcc_unreachable (); else { tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, xops[0]), UNSPEC_GOTPCREL); @@ -18061,20 +20009,16 @@ void x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) { if (TARGET_64BIT) - if (flag_pic) - { + { #ifndef NO_PROFILE_COUNTERS - fprintf (file, "\tleaq\t%sP%d@(%%rip),%%r11\n", LPREFIX, labelno); + fprintf (file, "\tleaq\t%sP%d@(%%rip),%%r11\n", LPREFIX, labelno); #endif + + if (!TARGET_64BIT_MS_ABI && flag_pic) fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", MCOUNT_NAME); - } - else - { -#ifndef NO_PROFILE_COUNTERS - fprintf (file, "\tmovq\t$%sP%d,%%r11\n", LPREFIX, labelno); -#endif + else fprintf (file, "\tcall\t%s\n", MCOUNT_NAME); - } + } else if (flag_pic) { #ifndef NO_PROFILE_COUNTERS @@ -18110,14 +20054,14 @@ min_insn_size (rtx insn) if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN) return 0; - if (GET_CODE (insn) == JUMP_INSN + if (JUMP_P (insn) && (GET_CODE (PATTERN (insn)) == ADDR_VEC || GET_CODE (PATTERN (insn)) == ADDR_DIFF_VEC)) return 0; /* Important case - calls are always 5 bytes. It is common to have many calls in the row. */ - if (GET_CODE (insn) == CALL_INSN + if (CALL_P (insn) && symbolic_reference_mentioned_p (PATTERN (insn)) && !SIBLING_CALL_P (insn)) return 5; @@ -18127,7 +20071,7 @@ min_insn_size (rtx insn) /* For normal instructions we may rely on the sizes of addresses and the presence of symbol to require 4 bytes of encoding. This is not the case for jumps where references are PC relative. */ - if (GET_CODE (insn) != JUMP_INSN) + if (!JUMP_P (insn)) { l = get_attr_length_address (insn); if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn))) @@ -18166,10 +20110,10 @@ ix86_avoid_jump_misspredicts (void) if (dump_file) fprintf(dump_file, "Insn %i estimated to %i bytes\n", INSN_UID (insn), min_insn_size (insn)); - if ((GET_CODE (insn) == JUMP_INSN + if ((JUMP_P (insn) && GET_CODE (PATTERN (insn)) != ADDR_VEC && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC) - || GET_CODE (insn) == CALL_INSN) + || CALL_P (insn)) njumps++; else continue; @@ -18177,10 +20121,10 @@ ix86_avoid_jump_misspredicts (void) while (njumps > 3) { start = NEXT_INSN (start); - if ((GET_CODE (start) == JUMP_INSN + if ((JUMP_P (start) && GET_CODE (PATTERN (start)) != ADDR_VEC && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC) - || GET_CODE (start) == CALL_INSN) + || CALL_P (start)) njumps--, isjump = 1; else isjump = 0; @@ -18220,13 +20164,13 @@ ix86_pad_returns (void) rtx prev; bool replace = false; - if (GET_CODE (ret) != JUMP_INSN || GET_CODE (PATTERN (ret)) != RETURN + if (!JUMP_P (ret) || GET_CODE (PATTERN (ret)) != RETURN || !maybe_hot_bb_p (bb)) continue; for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev)) - if (active_insn_p (prev) || GET_CODE (prev) == CODE_LABEL) + if (active_insn_p (prev) || LABEL_P (prev)) break; - if (prev && GET_CODE (prev) == CODE_LABEL) + if (prev && LABEL_P (prev)) { edge e; edge_iterator ei; @@ -18240,8 +20184,8 @@ ix86_pad_returns (void) { prev = prev_active_insn (ret); if (prev - && ((GET_CODE (prev) == JUMP_INSN && any_condjump_p (prev)) - || GET_CODE (prev) == CALL_INSN)) + && ((JUMP_P (prev) && any_condjump_p (prev)) + || CALL_P (prev))) replace = true; /* Empty functions get branch mispredict even when the jump destination is not visible to us. 
*/ @@ -18318,21 +20262,25 @@ x86_emit_floatuns (rtx operands[2]) mode = GET_MODE (out); neglab = gen_label_rtx (); donelab = gen_label_rtx (); - i1 = gen_reg_rtx (Pmode); f0 = gen_reg_rtx (mode); - emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, Pmode, 0, neglab); + emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab); + + expand_float (out, in, 0); - emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_FLOAT (mode, in))); emit_jump_insn (gen_jump (donelab)); emit_barrier (); emit_label (neglab); - i0 = expand_simple_binop (Pmode, LSHIFTRT, in, const1_rtx, NULL, 1, OPTAB_DIRECT); - i1 = expand_simple_binop (Pmode, AND, in, const1_rtx, NULL, 1, OPTAB_DIRECT); - i0 = expand_simple_binop (Pmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT); + i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL, + 1, OPTAB_DIRECT); + i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL, + 1, OPTAB_DIRECT); + i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT); + expand_float (f0, i0, 0); + emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0))); emit_label (donelab); @@ -19194,44 +21142,13 @@ ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED, clobbers); clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"), clobbers); - clobbers = tree_cons (NULL_TREE, build_string (7, "dirflag"), - clobbers); return clobbers; } -/* Return true if this goes in small data/bss. */ - -static bool -ix86_in_large_data_p (tree exp) -{ - if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC) - return false; - - /* Functions are never large data. */ - if (TREE_CODE (exp) == FUNCTION_DECL) - return false; - - if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp)) - { - const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp)); - if (strcmp (section, ".ldata") == 0 - || strcmp (section, ".lbss") == 0) - return true; - return false; - } - else - { - HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp)); - - /* If this is an incomplete type with size 0, then we can't put it - in data because it might be too big when completed. */ - if (!size || size > ix86_section_threshold) - return true; - } +/* Implements target vector targetm.asm.encode_section_info. This + is not used by netware. */ - return false; -} -static void +static void ATTRIBUTE_UNUSED ix86_encode_section_info (tree decl, rtx rtl, int first) { default_encode_section_info (decl, rtl, first); @@ -19258,16 +21175,36 @@ ix86_reverse_condition (enum rtx_code code, enum machine_mode mode) const char * output_387_reg_move (rtx insn, rtx *operands) { - if (REG_P (operands[1]) - && find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) + if (REG_P (operands[0])) + { + if (REG_P (operands[1]) + && find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) + { + if (REGNO (operands[0]) == FIRST_STACK_REG) + return output_387_ffreep (operands, 0); + return "fstp\t%y0"; + } + if (STACK_TOP_P (operands[0])) + return "fld%z1\t%y1"; + return "fst\t%y0"; + } + else if (MEM_P (operands[0])) { - if (REGNO (operands[0]) == FIRST_STACK_REG) - return output_387_ffreep (operands, 0); - return "fstp\t%y0"; + gcc_assert (REG_P (operands[1])); + if (find_regno_note (insn, REG_DEAD, REGNO (operands[1]))) + return "fstp%z0\t%y0"; + else + { + /* There is no non-popping store to memory for XFmode. + So if we need one, follow the store with a load. 
*/ + if (GET_MODE (operands[0]) == XFmode) + return "fstp%z0\t%y0\n\tfld%z0\t%y0"; + else + return "fst%z0\t%y0"; + } } - if (STACK_TOP_P (operands[0])) - return "fld%z1\t%y1"; - return "fst\t%y0"; + else + gcc_unreachable(); } /* Output code to perform a conditional jump to LABEL, if C2 flag in @@ -19281,7 +21218,7 @@ ix86_emit_fp_unordered_jump (rtx label) emit_insn (gen_x86_fnstsw_1 (reg)); - if (TARGET_USE_SAHF) + if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_size)) { emit_insn (gen_x86_sahf_1 (reg)); @@ -19300,7 +21237,9 @@ ix86_emit_fp_unordered_jump (rtx label) gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx); temp = gen_rtx_SET (VOIDmode, pc_rtx, temp); + emit_jump_insn (temp); + predict_jump (REG_BR_PROB_BASE * 10 / 100); } /* Output code to perform a log1p XFmode calculation. */ @@ -19321,21 +21260,21 @@ void ix86_emit_i387_log1p (rtx op0, rtx op1) emit_jump_insn (gen_bge (label1)); emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */ - emit_insn (gen_fyl2xp1_xf3 (op0, tmp2, op1)); + emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2)); emit_jump (label2); emit_label (label1); emit_move_insn (tmp, CONST1_RTX (XFmode)); emit_insn (gen_addxf3 (tmp, op1, tmp)); emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */ - emit_insn (gen_fyl2x_xf3 (op0, tmp2, tmp)); + emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2)); emit_label (label2); } /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */ -static void +static void ATTRIBUTE_UNUSED i386_solaris_elf_named_section (const char *name, unsigned int flags, tree decl) { @@ -19985,4 +21924,210 @@ ix86_expand_round (rtx operand0, rtx operand1) emit_move_insn (operand0, res); } + +/* Table of valid machine attributes. */ +static const struct attribute_spec ix86_attribute_table[] = +{ + /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler } */ + /* Stdcall attribute says callee is responsible for popping arguments + if they are not variable. */ + { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute }, + /* Fastcall attribute says callee is responsible for popping arguments + if they are not variable. */ + { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute }, + /* Cdecl attribute says the callee is a normal C declaration */ + { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute }, + /* Regparm attribute specifies how many integer arguments are to be + passed in registers. */ + { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute }, + /* Sseregparm attribute says we are using x86_64 calling conventions + for FP arguments. */ + { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute }, + /* force_align_arg_pointer says this function realigns the stack at entry. */ + { (const char *)&ix86_force_align_arg_pointer_string, 0, 0, + false, true, true, ix86_handle_cconv_attribute }, +#if TARGET_DLLIMPORT_DECL_ATTRIBUTES + { "dllimport", 0, 0, false, false, false, handle_dll_attribute }, + { "dllexport", 0, 0, false, false, false, handle_dll_attribute }, + { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute }, +#endif + { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute }, + { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute }, +#ifdef SUBTARGET_ATTRIBUTE_TABLE + SUBTARGET_ATTRIBUTE_TABLE, +#endif + { NULL, 0, 0, false, false, false, NULL } +}; + +/* Initialize the GCC target structure. 
*/ +#undef TARGET_ATTRIBUTE_TABLE +#define TARGET_ATTRIBUTE_TABLE ix86_attribute_table +#if TARGET_DLLIMPORT_DECL_ATTRIBUTES +# undef TARGET_MERGE_DECL_ATTRIBUTES +# define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes +#endif + +#undef TARGET_COMP_TYPE_ATTRIBUTES +#define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes + +#undef TARGET_INIT_BUILTINS +#define TARGET_INIT_BUILTINS ix86_init_builtins +#undef TARGET_EXPAND_BUILTIN +#define TARGET_EXPAND_BUILTIN ix86_expand_builtin + +#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION +#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION ix86_builtin_vectorized_function +#undef TARGET_VECTORIZE_BUILTIN_CONVERSION +#define TARGET_VECTORIZE_BUILTIN_CONVERSION ix86_builtin_conversion + +#undef TARGET_ASM_FUNCTION_EPILOGUE +#define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue + +#undef TARGET_ENCODE_SECTION_INFO +#ifndef SUBTARGET_ENCODE_SECTION_INFO +#define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info +#else +#define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO +#endif + +#undef TARGET_ASM_OPEN_PAREN +#define TARGET_ASM_OPEN_PAREN "" +#undef TARGET_ASM_CLOSE_PAREN +#define TARGET_ASM_CLOSE_PAREN "" + +#undef TARGET_ASM_ALIGNED_HI_OP +#define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT +#undef TARGET_ASM_ALIGNED_SI_OP +#define TARGET_ASM_ALIGNED_SI_OP ASM_LONG +#ifdef ASM_QUAD +#undef TARGET_ASM_ALIGNED_DI_OP +#define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD +#endif + +#undef TARGET_ASM_UNALIGNED_HI_OP +#define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP +#undef TARGET_ASM_UNALIGNED_SI_OP +#define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP +#undef TARGET_ASM_UNALIGNED_DI_OP +#define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP + +#undef TARGET_SCHED_ADJUST_COST +#define TARGET_SCHED_ADJUST_COST ix86_adjust_cost +#undef TARGET_SCHED_ISSUE_RATE +#define TARGET_SCHED_ISSUE_RATE ix86_issue_rate +#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD +#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \ + ia32_multipass_dfa_lookahead + +#undef TARGET_FUNCTION_OK_FOR_SIBCALL +#define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall + +#ifdef HAVE_AS_TLS +#undef TARGET_HAVE_TLS +#define TARGET_HAVE_TLS true +#endif +#undef TARGET_CANNOT_FORCE_CONST_MEM +#define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem +#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P +#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_rtx_true + +#undef TARGET_DELEGITIMIZE_ADDRESS +#define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address + +#undef TARGET_MS_BITFIELD_LAYOUT_P +#define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p + +#if TARGET_MACHO +#undef TARGET_BINDS_LOCAL_P +#define TARGET_BINDS_LOCAL_P darwin_binds_local_p +#endif +#if TARGET_DLLIMPORT_DECL_ATTRIBUTES +#undef TARGET_BINDS_LOCAL_P +#define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p +#endif + +#undef TARGET_ASM_OUTPUT_MI_THUNK +#define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk +#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK +#define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk + +#undef TARGET_ASM_FILE_START +#define TARGET_ASM_FILE_START x86_file_start + +#undef TARGET_DEFAULT_TARGET_FLAGS +#define TARGET_DEFAULT_TARGET_FLAGS \ + (TARGET_DEFAULT \ + | TARGET_64BIT_DEFAULT \ + | TARGET_SUBTARGET_DEFAULT \ + | TARGET_TLS_DIRECT_SEG_REFS_DEFAULT) + +#undef TARGET_HANDLE_OPTION +#define TARGET_HANDLE_OPTION ix86_handle_option + +#undef TARGET_RTX_COSTS +#define TARGET_RTX_COSTS ix86_rtx_costs +#undef 
TARGET_ADDRESS_COST +#define TARGET_ADDRESS_COST ix86_address_cost + +#undef TARGET_FIXED_CONDITION_CODE_REGS +#define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs +#undef TARGET_CC_MODES_COMPATIBLE +#define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible + +#undef TARGET_MACHINE_DEPENDENT_REORG +#define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg + +#undef TARGET_BUILD_BUILTIN_VA_LIST +#define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list + +#undef TARGET_MD_ASM_CLOBBERS +#define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers + +#undef TARGET_PROMOTE_PROTOTYPES +#define TARGET_PROMOTE_PROTOTYPES hook_bool_tree_true +#undef TARGET_STRUCT_VALUE_RTX +#define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx +#undef TARGET_SETUP_INCOMING_VARARGS +#define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs +#undef TARGET_MUST_PASS_IN_STACK +#define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack +#undef TARGET_PASS_BY_REFERENCE +#define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference +#undef TARGET_INTERNAL_ARG_POINTER +#define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer +#undef TARGET_DWARF_HANDLE_FRAME_UNSPEC +#define TARGET_DWARF_HANDLE_FRAME_UNSPEC ix86_dwarf_handle_frame_unspec +#undef TARGET_STRICT_ARGUMENT_NAMING +#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true + +#undef TARGET_GIMPLIFY_VA_ARG_EXPR +#define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg + +#undef TARGET_SCALAR_MODE_SUPPORTED_P +#define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p + +#undef TARGET_VECTOR_MODE_SUPPORTED_P +#define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p + +#ifdef HAVE_AS_TLS +#undef TARGET_ASM_OUTPUT_DWARF_DTPREL +#define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel +#endif + +#ifdef SUBTARGET_INSERT_ATTRIBUTES +#undef TARGET_INSERT_ATTRIBUTES +#define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES +#endif + +#undef TARGET_MANGLE_FUNDAMENTAL_TYPE +#define TARGET_MANGLE_FUNDAMENTAL_TYPE ix86_mangle_fundamental_type + +#undef TARGET_STACK_PROTECT_FAIL +#define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail + +#undef TARGET_FUNCTION_VALUE +#define TARGET_FUNCTION_VALUE ix86_function_value + +struct gcc_target targetm = TARGET_INITIALIZER; + #include "gt-i386.h"
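The run of #undef/#define pairs above is the standard GCC target-vector idiom: target.h declares the hook structure, target-def.h defaults every entry and defines TARGET_INITIALIZER, the port overrides only the hooks it implements, and the final line expands TARGET_INITIALIZER into one aggregate initializer for targetm. A minimal self-contained model of the mechanism, with the vector cut down to two invented fields:

/* Stand-in for target.h/target-def.h: declare the vector, default
   each hook, and build the initializer macro from the defines.  */
struct gcc_target_model
{
  int (*issue_rate) (void);
  const char *aligned_si_op;
};
static int default_issue_rate (void) { return 1; }
#define TARGET_SCHED_ISSUE_RATE default_issue_rate
#define TARGET_ASM_ALIGNED_SI_OP "\t.long\t"
#define TARGET_INITIALIZER \
  { TARGET_SCHED_ISSUE_RATE, TARGET_ASM_ALIGNED_SI_OP }

/* Stand-in for the port file, as in i386.c above: each #undef/#define
   pair must precede the instantiation, because the macros are read
   when TARGET_INITIALIZER is expanded.  */
static int ix86_issue_rate_model (void) { return 3; }
#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE ix86_issue_rate_model

struct gcc_target_model targetm_model = TARGET_INITIALIZER;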