X-Git-Url: http://git.sourceforge.jp/view?a=blobdiff_plain;f=gcc%2Fconfig%2Fi386%2Fi386.c;h=a88c6152a14c38912bd1cb0811166da9370ea310;hb=cf9d2c343781ec1f69c8cc6921466a4fac7f12ef;hp=0a36e6026611fda17d3a31f30d3a2608a6052f16;hpb=c3faca2458e212d326ecabf0e6e6cd8a135a60c6;p=pf3gnuchains%2Fgcc-fork.git diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 0a36e602661..a88c6152a14 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -1,6 +1,6 @@ /* Subroutines used for code generation on IA-32. Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, - 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc. + 2002, 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc. This file is part of GCC. @@ -50,6 +50,7 @@ Boston, MA 02110-1301, USA. */ #include "tree-gimple.h" #include "dwarf2.h" #include "tm-constrs.h" +#include "params.h" #ifndef CHECK_STACK_LIMIT #define CHECK_STACK_LIMIT (-1) @@ -67,6 +68,8 @@ Boston, MA 02110-1301, USA. */ /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */ #define COSTS_N_BYTES(N) ((N) * 2) +#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}} + static const struct processor_costs size_cost = { /* costs for tuning for size */ COSTS_N_BYTES (2), /* cost of an add instruction */ @@ -118,6 +121,10 @@ struct processor_costs size_cost = { /* costs for tuning for size */ COSTS_N_BYTES (2), /* cost of FABS instruction. */ COSTS_N_BYTES (2), /* cost of FCHS instruction. */ COSTS_N_BYTES (2), /* cost of FSQRT instruction. */ + {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}, + {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}}, + {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}, + {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}} }; /* Processor costs (relative to an add) */ @@ -172,6 +179,10 @@ struct processor_costs i386_cost = { /* 386 specific costs */ COSTS_N_INSNS (22), /* cost of FABS instruction. */ COSTS_N_INSNS (24), /* cost of FCHS instruction. */ COSTS_N_INSNS (122), /* cost of FSQRT instruction. */ + {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}, + DUMMY_STRINGOP_ALGS}, + {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}, + DUMMY_STRINGOP_ALGS}, }; static const @@ -225,6 +236,10 @@ struct processor_costs i486_cost = { /* 486 specific costs */ COSTS_N_INSNS (3), /* cost of FABS instruction. */ COSTS_N_INSNS (3), /* cost of FCHS instruction. */ COSTS_N_INSNS (83), /* cost of FSQRT instruction. */ + {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}}, + DUMMY_STRINGOP_ALGS}, + {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}}, + DUMMY_STRINGOP_ALGS} }; static const @@ -278,6 +293,10 @@ struct processor_costs pentium_cost = { COSTS_N_INSNS (1), /* cost of FABS instruction. */ COSTS_N_INSNS (1), /* cost of FCHS instruction. */ COSTS_N_INSNS (70), /* cost of FSQRT instruction. */ + {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + {{libcall, {{-1, rep_prefix_4_byte}}}, + DUMMY_STRINGOP_ALGS} }; static const @@ -331,6 +350,17 @@ struct processor_costs pentiumpro_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ + /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes (we ensure + the alignment). For small blocks inline loop is still a noticeable win, for bigger + blocks either rep movsl or rep movsb is way to go. Rep movsb has apparently + more expensive startup time in CPU, but after 4K the difference is down in the noise. 
+ */ + {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop}, + {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}}, + DUMMY_STRINGOP_ALGS}, + {{rep_prefix_4_byte, {{1024, unrolled_loop}, + {8192, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS} }; static const @@ -385,6 +415,10 @@ struct processor_costs geode_cost = { COSTS_N_INSNS (1), /* cost of FABS instruction. */ COSTS_N_INSNS (1), /* cost of FCHS instruction. */ COSTS_N_INSNS (54), /* cost of FSQRT instruction. */ + {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS} }; static const @@ -438,6 +472,10 @@ struct processor_costs k6_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (56), /* cost of FSQRT instruction. */ + {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS} }; static const @@ -491,6 +529,13 @@ struct processor_costs athlon_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + /* For some reason, Athlon deals better with REP prefix (relative to loops) + compared to K8. Alignment becomes important after 8 bytes for memcpy and + 128 bytes for memset. */ + {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS} }; static const @@ -536,7 +581,85 @@ struct processor_costs k8_cost = { in SImode, DImode and TImode */ 5, /* MMX or SSE register to integer */ 64, /* size of prefetch block */ - 6, /* number of parallel prefetches */ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches */ + 5, /* Branch cost */ + COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (4), /* cost of FMUL instruction. */ + COSTS_N_INSNS (19), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + /* K8 has optimized REP instruction for medium sized blocks, but for very small + blocks it is better to use loop. For large blocks, libcall can do + nontemporary accesses and beat inline considerably. 
*/ + {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}}, + {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + {{libcall, {{8, loop}, {24, unrolled_loop}, + {2048, rep_prefix_4_byte}, {-1, libcall}}}, + {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}} +}; + +struct processor_costs amdfam10_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (2), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (5)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (35), /* HI */ + COSTS_N_INSNS (51), /* SI */ + COSTS_N_INSNS (83), /* DI */ + COSTS_N_INSNS (83)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 4, 3}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 5}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + /* On K8 + MOVD reg64, xmmreg Double FSTORE 4 + MOVD reg32, xmmreg Double FSTORE 4 + On AMDFAM10 + MOVD reg64, xmmreg Double FADD 3 + 1/1 1/1 + MOVD reg32, xmmreg Double FADD 3 + 1/1 1/1 */ + 64, /* size of prefetch block */ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches */ 5, /* Branch cost */ COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ COSTS_N_INSNS (4), /* cost of FMUL instruction. */ @@ -544,6 +667,15 @@ struct processor_costs k8_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. */ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + + /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for + very small blocks it is better to use loop. For large blocks, libcall can + do nontemporary accesses and beat inline considerably. */ + {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}}, + {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + {{libcall, {{8, loop}, {24, unrolled_loop}, + {2048, rep_prefix_4_byte}, {-1, libcall}}}, + {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}} }; static const @@ -597,6 +729,11 @@ struct processor_costs pentium4_cost = { COSTS_N_INSNS (2), /* cost of FABS instruction. 
*/ COSTS_N_INSNS (2), /* cost of FCHS instruction. */ COSTS_N_INSNS (43), /* cost of FSQRT instruction. */ + {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}}, + DUMMY_STRINGOP_ALGS}, + {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte}, + {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, }; static const @@ -650,6 +787,72 @@ struct processor_costs nocona_cost = { COSTS_N_INSNS (3), /* cost of FABS instruction. */ COSTS_N_INSNS (3), /* cost of FCHS instruction. */ COSTS_N_INSNS (44), /* cost of FSQRT instruction. */ + {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}}, + {libcall, {{32, loop}, {20000, rep_prefix_8_byte}, + {100000, unrolled_loop}, {-1, libcall}}}}, + {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte}, + {-1, libcall}}}, + {libcall, {{24, loop}, {64, unrolled_loop}, + {8192, rep_prefix_8_byte}, {-1, libcall}}}} +}; + +static const +struct processor_costs core2_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (3), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (3), /* DI */ + COSTS_N_INSNS (3)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (22), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (22), /* HI */ + COSTS_N_INSNS (22), /* SI */ + COSTS_N_INSNS (22), /* DI */ + COSTS_N_INSNS (22)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 16, /* MOVE_RATIO */ + 2, /* cost for loading QImode using movzbl */ + {6, 6, 6}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {4, 4, 4}, /* cost of storing integer registers */ + 2, /* cost of reg,reg fld/fst */ + {6, 6, 6}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {4, 4, 4}, /* cost of loading integer registers */ + 2, /* cost of moving MMX register */ + {6, 6}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {6, 6, 6}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 4}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 2, /* MMX or SSE register to integer */ + 128, /* size of prefetch block */ + 8, /* number of parallel prefetches */ + 3, /* Branch cost */ + COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (5), /* cost of FMUL instruction. */ + COSTS_N_INSNS (32), /* cost of FDIV instruction. */ + COSTS_N_INSNS (1), /* cost of FABS instruction. */ + COSTS_N_INSNS (1), /* cost of FCHS instruction. */ + COSTS_N_INSNS (58), /* cost of FSQRT instruction. */ + {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}}, + {libcall, {{32, loop}, {64, rep_prefix_4_byte}, + {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + {{libcall, {{8, loop}, {15, unrolled_loop}, + {2048, rep_prefix_4_byte}, {-1, libcall}}}, + {libcall, {{24, loop}, {32, unrolled_loop}, + {8192, rep_prefix_8_byte}, {-1, libcall}}}} }; /* Generic64 should produce code tuned for Nocona and K8. */ @@ -710,6 +913,10 @@ struct processor_costs generic64_cost = { COSTS_N_INSNS (8), /* cost of FABS instruction. */ COSTS_N_INSNS (8), /* cost of FCHS instruction. */ COSTS_N_INSNS (40), /* cost of FSQRT instruction. 
*/ + {DUMMY_STRINGOP_ALGS, + {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + {DUMMY_STRINGOP_ALGS, + {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}} }; /* Generic32 should produce code tuned for Athlon, PPro, Pentium4, Nocona and K8. */ @@ -764,6 +971,10 @@ struct processor_costs generic32_cost = { COSTS_N_INSNS (8), /* cost of FABS instruction. */ COSTS_N_INSNS (8), /* cost of FCHS instruction. */ COSTS_N_INSNS (40), /* cost of FSQRT instruction. */ + {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, + {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}}, + DUMMY_STRINGOP_ALGS}, }; const struct processor_costs *ix86_cost = &pentium_cost; @@ -780,33 +991,44 @@ const struct processor_costs *ix86_cost = &pentium_cost; #define m_PENT4 (1<simultaneous_prefetches); + if (!PARAM_SET_P (PARAM_L1_CACHE_LINE_SIZE)) + set_param_value ("l1-cache-line-size", ix86_cost->prefetch_block); } /* switch to the appropriate section for output of DECL. @@ -2215,7 +2590,7 @@ x86_elf_aligned_common (FILE *file, fprintf (file, ","HOST_WIDE_INT_PRINT_UNSIGNED",%u\n", size, align / BITS_PER_UNIT); } - +#endif /* Utility function for targets to use in implementing ASM_OUTPUT_ALIGNED_BSS. */ @@ -2239,7 +2614,6 @@ x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED, #endif /* ASM_DECLARE_OBJECT_NAME */ ASM_OUTPUT_SKIP (file, size ? size : 1); } -#endif void optimization_options (int level, int size ATTRIBUTE_UNUSED) @@ -2323,7 +2697,7 @@ ix86_function_ok_for_sibcall (tree decl, tree exp) func = decl; else { - func = TREE_TYPE (TREE_OPERAND (exp, 0)); + func = TREE_TYPE (CALL_EXPR_FN (exp)); if (POINTER_TYPE_P (func)) func = TREE_TYPE (func); } @@ -2358,7 +2732,7 @@ ix86_function_ok_for_sibcall (tree decl, tree exp) tree type; /* We're looking at the CALL_EXPR, we need the type of the function. */ - type = TREE_OPERAND (exp, 0); /* pointer expression */ + type = CALL_EXPR_FN (exp); /* pointer expression */ type = TREE_TYPE (type); /* pointer type */ type = TREE_TYPE (type); /* function type */ @@ -2609,10 +2983,10 @@ ix86_function_regparm (tree type, tree decl) return regparm; } -/* Return 1 or 2, if we can pass up to 8 SFmode (1) and DFmode (2) arguments - in SSE registers for a function with the indicated TYPE and DECL. - DECL may be NULL when calling function indirectly - or considering a libcall. Otherwise return 0. */ +/* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and + DFmode (2) arguments in SSE registers for a function with the + indicated TYPE and DECL. DECL may be NULL when calling function + indirectly or considering a libcall. Otherwise return 0. */ static int ix86_function_sseregparm (tree type, tree decl) @@ -2637,9 +3011,9 @@ ix86_function_sseregparm (tree type, tree decl) return 2; } - /* For local functions, pass SFmode (and DFmode for SSE2) arguments - in SSE registers even for 32-bit mode and not just 3, but up to - 8 SSE arguments in registers. */ + /* For local functions, pass up to SSE_REGPARM_MAX SFmode + (and DFmode for SSE2) arguments in SSE registers, + even for 32-bit targets. 
*/ if (!TARGET_64BIT && decl && TARGET_SSE_MATH && flag_unit_at_a_time && !profile_flag) { @@ -2727,15 +3101,29 @@ ix86_function_arg_regno_p (int regno) { int i; if (!TARGET_64BIT) - return (regno < REGPARM_MAX - || (TARGET_MMX && MMX_REGNO_P (regno) - && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX)) - || (TARGET_SSE && SSE_REGNO_P (regno) - && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))); - - if (TARGET_SSE && SSE_REGNO_P (regno) - && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)) - return true; + { + if (TARGET_MACHO) + return (regno < REGPARM_MAX + || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno])); + else + return (regno < REGPARM_MAX + || (TARGET_MMX && MMX_REGNO_P (regno) + && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX)) + || (TARGET_SSE && SSE_REGNO_P (regno) + && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))); + } + + if (TARGET_MACHO) + { + if (SSE_REGNO_P (regno) && TARGET_SSE) + return true; + } + else + { + if (TARGET_SSE && SSE_REGNO_P (regno) + && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)) + return true; + } /* RAX is used as hidden argument to va_arg functions. */ if (!regno) return true; @@ -3839,16 +4227,31 @@ ix86_function_arg_boundary (enum machine_mode mode, tree type) bool ix86_function_value_regno_p (int regno) { - if (regno == 0 - || (regno == FIRST_FLOAT_REG && TARGET_FLOAT_RETURNS_IN_80387) - || (regno == FIRST_SSE_REG && TARGET_SSE)) - return true; + if (TARGET_MACHO) + { + if (!TARGET_64BIT) + { + return ((regno) == 0 + || ((regno) == FIRST_FLOAT_REG && TARGET_FLOAT_RETURNS_IN_80387) + || ((regno) == FIRST_SSE_REG && TARGET_SSE)); + } + return ((regno) == 0 || (regno) == FIRST_FLOAT_REG + || ((regno) == FIRST_SSE_REG && TARGET_SSE) + || ((regno) == FIRST_FLOAT_REG && TARGET_FLOAT_RETURNS_IN_80387)); + } + else + { + if (regno == 0 + || (regno == FIRST_FLOAT_REG && TARGET_FLOAT_RETURNS_IN_80387) + || (regno == FIRST_SSE_REG && TARGET_SSE)) + return true; - if (!TARGET_64BIT - && (regno == FIRST_MMX_REG && TARGET_MMX)) - return true; + if (!TARGET_64BIT + && (regno == FIRST_MMX_REG && TARGET_MMX)) + return true; - return false; + return false; + } } /* Define how to find the value returned by a function. @@ -4237,7 +4640,7 @@ ix86_va_start (tree valist, rtx nextarg) if (cfun->va_list_gpr_size) { type = TREE_TYPE (gpr); - t = build2 (MODIFY_EXPR, type, gpr, + t = build2 (GIMPLE_MODIFY_STMT, type, gpr, build_int_cst (type, n_gpr * 8)); TREE_SIDE_EFFECTS (t) = 1; expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); @@ -4246,7 +4649,7 @@ ix86_va_start (tree valist, rtx nextarg) if (cfun->va_list_fpr_size) { type = TREE_TYPE (fpr); - t = build2 (MODIFY_EXPR, type, fpr, + t = build2 (GIMPLE_MODIFY_STMT, type, fpr, build_int_cst (type, n_fpr * 16 + 8*REGPARM_MAX)); TREE_SIDE_EFFECTS (t) = 1; expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); @@ -4258,7 +4661,7 @@ ix86_va_start (tree valist, rtx nextarg) if (words != 0) t = build2 (PLUS_EXPR, type, t, build_int_cst (type, words * UNITS_PER_WORD)); - t = build2 (MODIFY_EXPR, type, ovf, t); + t = build2 (GIMPLE_MODIFY_STMT, type, ovf, t); TREE_SIDE_EFFECTS (t) = 1; expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); @@ -4268,7 +4671,7 @@ ix86_va_start (tree valist, rtx nextarg) Prologue of the function save it right above stack frame. 
*/ type = TREE_TYPE (sav); t = make_tree (type, frame_pointer_rtx); - t = build2 (MODIFY_EXPR, type, sav, t); + t = build2 (GIMPLE_MODIFY_STMT, type, sav, t); TREE_SIDE_EFFECTS (t) = 1; expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); } @@ -4405,7 +4808,7 @@ ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p) /* int_addr = gpr + sav; */ t = fold_convert (ptr_type_node, gpr); t = build2 (PLUS_EXPR, ptr_type_node, sav, t); - t = build2 (MODIFY_EXPR, void_type_node, int_addr, t); + t = build2 (GIMPLE_MODIFY_STMT, void_type_node, int_addr, t); gimplify_and_add (t, pre_p); } if (needed_sseregs) @@ -4413,7 +4816,7 @@ ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p) /* sse_addr = fpr + sav; */ t = fold_convert (ptr_type_node, fpr); t = build2 (PLUS_EXPR, ptr_type_node, sav, t); - t = build2 (MODIFY_EXPR, void_type_node, sse_addr, t); + t = build2 (GIMPLE_MODIFY_STMT, void_type_node, sse_addr, t); gimplify_and_add (t, pre_p); } if (need_temp) @@ -4423,7 +4826,7 @@ ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p) /* addr = &temp; */ t = build1 (ADDR_EXPR, build_pointer_type (type), temp); - t = build2 (MODIFY_EXPR, void_type_node, addr, t); + t = build2 (GIMPLE_MODIFY_STMT, void_type_node, addr, t); gimplify_and_add (t, pre_p); for (i = 0; i < XVECLEN (container, 0); i++) @@ -4457,7 +4860,7 @@ ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p) size_int (INTVAL (XEXP (slot, 1))))); dest = build_va_arg_indirect_ref (dest_addr); - t = build2 (MODIFY_EXPR, void_type_node, dest, src); + t = build2 (GIMPLE_MODIFY_STMT, void_type_node, dest, src); gimplify_and_add (t, pre_p); } } @@ -4466,14 +4869,14 @@ ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p) { t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr, build_int_cst (TREE_TYPE (gpr), needed_intregs * 8)); - t = build2 (MODIFY_EXPR, TREE_TYPE (gpr), gpr, t); + t = build2 (GIMPLE_MODIFY_STMT, TREE_TYPE (gpr), gpr, t); gimplify_and_add (t, pre_p); } if (needed_sseregs) { t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr, build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16)); - t = build2 (MODIFY_EXPR, TREE_TYPE (fpr), fpr, t); + t = build2 (GIMPLE_MODIFY_STMT, TREE_TYPE (fpr), fpr, t); gimplify_and_add (t, pre_p); } @@ -4500,12 +4903,12 @@ ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p) } gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue); - t2 = build2 (MODIFY_EXPR, void_type_node, addr, t); + t2 = build2 (GIMPLE_MODIFY_STMT, void_type_node, addr, t); gimplify_and_add (t2, pre_p); t = build2 (PLUS_EXPR, TREE_TYPE (t), t, build_int_cst (TREE_TYPE (t), rsize * UNITS_PER_WORD)); - t = build2 (MODIFY_EXPR, TREE_TYPE (ovf), ovf, t); + t = build2 (GIMPLE_MODIFY_STMT, TREE_TYPE (ovf), ovf, t); gimplify_and_add (t, pre_p); if (container) @@ -4537,7 +4940,7 @@ ix86_check_movabs (rtx insn, int opnum) mem = XEXP (set, opnum); while (GET_CODE (mem) == SUBREG) mem = SUBREG_REG (mem); - gcc_assert (GET_CODE (mem) == MEM); + gcc_assert (MEM_P (mem)); return (volatile_ok || !MEM_VOLATILE_P (mem)); } @@ -4573,6 +4976,8 @@ init_ext_80387_constants (void) int standard_80387_constant_p (rtx x) { + REAL_VALUE_TYPE r; + if (GET_CODE (x) != CONST_DOUBLE || !FLOAT_MODE_P (GET_MODE (x))) return -1; @@ -4581,23 +4986,30 @@ standard_80387_constant_p (rtx x) if (x == CONST1_RTX (GET_MODE (x))) return 2; + REAL_VALUE_FROM_CONST_DOUBLE (r, x); + /* For XFmode constants, try to find a special 80387 instruction when optimizing for size or on those 
CPUs that benefit from them. */ if (GET_MODE (x) == XFmode && (optimize_size || x86_ext_80387_constants & TUNEMASK)) { - REAL_VALUE_TYPE r; int i; if (! ext_80387_constants_init) init_ext_80387_constants (); - REAL_VALUE_FROM_CONST_DOUBLE (r, x); for (i = 0; i < 5; i++) if (real_identical (&r, &ext_80387_constants_table[i])) return i + 3; } + /* Load of the constant -0.0 or -1.0 will be split as + fldz;fchs or fld1;fchs sequence. */ + if (real_isnegzero (&r)) + return 8; + if (real_identical (&r, &dconstm1)) + return 9; + return 0; } @@ -4623,6 +5035,9 @@ standard_80387_constant_opcode (rtx x) return "fldl2t"; case 7: return "fldpi"; + case 8: + case 9: + return "#"; default: gcc_unreachable (); } @@ -5216,18 +5631,22 @@ ix86_compute_frame_layout (struct ix86_frame *frame) frame->to_allocate -= frame->red_zone_size; frame->stack_pointer_offset -= frame->red_zone_size; #if 0 - fprintf (stderr, "nregs: %i\n", frame->nregs); - fprintf (stderr, "size: %i\n", size); - fprintf (stderr, "alignment1: %i\n", stack_alignment_needed); - fprintf (stderr, "padding1: %i\n", frame->padding1); - fprintf (stderr, "va_arg: %i\n", frame->va_arg_size); - fprintf (stderr, "padding2: %i\n", frame->padding2); - fprintf (stderr, "to_allocate: %i\n", frame->to_allocate); - fprintf (stderr, "red_zone_size: %i\n", frame->red_zone_size); - fprintf (stderr, "frame_pointer_offset: %i\n", frame->frame_pointer_offset); - fprintf (stderr, "hard_frame_pointer_offset: %i\n", - frame->hard_frame_pointer_offset); - fprintf (stderr, "stack_pointer_offset: %i\n", frame->stack_pointer_offset); + fprintf (stderr, "\n"); + fprintf (stderr, "nregs: %ld\n", (long)frame->nregs); + fprintf (stderr, "size: %ld\n", (long)size); + fprintf (stderr, "alignment1: %ld\n", (long)stack_alignment_needed); + fprintf (stderr, "padding1: %ld\n", (long)frame->padding1); + fprintf (stderr, "va_arg: %ld\n", (long)frame->va_arg_size); + fprintf (stderr, "padding2: %ld\n", (long)frame->padding2); + fprintf (stderr, "to_allocate: %ld\n", (long)frame->to_allocate); + fprintf (stderr, "red_zone_size: %ld\n", (long)frame->red_zone_size); + fprintf (stderr, "frame_pointer_offset: %ld\n", (long)frame->frame_pointer_offset); + fprintf (stderr, "hard_frame_pointer_offset: %ld\n", + (long)frame->hard_frame_pointer_offset); + fprintf (stderr, "stack_pointer_offset: %ld\n", (long)frame->stack_pointer_offset); + fprintf (stderr, "current_function_is_leaf: %ld\n", (long)current_function_is_leaf); + fprintf (stderr, "current_function_calls_alloca: %ld\n", (long)current_function_calls_alloca); + fprintf (stderr, "x86_current_function_calls_tls_descriptor: %ld\n", (long)ix86_current_function_calls_tls_descriptor); #endif } @@ -5289,7 +5708,7 @@ pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset, int style) shouldn't be used together with huge frame sizes in one function because of the frame_size check in sibcall.c. 
*/ gcc_assert (style); - r11 = gen_rtx_REG (DImode, FIRST_REX_INT_REG + 3 /* R11 */); + r11 = gen_rtx_REG (DImode, R11_REG); insn = emit_insn (gen_rtx_SET (DImode, r11, offset)); if (style < 0) RTX_FRAME_RELATED_P (insn) = 1; @@ -5546,7 +5965,7 @@ ix86_emit_restore_regs_using_mov (rtx pointer, HOST_WIDE_INT offset, { rtx r11; - r11 = gen_rtx_REG (DImode, FIRST_REX_INT_REG + 3 /* R11 */); + r11 = gen_rtx_REG (DImode, R11_REG); emit_move_insn (r11, GEN_INT (offset)); emit_insn (gen_adddi3 (r11, r11, pointer)); base_address = gen_rtx_MEM (Pmode, r11); @@ -5772,7 +6191,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) int retval = 1; enum ix86_address_seg seg = SEG_DEFAULT; - if (GET_CODE (addr) == REG || GET_CODE (addr) == SUBREG) + if (REG_P (addr) || GET_CODE (addr) == SUBREG) base = addr; else if (GET_CODE (addr) == PLUS) { @@ -5849,7 +6268,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) /* We're called for lea too, which implements ashift on occasion. */ index = XEXP (addr, 0); tmp = XEXP (addr, 1); - if (GET_CODE (tmp) != CONST_INT) + if (!CONST_INT_P (tmp)) return 0; scale = INTVAL (tmp); if ((unsigned HOST_WIDE_INT) scale > 3) @@ -5863,7 +6282,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) /* Extract the integral value of scale. */ if (scale_rtx) { - if (GET_CODE (scale_rtx) != CONST_INT) + if (!CONST_INT_P (scale_rtx)) return 0; scale = INTVAL (scale_rtx); } @@ -5992,7 +6411,7 @@ ix86_find_base_term (rtx x) return x; term = XEXP (x, 0); if (GET_CODE (term) == PLUS - && (GET_CODE (XEXP (term, 1)) == CONST_INT + && (CONST_INT_P (XEXP (term, 1)) || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE)) term = XEXP (term, 0); if (GET_CODE (term) != UNSPEC @@ -6052,7 +6471,7 @@ legitimate_constant_p (rtx x) if (GET_CODE (x) == PLUS) { - if (GET_CODE (XEXP (x, 1)) != CONST_INT) + if (!CONST_INT_P (XEXP (x, 1))) return false; x = XEXP (x, 0); } @@ -6155,7 +6574,7 @@ legitimate_pic_operand_p (rtx x) case CONST: inner = XEXP (x, 0); if (GET_CODE (inner) == PLUS - && GET_CODE (XEXP (inner, 1)) == CONST_INT) + && CONST_INT_P (XEXP (inner, 1))) inner = XEXP (inner, 0); /* Only some unspecs are valid as "constants". */ @@ -6206,7 +6625,7 @@ legitimate_pic_address_disp_p (rtx disp) break; op0 = XEXP (XEXP (disp, 0), 0); op1 = XEXP (XEXP (disp, 0), 1); - if (GET_CODE (op1) != CONST_INT + if (!CONST_INT_P (op1) || INTVAL (op1) >= 16*1024*1024 || INTVAL (op1) < -16*1024*1024) break; @@ -6250,7 +6669,7 @@ legitimate_pic_address_disp_p (rtx disp) saw_plus = false; if (GET_CODE (disp) == PLUS) { - if (GET_CODE (XEXP (disp, 1)) != CONST_INT) + if (!CONST_INT_P (XEXP (disp, 1))) return 0; disp = XEXP (disp, 0); saw_plus = true; @@ -6478,7 +6897,7 @@ legitimate_address_p (enum machine_mode mode, rtx addr, int strict) if (GET_CODE (disp) != CONST || GET_CODE (XEXP (disp, 0)) != PLUS || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC - || GET_CODE (XEXP (XEXP (disp, 0), 1)) != CONST_INT + || !CONST_INT_P (XEXP (XEXP (disp, 0), 1)) || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF)) { @@ -6515,7 +6934,7 @@ legitimate_address_p (enum machine_mode mode, rtx addr, int strict) correct fix for crash to disable this test. 
*/ } else if (GET_CODE (disp) != LABEL_REF - && GET_CODE (disp) != CONST_INT + && !CONST_INT_P (disp) && (GET_CODE (disp) != CONST || !legitimate_constant_p (disp)) && (GET_CODE (disp) != SYMBOL_REF @@ -6691,7 +7110,7 @@ legitimize_pic_address (rtx orig, rtx reg) } else { - if (GET_CODE (addr) == CONST_INT + if (CONST_INT_P (addr) && !x86_64_immediate_operand (addr, VOIDmode)) { if (reg) @@ -6722,7 +7141,7 @@ legitimize_pic_address (rtx orig, rtx reg) /* Check first to see if this is a constant offset from a @GOTOFF symbol reference. */ if (local_symbolic_operand (op0, Pmode) - && GET_CODE (op1) == CONST_INT) + && CONST_INT_P (op1)) { if (!TARGET_64BIT) { @@ -6757,7 +7176,7 @@ legitimize_pic_address (rtx orig, rtx reg) new = legitimize_pic_address (XEXP (addr, 1), base == reg ? NULL_RTX : reg); - if (GET_CODE (new) == CONST_INT) + if (CONST_INT_P (new)) new = plus_constant (base, INTVAL (new)); else { @@ -6999,7 +7418,7 @@ legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode) /* Canonicalize shifts by 0, 1, 2, 3 into multiply */ if (GET_CODE (x) == ASHIFT - && GET_CODE (XEXP (x, 1)) == CONST_INT + && CONST_INT_P (XEXP (x, 1)) && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4) { changed = 1; @@ -7013,7 +7432,7 @@ legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode) /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */ if (GET_CODE (XEXP (x, 0)) == ASHIFT - && GET_CODE (XEXP (XEXP (x, 0), 1)) == CONST_INT + && CONST_INT_P (XEXP (XEXP (x, 0), 1)) && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4) { changed = 1; @@ -7024,7 +7443,7 @@ legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode) } if (GET_CODE (XEXP (x, 1)) == ASHIFT - && GET_CODE (XEXP (XEXP (x, 1), 1)) == CONST_INT + && CONST_INT_P (XEXP (XEXP (x, 1), 1)) && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4) { changed = 1; @@ -7067,12 +7486,12 @@ legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode) rtx constant; rtx other = NULL_RTX; - if (GET_CODE (XEXP (x, 1)) == CONST_INT) + if (CONST_INT_P (XEXP (x, 1))) { constant = XEXP (x, 1); other = XEXP (XEXP (XEXP (x, 0), 1), 1); } - else if (GET_CODE (XEXP (XEXP (XEXP (x, 0), 1), 1)) == CONST_INT) + else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1))) { constant = XEXP (XEXP (XEXP (x, 0), 1), 1); other = XEXP (x, 1); @@ -7106,8 +7525,8 @@ legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode) } if (changed - && GET_CODE (XEXP (x, 1)) == REG - && GET_CODE (XEXP (x, 0)) == REG) + && REG_P (XEXP (x, 1)) + && REG_P (XEXP (x, 0))) return x; if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1))) @@ -7119,7 +7538,7 @@ legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode) if (changed && legitimate_address_p (mode, x, FALSE)) return x; - if (GET_CODE (XEXP (x, 0)) == REG) + if (REG_P (XEXP (x, 0))) { rtx temp = gen_reg_rtx (Pmode); rtx val = force_operand (XEXP (x, 1), temp); @@ -7130,7 +7549,7 @@ legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode) return x; } - else if (GET_CODE (XEXP (x, 1)) == REG) + else if (REG_P (XEXP (x, 1))) { rtx temp = gen_reg_rtx (Pmode); rtx val = force_operand (XEXP (x, 0), temp); @@ -7205,7 +7624,7 @@ output_pic_addr_const (FILE *file, rtx x, int code) case PLUS: /* Some assemblers need integer constants to appear first. 
*/ - if (GET_CODE (XEXP (x, 0)) == CONST_INT) + if (CONST_INT_P (XEXP (x, 0))) { output_pic_addr_const (file, XEXP (x, 0), code); putc ('+', file); @@ -7213,7 +7632,7 @@ output_pic_addr_const (FILE *file, rtx x, int code) } else { - gcc_assert (GET_CODE (XEXP (x, 1)) == CONST_INT); + gcc_assert (CONST_INT_P (XEXP (x, 1))); output_pic_addr_const (file, XEXP (x, 1), code); putc ('+', file); output_pic_addr_const (file, XEXP (x, 0), code); @@ -7322,7 +7741,7 @@ ix86_delegitimize_address (rtx orig_x) /* This is the result, or NULL. */ rtx result = NULL_RTX; - if (GET_CODE (x) == MEM) + if (MEM_P (x)) x = XEXP (x, 0); if (TARGET_64BIT) @@ -7330,7 +7749,7 @@ ix86_delegitimize_address (rtx orig_x) if (GET_CODE (x) != CONST || GET_CODE (XEXP (x, 0)) != UNSPEC || XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL - || GET_CODE (orig_x) != MEM) + || !MEM_P (orig_x)) return orig_x; return XVECEXP (XEXP (x, 0), 0, 0); } @@ -7339,7 +7758,7 @@ ix86_delegitimize_address (rtx orig_x) || GET_CODE (XEXP (x, 1)) != CONST) return orig_x; - if (GET_CODE (XEXP (x, 0)) == REG + if (REG_P (XEXP (x, 0)) && REGNO (XEXP (x, 0)) == PIC_OFFSET_TABLE_REGNUM) /* %ebx + GOT/GOTOFF */ ; @@ -7347,15 +7766,15 @@ ix86_delegitimize_address (rtx orig_x) { /* %ebx + %reg * scale + GOT/GOTOFF */ reg_addend = XEXP (x, 0); - if (GET_CODE (XEXP (reg_addend, 0)) == REG + if (REG_P (XEXP (reg_addend, 0)) && REGNO (XEXP (reg_addend, 0)) == PIC_OFFSET_TABLE_REGNUM) reg_addend = XEXP (reg_addend, 1); - else if (GET_CODE (XEXP (reg_addend, 1)) == REG + else if (REG_P (XEXP (reg_addend, 1)) && REGNO (XEXP (reg_addend, 1)) == PIC_OFFSET_TABLE_REGNUM) reg_addend = XEXP (reg_addend, 0); else return orig_x; - if (GET_CODE (reg_addend) != REG + if (!REG_P (reg_addend) && GET_CODE (reg_addend) != MULT && GET_CODE (reg_addend) != ASHIFT) return orig_x; @@ -7365,19 +7784,19 @@ ix86_delegitimize_address (rtx orig_x) x = XEXP (XEXP (x, 1), 0); if (GET_CODE (x) == PLUS - && GET_CODE (XEXP (x, 1)) == CONST_INT) + && CONST_INT_P (XEXP (x, 1))) { const_addend = XEXP (x, 1); x = XEXP (x, 0); } if (GET_CODE (x) == UNSPEC - && ((XINT (x, 1) == UNSPEC_GOT && GET_CODE (orig_x) == MEM) - || (XINT (x, 1) == UNSPEC_GOTOFF && GET_CODE (orig_x) != MEM))) + && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x)) + || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x)))) result = XVECEXP (x, 0, 0); if (TARGET_MACHO && darwin_local_data_pic (x) - && GET_CODE (orig_x) != MEM) + && !MEM_P (orig_x)) result = XEXP (x, 0); if (! result) @@ -7675,7 +8094,7 @@ print_operand (FILE *file, rtx x, int code) case ASM_INTEL: /* Intel syntax. For absolute addresses, registers should not be surrounded by braces. */ - if (GET_CODE (x) != REG) + if (!REG_P (x)) { putc ('[', file); PRINT_OPERAND (file, x, 0); @@ -7735,6 +8154,10 @@ print_operand (FILE *file, rtx x, int code) /* This is the size of op from size of operand. */ switch (GET_MODE_SIZE (GET_MODE (x))) { + case 1: + putc ('b', file); + return; + case 2: #ifdef HAVE_GAS_FILDS_FISTS putc ('s', file); @@ -7785,7 +8208,7 @@ print_operand (FILE *file, rtx x, int code) break; case 's': - if (GET_CODE (x) == CONST_INT || ! SHIFT_DOUBLE_OMITS_COUNT) + if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT) { PRINT_OPERAND (file, x, 0); putc (',', file); @@ -7923,10 +8346,10 @@ print_operand (FILE *file, rtx x, int code) } } - if (GET_CODE (x) == REG) + if (REG_P (x)) print_reg (x, code, file); - else if (GET_CODE (x) == MEM) + else if (MEM_P (x)) { /* No `byte ptr' prefix for call instructions. 
*/ if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P') @@ -7959,7 +8382,7 @@ print_operand (FILE *file, rtx x, int code) x = XEXP (x, 0); /* Avoid (%rip) for call operands. */ if (CONSTANT_ADDRESS_P (x) && code == 'P' - && GET_CODE (x) != CONST_INT) + && !CONST_INT_P (x)) output_addr_const (file, x); else if (this_is_asm_operands && ! address_operand (x, VOIDmode)) output_operand_lossage ("invalid constraints for operand"); @@ -8011,7 +8434,7 @@ print_operand (FILE *file, rtx x, int code) if (code != 'P') { - if (GET_CODE (x) == CONST_INT || GET_CODE (x) == CONST_DOUBLE) + if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE) { if (ASSEMBLER_DIALECT == ASM_ATT) putc ('$', file); @@ -8025,7 +8448,7 @@ print_operand (FILE *file, rtx x, int code) fputs ("OFFSET FLAT:", file); } } - if (GET_CODE (x) == CONST_INT) + if (CONST_INT_P (x)) fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); else if (flag_pic) output_pic_addr_const (file, x, code); @@ -8069,7 +8492,7 @@ print_operand_address (FILE *file, rtx addr) { /* Displacement only requires special attention. */ - if (GET_CODE (disp) == CONST_INT) + if (CONST_INT_P (disp)) { if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT) { @@ -8089,7 +8512,7 @@ print_operand_address (FILE *file, rtx addr) { if (GET_CODE (disp) == CONST && GET_CODE (XEXP (disp, 0)) == PLUS - && GET_CODE (XEXP (XEXP (disp, 0), 1)) == CONST_INT) + && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) disp = XEXP (XEXP (disp, 0), 0); if (GET_CODE (disp) == LABEL_REF || (GET_CODE (disp) == SYMBOL_REF @@ -8132,7 +8555,7 @@ print_operand_address (FILE *file, rtx addr) /* Pull out the offset of a symbol; print any symbol itself. */ if (GET_CODE (disp) == CONST && GET_CODE (XEXP (disp, 0)) == PLUS - && GET_CODE (XEXP (XEXP (disp, 0), 1)) == CONST_INT) + && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) { offset = XEXP (XEXP (disp, 0), 1); disp = gen_rtx_CONST (VOIDmode, @@ -8143,7 +8566,7 @@ print_operand_address (FILE *file, rtx addr) output_pic_addr_const (file, disp, 0); else if (GET_CODE (disp) == LABEL_REF) output_asm_label (disp); - else if (GET_CODE (disp) == CONST_INT) + else if (CONST_INT_P (disp)) offset = disp; else output_addr_const (file, disp); @@ -8242,7 +8665,7 @@ split_di (rtx operands[], int num, rtx lo_half[], rtx hi_half[]) /* simplify_subreg refuse to split volatile memory addresses, but we still have to handle it. */ - if (GET_CODE (op) == MEM) + if (MEM_P (op)) { lo_half[num] = adjust_address (op, SImode, 0); hi_half[num] = adjust_address (op, SImode, 4); @@ -8273,7 +8696,7 @@ split_ti (rtx operands[], int num, rtx lo_half[], rtx hi_half[]) /* simplify_subreg refuse to split volatile memory addresses, but we still have to handle it. */ - if (GET_CODE (op) == MEM) + if (MEM_P (op)) { lo_half[num] = adjust_address (op, DImode, 0); hi_half[num] = adjust_address (op, DImode, 8); @@ -8317,10 +8740,10 @@ output_387_binary_op (rtx insn, rtx *operands) if (STACK_REG_P (operands[0]) && ((REG_P (operands[1]) && REGNO (operands[0]) == REGNO (operands[1]) - && (STACK_REG_P (operands[2]) || GET_CODE (operands[2]) == MEM)) + && (STACK_REG_P (operands[2]) || MEM_P (operands[2]))) || (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]) - && (STACK_REG_P (operands[1]) || GET_CODE (operands[1]) == MEM))) + && (STACK_REG_P (operands[1]) || MEM_P (operands[1])))) && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2]))) ; /* ok */ else @@ -8393,7 +8816,7 @@ output_387_binary_op (rtx insn, rtx *operands) /* know operands[0] == operands[1]. 
*/ - if (GET_CODE (operands[2]) == MEM) + if (MEM_P (operands[2])) { p = "%z2\t%2"; break; @@ -8423,13 +8846,13 @@ output_387_binary_op (rtx insn, rtx *operands) case MINUS: case DIV: - if (GET_CODE (operands[1]) == MEM) + if (MEM_P (operands[1])) { p = "r%z1\t%1"; break; } - if (GET_CODE (operands[2]) == MEM) + if (MEM_P (operands[2])) { p = "%z2\t%2"; break; @@ -8575,7 +8998,7 @@ emit_i387_cw_initialization (int mode) rtx reg = gen_reg_rtx (HImode); emit_insn (gen_x86_fnstcw_1 (stored_mode)); - emit_move_insn (reg, stored_mode); + emit_move_insn (reg, copy_rtx (stored_mode)); if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL || optimize_size) { @@ -8668,7 +9091,7 @@ output_fix_trunc (rtx insn, rtx *operands, int fisttp) output_asm_insn ("fld\t%y1", operands); gcc_assert (STACK_TOP_P (operands[1])); - gcc_assert (GET_CODE (operands[0]) == MEM); + gcc_assert (MEM_P (operands[0])); if (fisttp) output_asm_insn ("fisttp%z0\t%0", operands); @@ -8956,7 +9379,7 @@ ix86_expand_move (enum machine_mode mode, rtx operands[]) if (MACHOPIC_PURE) { rtx temp = ((reload_in_progress - || ((op0 && GET_CODE (op0) == REG) + || ((op0 && REG_P (op0)) && mode == Pmode)) ? op0 : gen_reg_rtx (Pmode)); op1 = machopic_indirect_data_reference (op1, temp); @@ -8971,7 +9394,7 @@ ix86_expand_move (enum machine_mode mode, rtx operands[]) } else { - if (GET_CODE (op0) == MEM) + if (MEM_P (op0)) op1 = force_reg (Pmode, op1); else op1 = legitimize_address (op1, op1, Pmode); @@ -8979,10 +9402,10 @@ ix86_expand_move (enum machine_mode mode, rtx operands[]) } else { - if (GET_CODE (op0) == MEM + if (MEM_P (op0) && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode) || !push_operand (op0, mode)) - && GET_CODE (op1) == MEM) + && MEM_P (op1)) op1 = force_reg (mode, op1); if (push_operand (op0, mode) @@ -9084,8 +9507,16 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) } if (TARGET_SSE2 && mode == V2DFmode) - { - rtx zero; + { + rtx zero; + + if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL) + { + op0 = gen_lowpart (V2DFmode, op0); + op1 = gen_lowpart (V2DFmode, op1); + emit_insn (gen_sse2_movupd (op0, op1)); + return; + } /* When SSE registers are split into halves, we can avoid writing to the top half twice. */ @@ -9113,7 +9544,15 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) emit_insn (gen_sse2_loadhpd (op0, op0, m)); } else - { + { + if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL) + { + op0 = gen_lowpart (V4SFmode, op0); + op1 = gen_lowpart (V4SFmode, op1); + emit_insn (gen_sse_movups (op0, op1)); + return; + } + if (TARGET_SSE_PARTIAL_REG_DEPENDENCY) emit_move_insn (op0, CONST0_RTX (mode)); else @@ -9189,6 +9628,43 @@ ix86_expand_push (enum machine_mode mode, rtx x) emit_move_insn (tmp, x); } +/* Helper function of ix86_fixup_binary_operands to canonicalize + operand order. Returns true if the operands should be swapped. */ + +static bool +ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode, + rtx operands[]) +{ + rtx dst = operands[0]; + rtx src1 = operands[1]; + rtx src2 = operands[2]; + + /* If the operation is not commutative, we can't do anything. */ + if (GET_RTX_CLASS (code) != RTX_COMM_ARITH) + return false; + + /* Highest priority is that src1 should match dst. */ + if (rtx_equal_p (dst, src1)) + return false; + if (rtx_equal_p (dst, src2)) + return true; + + /* Next highest priority is that immediate constants come second. 
*/ + if (immediate_operand (src2, mode)) + return false; + if (immediate_operand (src1, mode)) + return true; + + /* Lowest priority is that memory references should come second. */ + if (MEM_P (src2)) + return false; + if (MEM_P (src1)) + return true; + + return false; +} + + /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the destination to use for the operation. If different from the true destination in operands[0], a copy operation will be required. */ @@ -9197,55 +9673,46 @@ rtx ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode, rtx operands[]) { - int matching_memory; - rtx src1, src2, dst; - - dst = operands[0]; - src1 = operands[1]; - src2 = operands[2]; + rtx dst = operands[0]; + rtx src1 = operands[1]; + rtx src2 = operands[2]; - /* Recognize = for commutative operators */ - if (GET_RTX_CLASS (code) == RTX_COMM_ARITH - && (rtx_equal_p (dst, src2) - || immediate_operand (src1, mode))) + /* Canonicalize operand order. */ + if (ix86_swap_binary_operands_p (code, mode, operands)) { rtx temp = src1; src1 = src2; src2 = temp; } - /* If the destination is memory, and we do not have matching source - operands, do things in registers. */ - matching_memory = 0; - if (GET_CODE (dst) == MEM) - { - if (rtx_equal_p (dst, src1)) - matching_memory = 1; - else if (GET_RTX_CLASS (code) == RTX_COMM_ARITH - && rtx_equal_p (dst, src2)) - matching_memory = 2; - else - dst = gen_reg_rtx (mode); - } - /* Both source operands cannot be in memory. */ - if (GET_CODE (src1) == MEM && GET_CODE (src2) == MEM) + if (MEM_P (src1) && MEM_P (src2)) { - if (matching_memory != 2) - src2 = force_reg (mode, src2); + /* Optimization: Only read from memory once. */ + if (rtx_equal_p (src1, src2)) + { + src2 = force_reg (mode, src2); + src1 = src2; + } else - src1 = force_reg (mode, src1); + src2 = force_reg (mode, src2); } - /* If the operation is not commutable, source 1 cannot be a constant - or non-matching memory. */ - if ((CONSTANT_P (src1) - || (!matching_memory && GET_CODE (src1) == MEM)) - && GET_RTX_CLASS (code) != RTX_COMM_ARITH) + /* If the destination is memory, and we do not have matching source + operands, do things in registers. */ + if (MEM_P (dst) && !rtx_equal_p (dst, src1)) + dst = gen_reg_rtx (mode); + + /* Source 1 cannot be a constant. */ + if (CONSTANT_P (src1)) + src1 = force_reg (mode, src1); + + /* Source 1 cannot be a non-matching memory. */ + if (MEM_P (src1) && !rtx_equal_p (dst, src1)) src1 = force_reg (mode, src1); - src1 = operands[1] = src1; - src2 = operands[2] = src2; + operands[1] = src1; + operands[2] = src2; return dst; } @@ -9299,28 +9766,37 @@ ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode, appropriate constraints. */ int -ix86_binary_operator_ok (enum rtx_code code, - enum machine_mode mode ATTRIBUTE_UNUSED, +ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode, rtx operands[3]) { + rtx dst = operands[0]; + rtx src1 = operands[1]; + rtx src2 = operands[2]; + /* Both source operands cannot be in memory. */ - if (GET_CODE (operands[1]) == MEM && GET_CODE (operands[2]) == MEM) - return 0; - /* If the operation is not commutable, source 1 cannot be a constant. */ - if (CONSTANT_P (operands[1]) && GET_RTX_CLASS (code) != RTX_COMM_ARITH) + if (MEM_P (src1) && MEM_P (src2)) return 0; + + /* Canonicalize operand order for commutative operators. 
*/ + if (ix86_swap_binary_operands_p (code, mode, operands)) + { + rtx temp = src1; + src1 = src2; + src2 = temp; + } + /* If the destination is memory, we must have a matching source operand. */ - if (GET_CODE (operands[0]) == MEM - && ! (rtx_equal_p (operands[0], operands[1]) - || (GET_RTX_CLASS (code) == RTX_COMM_ARITH - && rtx_equal_p (operands[0], operands[2])))) + if (MEM_P (dst) && !rtx_equal_p (dst, src1)) + return 0; + + /* Source 1 cannot be a constant. */ + if (CONSTANT_P (src1)) return 0; - /* If the operation is not commutable and the source 1 is memory, we must - have a matching destination. */ - if (GET_CODE (operands[1]) == MEM - && GET_RTX_CLASS (code) != RTX_COMM_ARITH - && ! rtx_equal_p (operands[0], operands[1])) + + /* Source 1 cannot be a non-matching memory. */ + if (MEM_P (src1) && !rtx_equal_p (dst, src1)) return 0; + return 1; } @@ -9383,97 +9859,314 @@ ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED, rtx operands[2] ATTRIBUTE_UNUSED) { /* If one of operands is memory, source and destination must match. */ - if ((GET_CODE (operands[0]) == MEM - || GET_CODE (operands[1]) == MEM) + if ((MEM_P (operands[0]) + || MEM_P (operands[1])) && ! rtx_equal_p (operands[0], operands[1])) return FALSE; return TRUE; } -/* A subroutine of ix86_expand_fp_absneg_operator and copysign expanders. - Create a mask for the sign bit in MODE for an SSE register. If VECT is - true, then replicate the mask for all elements of the vector register. - If INVERT is true, then create a mask excluding the sign bit. */ +/* Post-reload splitter for converting an SF or DFmode value in an + SSE register into an unsigned SImode. */ -rtx -ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert) +void +ix86_split_convert_uns_si_sse (rtx operands[]) { - enum machine_mode vec_mode; - HOST_WIDE_INT hi, lo; - int shift = 63; - rtvec v; - rtx mask; - - /* Find the sign bit, sign extended to 2*HWI. */ - if (mode == SFmode) - lo = 0x80000000, hi = lo < 0; - else if (HOST_BITS_PER_WIDE_INT >= 64) - lo = (HOST_WIDE_INT)1 << shift, hi = -1; - else - lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT); - - if (invert) - lo = ~lo, hi = ~hi; + enum machine_mode vecmode; + rtx value, large, zero_or_two31, input, two31, x; - /* Force this value into the low part of a fp vector constant. */ - mask = immed_double_const (lo, hi, mode == SFmode ? SImode : DImode); - mask = gen_lowpart (mode, mask); + large = operands[1]; + zero_or_two31 = operands[2]; + input = operands[3]; + two31 = operands[4]; + vecmode = GET_MODE (large); + value = gen_rtx_REG (vecmode, REGNO (operands[0])); - if (mode == SFmode) + /* Load up the value into the low element. We must ensure that the other + elements are valid floats -- zero is the easiest such value. 
*/ + if (MEM_P (input)) { - if (vect) - v = gen_rtvec (4, mask, mask, mask, mask); + if (vecmode == V4SFmode) + emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input)); else - v = gen_rtvec (4, mask, CONST0_RTX (SFmode), - CONST0_RTX (SFmode), CONST0_RTX (SFmode)); - vec_mode = V4SFmode; + emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input)); } else { - if (vect) - v = gen_rtvec (2, mask, mask); + input = gen_rtx_REG (vecmode, REGNO (input)); + emit_move_insn (value, CONST0_RTX (vecmode)); + if (vecmode == V4SFmode) + emit_insn (gen_sse_movss (value, value, input)); else - v = gen_rtvec (2, mask, CONST0_RTX (DFmode)); - vec_mode = V2DFmode; + emit_insn (gen_sse2_movsd (value, value, input)); } - return force_reg (vec_mode, gen_rtx_CONST_VECTOR (vec_mode, v)); + emit_move_insn (large, two31); + emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31); + + x = gen_rtx_fmt_ee (LE, vecmode, large, value); + emit_insn (gen_rtx_SET (VOIDmode, large, x)); + + x = gen_rtx_AND (vecmode, zero_or_two31, large); + emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x)); + + x = gen_rtx_MINUS (vecmode, value, zero_or_two31); + emit_insn (gen_rtx_SET (VOIDmode, value, x)); + + large = gen_rtx_REG (V4SImode, REGNO (large)); + emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31))); + + x = gen_rtx_REG (V4SImode, REGNO (value)); + if (vecmode == V4SFmode) + emit_insn (gen_sse2_cvttps2dq (x, value)); + else + emit_insn (gen_sse2_cvttpd2dq (x, value)); + value = x; + + emit_insn (gen_xorv4si3 (value, value, large)); } -/* Generate code for floating point ABS or NEG. */ +/* Convert an unsigned DImode value into a DFmode, using only SSE. + Expects the 64-bit DImode to be supplied in a pair of integral + registers. Requires SSE2; will use SSE3 if available. For x86_32, + -mfpmath=sse, !optimize_size only. */ void -ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode, - rtx operands[]) +ix86_expand_convert_uns_didf_sse (rtx target, rtx input) { - rtx mask, set, use, clob, dst, src; - bool matching_memory; - bool use_sse = false; - bool vector_mode = VECTOR_MODE_P (mode); - enum machine_mode elt_mode = mode; + REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt; + rtx int_xmm, fp_xmm; + rtx biases, exponents; + rtx x; - if (vector_mode) + int_xmm = gen_reg_rtx (V4SImode); + if (TARGET_INTER_UNIT_MOVES) + emit_insn (gen_movdi_to_sse (int_xmm, input)); + else if (TARGET_SSE_SPLIT_REGS) { - elt_mode = GET_MODE_INNER (mode); - use_sse = true; + emit_insn (gen_rtx_CLOBBER (VOIDmode, int_xmm)); + emit_move_insn (gen_lowpart (DImode, int_xmm), input); } - else if (TARGET_SSE_MATH) - use_sse = SSE_FLOAT_MODE_P (mode); - - /* NEG and ABS performed with SSE use bitwise mask operations. - Create the appropriate mask now. */ - if (use_sse) - mask = ix86_build_signbit_mask (elt_mode, vector_mode, code == ABS); else - mask = NULL_RTX; + { + x = gen_reg_rtx (V2DImode); + ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0); + emit_move_insn (int_xmm, gen_lowpart (V4SImode, x)); + } - dst = operands[0]; - src = operands[1]; + x = gen_rtx_CONST_VECTOR (V4SImode, + gen_rtvec (4, GEN_INT (0x43300000UL), + GEN_INT (0x45300000UL), + const0_rtx, const0_rtx)); + exponents = validize_mem (force_const_mem (V4SImode, x)); - /* If the destination is memory, and we don't have matching source - operands or we're using the x87, do things in registers. 
*/ - matching_memory = false; - if (MEM_P (dst)) + /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */ + emit_insn (gen_sse2_punpckldq (int_xmm, int_xmm, exponents)); + + /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm) + yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)). + Similarly (0x45300000UL ## fp_value_hi_xmm) yields + (0x1.0p84 + double(fp_value_hi_xmm)). + Note these exponents differ by 32. */ + + fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm)); + + /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values + in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */ + real_ldexp (&bias_lo_rvt, &dconst1, 52); + real_ldexp (&bias_hi_rvt, &dconst1, 84); + biases = const_double_from_real_value (bias_lo_rvt, DFmode); + x = const_double_from_real_value (bias_hi_rvt, DFmode); + biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x)); + biases = validize_mem (force_const_mem (V2DFmode, biases)); + emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases)); + + /* Add the upper and lower DFmode values together. */ + if (TARGET_SSE3) + emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm)); + else + { + x = copy_to_mode_reg (V2DFmode, fp_xmm); + emit_insn (gen_sse2_unpckhpd (fp_xmm, fp_xmm, fp_xmm)); + emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x)); + } + + ix86_expand_vector_extract (false, target, fp_xmm, 0); +} + +/* Convert an unsigned SImode value into a DFmode. Only currently used + for SSE, but applicable anywhere. */ + +void +ix86_expand_convert_uns_sidf_sse (rtx target, rtx input) +{ + REAL_VALUE_TYPE TWO31r; + rtx x, fp; + + x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1), + NULL, 1, OPTAB_DIRECT); + + fp = gen_reg_rtx (DFmode); + emit_insn (gen_floatsidf2 (fp, x)); + + real_ldexp (&TWO31r, &dconst1, 31); + x = const_double_from_real_value (TWO31r, DFmode); + + x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT); + if (x != target) + emit_move_insn (target, x); +} + +/* Convert a signed DImode value into a DFmode. Only used for SSE in + 32-bit mode; otherwise we have a direct convert instruction. */ + +void +ix86_expand_convert_sign_didf_sse (rtx target, rtx input) +{ + REAL_VALUE_TYPE TWO32r; + rtx fp_lo, fp_hi, x; + + fp_lo = gen_reg_rtx (DFmode); + fp_hi = gen_reg_rtx (DFmode); + + emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input))); + + real_ldexp (&TWO32r, &dconst1, 32); + x = const_double_from_real_value (TWO32r, DFmode); + fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT); + + ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input)); + + x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target, + 0, OPTAB_DIRECT); + if (x != target) + emit_move_insn (target, x); +} + +/* Convert an unsigned SImode value into a SFmode, using only SSE. + For x86_32, -mfpmath=sse, !optimize_size only. 
*/ +void +ix86_expand_convert_uns_sisf_sse (rtx target, rtx input) +{ + REAL_VALUE_TYPE ONE16r; + rtx fp_hi, fp_lo, int_hi, int_lo, x; + + real_ldexp (&ONE16r, &dconst1, 16); + x = const_double_from_real_value (ONE16r, SFmode); + int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff), + NULL, 0, OPTAB_DIRECT); + int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16), + NULL, 0, OPTAB_DIRECT); + fp_hi = gen_reg_rtx (SFmode); + fp_lo = gen_reg_rtx (SFmode); + emit_insn (gen_floatsisf2 (fp_hi, int_hi)); + emit_insn (gen_floatsisf2 (fp_lo, int_lo)); + fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi, + 0, OPTAB_DIRECT); + fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target, + 0, OPTAB_DIRECT); + if (!rtx_equal_p (target, fp_hi)) + emit_move_insn (target, fp_hi); +} + +/* A subroutine of ix86_build_signbit_mask_vector. If VECT is true, + then replicate the value for all elements of the vector + register. */ + +rtx +ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value) +{ + rtvec v; + switch (mode) + { + case SFmode: + if (vect) + v = gen_rtvec (4, value, value, value, value); + else + v = gen_rtvec (4, value, CONST0_RTX (SFmode), + CONST0_RTX (SFmode), CONST0_RTX (SFmode)); + return gen_rtx_CONST_VECTOR (V4SFmode, v); + + case DFmode: + if (vect) + v = gen_rtvec (2, value, value); + else + v = gen_rtvec (2, value, CONST0_RTX (DFmode)); + return gen_rtx_CONST_VECTOR (V2DFmode, v); + + default: + gcc_unreachable (); + } +} + +/* A subroutine of ix86_expand_fp_absneg_operator and copysign expanders. + Create a mask for the sign bit in MODE for an SSE register. If VECT is + true, then replicate the mask for all elements of the vector register. + If INVERT is true, then create a mask excluding the sign bit. */ + +rtx +ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert) +{ + enum machine_mode vec_mode; + HOST_WIDE_INT hi, lo; + int shift = 63; + rtx v; + rtx mask; + + /* Find the sign bit, sign extended to 2*HWI. */ + if (mode == SFmode) + lo = 0x80000000, hi = lo < 0; + else if (HOST_BITS_PER_WIDE_INT >= 64) + lo = (HOST_WIDE_INT)1 << shift, hi = -1; + else + lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT); + + if (invert) + lo = ~lo, hi = ~hi; + + /* Force this value into the low part of a fp vector constant. */ + mask = immed_double_const (lo, hi, mode == SFmode ? SImode : DImode); + mask = gen_lowpart (mode, mask); + + v = ix86_build_const_vector (mode, vect, mask); + vec_mode = (mode == SFmode) ? V4SFmode : V2DFmode; + return force_reg (vec_mode, v); +} + +/* Generate code for floating point ABS or NEG. */ + +void +ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode, + rtx operands[]) +{ + rtx mask, set, use, clob, dst, src; + bool matching_memory; + bool use_sse = false; + bool vector_mode = VECTOR_MODE_P (mode); + enum machine_mode elt_mode = mode; + + if (vector_mode) + { + elt_mode = GET_MODE_INNER (mode); + use_sse = true; + } + else if (TARGET_SSE_MATH) + use_sse = SSE_FLOAT_MODE_P (mode); + + /* NEG and ABS performed with SSE use bitwise mask operations. + Create the appropriate mask now. */ + if (use_sse) + mask = ix86_build_signbit_mask (elt_mode, vector_mode, code == ABS); + else + mask = NULL_RTX; + + dst = operands[0]; + src = operands[1]; + + /* If the destination is memory, and we don't have matching source + operands or we're using the x87, do things in registers. 
*/ + matching_memory = false; + if (MEM_P (dst)) { if (use_sse && rtx_equal_p (dst, src)) matching_memory = true; @@ -9894,16 +10587,16 @@ ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1) into a register. */ if (standard_80387_constant_p (op0) == 0 - || (GET_CODE (op0) == MEM + || (MEM_P (op0) && ! (standard_80387_constant_p (op1) == 0 - || GET_CODE (op1) == MEM))) + || MEM_P (op1)))) { rtx tmp; tmp = op0, op0 = op1, op1 = tmp; code = swap_condition (code); } - if (GET_CODE (op0) != REG) + if (!REG_P (op0)) op0 = force_reg (op_mode, op0); if (CONSTANT_P (op1)) @@ -9924,12 +10617,12 @@ ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1) /* Try to rearrange the comparison to make it cheaper. */ if (ix86_fp_comparison_cost (code) > ix86_fp_comparison_cost (swap_condition (code)) - && (GET_CODE (op1) == REG || !no_new_pseudos)) + && (REG_P (op1) || !no_new_pseudos)) { rtx tmp; tmp = op0, op0 = op1, op1 = tmp; code = swap_condition (code); - if (GET_CODE (op0) != REG) + if (!REG_P (op0)) op0 = force_reg (op_mode, op0); } @@ -10483,7 +11176,7 @@ ix86_expand_branch (enum rtx_code code, rtx label) op1 is a constant and the low word is zero, then we can just examine the high word. */ - if (GET_CODE (hi[1]) == CONST_INT && lo[1] == const0_rtx) + if (CONST_INT_P (hi[1]) && lo[1] == const0_rtx) switch (code) { case LT: case LTU: case GE: case GEU: @@ -10757,7 +11450,7 @@ ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop) /* Convert a>b into b=b-1. */ case GTU: case LEU: - if (GET_CODE (op1) == CONST_INT) + if (CONST_INT_P (op1)) { op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0)); /* Bail out on overflow. We still can swap operands but that @@ -10834,8 +11527,8 @@ ix86_expand_int_movcc (rtx operands[]) if ((mode != HImode || TARGET_FAST_PREFIX) && (mode != (TARGET_64BIT ? TImode : DImode)) - && GET_CODE (operands[2]) == CONST_INT - && GET_CODE (operands[3]) == CONST_INT) + && CONST_INT_P (operands[2]) + && CONST_INT_P (operands[3])) { rtx out = operands[0]; HOST_WIDE_INT ct = INTVAL (operands[2]); @@ -11010,7 +11703,7 @@ ix86_expand_int_movcc (rtx operands[]) compare_code = UNKNOWN; if (GET_MODE_CLASS (GET_MODE (ix86_compare_op0)) == MODE_INT - && GET_CODE (ix86_compare_op1) == CONST_INT) + && CONST_INT_P (ix86_compare_op1)) { if (ix86_compare_op1 == const0_rtx && (code == LT || code == GE)) @@ -11222,7 +11915,7 @@ ix86_expand_int_movcc (rtx operands[]) /* If one of the two operands is an interesting constant, load a constant with the above and mask it in with a logical operation. */ - if (GET_CODE (operands[2]) == CONST_INT) + if (CONST_INT_P (operands[2])) { var = operands[3]; if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx) @@ -11232,7 +11925,7 @@ ix86_expand_int_movcc (rtx operands[]) else return 0; /* FAIL */ } - else if (GET_CODE (operands[3]) == CONST_INT) + else if (CONST_INT_P (operands[3])) { var = operands[2]; if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx) @@ -11719,6 +12412,52 @@ ix86_expand_int_vcond (rtx operands[]) return true; } +/* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is + true if we should do zero extension, else sign extension. HIGH_P is + true if we want the N/2 high elements, else the low elements. 
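
The GTU/LEU case in ix86_expand_carry_flag_compare above bumps the constant operand by one (bailing out when that would overflow) so the unsigned greater-than becomes a greater-or-equal test whose borrow lands in the carry flag. A small C check of the identity it relies on; the sbb-style mask at the end is only a portable restatement, not the generated code:

#include <assert.h>
#include <stdio.h>

int
main (void)
{
  unsigned int a, b;

  /* a > b  <==>  a >= b + 1, as long as b + 1 does not wrap
     (the expander bails out on that overflow case).  */
  for (a = 0; a < 1000; a++)
    for (b = 0; b < 1000; b++)
      assert ((a > b) == (a >= b + 1));

  /* The carry then feeds sbb: a < b yields an all-ones mask.  */
  a = 5, b = 9;
  printf ("%u\n", -(unsigned int) (a < b));   /* 4294967295, i.e. all ones */
  return 0;
}
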
*/ + +void +ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p) +{ + enum machine_mode imode = GET_MODE (operands[1]); + rtx (*unpack)(rtx, rtx, rtx); + rtx se, dest; + + switch (imode) + { + case V16QImode: + if (high_p) + unpack = gen_vec_interleave_highv16qi; + else + unpack = gen_vec_interleave_lowv16qi; + break; + case V8HImode: + if (high_p) + unpack = gen_vec_interleave_highv8hi; + else + unpack = gen_vec_interleave_lowv8hi; + break; + case V4SImode: + if (high_p) + unpack = gen_vec_interleave_highv4si; + else + unpack = gen_vec_interleave_lowv4si; + break; + default: + gcc_unreachable (); + } + + dest = gen_lowpart (imode, operands[0]); + + if (unsigned_p) + se = force_reg (imode, CONST0_RTX (imode)); + else + se = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode), + operands[1], pc_rtx, pc_rtx); + + emit_insn (unpack (dest, operands[1], se)); +} + /* Expand conditional increment or decrement using adb/sbb instructions. The default case using setcc followed by the conditional move can be done by generic code. */ @@ -11818,19 +12557,19 @@ ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode) else size = (GET_MODE_SIZE (mode) + 4) / 8; - gcc_assert (GET_CODE (operand) != REG || !MMX_REGNO_P (REGNO (operand))); + gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand))); gcc_assert (size >= 2 && size <= 3); /* Optimize constant pool reference to immediates. This is used by fp moves, that force all constants to memory to allow combining. */ - if (GET_CODE (operand) == MEM && MEM_READONLY_P (operand)) + if (MEM_P (operand) && MEM_READONLY_P (operand)) { rtx tmp = maybe_get_pool_constant (operand); if (tmp) operand = tmp; } - if (GET_CODE (operand) == MEM && !offsettable_memref_p (operand)) + if (MEM_P (operand) && !offsettable_memref_p (operand)) { /* The only non-offsetable memories we handle are pushes. */ int ok = push_operand (operand, VOIDmode); @@ -11979,7 +12718,7 @@ ix86_split_long_move (rtx operands[]) /* Optimize constant pool reference to immediates. This is used by fp moves, that force all constants to memory to allow combining. */ - if (GET_CODE (operands[1]) == MEM + if (MEM_P (operands[1]) && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0))) operands[1] = get_pool_constant (XEXP (operands[1], 0)); @@ -11999,14 +12738,14 @@ ix86_split_long_move (rtx operands[]) if (push_operand (operands[0], VOIDmode)) push = 1; else - gcc_assert (GET_CODE (operands[0]) != MEM + gcc_assert (!MEM_P (operands[0]) || offsettable_memref_p (operands[0])); nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0])); ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0])); /* When emitting push, take care for source operands on the stack. */ - if (push && GET_CODE (operands[1]) == MEM + if (push && MEM_P (operands[1]) && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1])) { if (nparts == 3) @@ -12018,7 +12757,7 @@ ix86_split_long_move (rtx operands[]) /* We need to do copy in the right order in case an address register of the source overlaps the destination. */ - if (REG_P (part[0][0]) && GET_CODE (part[1][0]) == MEM) + if (REG_P (part[0][0]) && MEM_P (part[1][0])) { if (reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))) collisions++; @@ -12153,25 +12892,25 @@ ix86_split_long_move (rtx operands[]) /* If optimizing for size, attempt to locally unCSE nonzero constants. 
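
ix86_expand_sse_unpack above widens a packed integer mode by interleaving the source either with zero (zero extension) or with the all-ones mask produced by comparing it against zero (sign extension). A scalar C model of the sign-extending case, illustrative only and assuming two's complement:

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  int16_t src[8] = { 1, -1, 32767, -32768, 5, -6, 7, -8 };
  int16_t se[8];
  int32_t low[4];
  int i;

  /* se = (0 > src) ? all-ones : 0, what ix86_expand_sse_cmp produces.  */
  for (i = 0; i < 8; i++)
    se[i] = (0 > src[i]) ? -1 : 0;

  /* "Interleave low": result element i has src[i] in its low half and
     se[i] in its high half, which is exactly sign extension.  */
  for (i = 0; i < 4; i++)
    low[i] = (int32_t) ((uint32_t) (uint16_t) src[i]
                        | ((uint32_t) (uint16_t) se[i] << 16));

  for (i = 0; i < 4; i++)
    printf ("%d ", (int) low[i]);      /* 1 -1 32767 -32768 */
  printf ("\n");
  return 0;
}
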
*/ if (optimize_size) { - if (GET_CODE (operands[5]) == CONST_INT + if (CONST_INT_P (operands[5]) && operands[5] != const0_rtx && REG_P (operands[2])) { - if (GET_CODE (operands[6]) == CONST_INT + if (CONST_INT_P (operands[6]) && INTVAL (operands[6]) == INTVAL (operands[5])) operands[6] = operands[2]; if (nparts == 3 - && GET_CODE (operands[7]) == CONST_INT + && CONST_INT_P (operands[7]) && INTVAL (operands[7]) == INTVAL (operands[5])) operands[7] = operands[2]; } if (nparts == 3 - && GET_CODE (operands[6]) == CONST_INT + && CONST_INT_P (operands[6]) && operands[6] != const0_rtx && REG_P (operands[3]) - && GET_CODE (operands[7]) == CONST_INT + && CONST_INT_P (operands[7]) && INTVAL (operands[7]) == INTVAL (operands[6])) operands[7] = operands[3]; } @@ -12221,7 +12960,7 @@ ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode) int count; const int single_width = mode == DImode ? 32 : 64; - if (GET_CODE (operands[2]) == CONST_INT) + if (CONST_INT_P (operands[2])) { (mode == DImode ? split_di : split_ti) (operands, 2, low, high); count = INTVAL (operands[2]) & (single_width * 2 - 1); @@ -12348,7 +13087,7 @@ ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode) int count; const int single_width = mode == DImode ? 32 : 64; - if (GET_CODE (operands[2]) == CONST_INT) + if (CONST_INT_P (operands[2])) { (mode == DImode ? split_di : split_ti) (operands, 2, low, high); count = INTVAL (operands[2]) & (single_width * 2 - 1); @@ -12427,7 +13166,7 @@ ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode) int count; const int single_width = mode == DImode ? 32 : 64; - if (GET_CODE (operands[2]) == CONST_INT) + if (CONST_INT_P (operands[2])) { (mode == DImode ? split_di : split_ti) (operands, 2, low, high); count = INTVAL (operands[2]) & (single_width * 2 - 1); @@ -12483,10 +13222,22 @@ ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode) } } +/* Predict just emitted jump instruction to be taken with probability PROB. */ +static void +predict_jump (int prob) +{ + rtx insn = get_last_insn (); + gcc_assert (JUMP_P (insn)); + REG_NOTES (insn) + = gen_rtx_EXPR_LIST (REG_BR_PROB, + GEN_INT (prob), + REG_NOTES (insn)); +} + /* Helper function for the string operations below. Dest VARIABLE whether it is aligned to VALUE bytes. If true, jump to the label. */ static rtx -ix86_expand_aligntest (rtx variable, int value) +ix86_expand_aligntest (rtx variable, int value, bool epilogue) { rtx label = gen_label_rtx (); rtx tmpcount = gen_reg_rtx (GET_MODE (variable)); @@ -12496,6 +13247,10 @@ ix86_expand_aligntest (rtx variable, int value) emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value))); emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable), 1, label); + if (epilogue) + predict_jump (REG_BR_PROB_BASE * 50 / 100); + else + predict_jump (REG_BR_PROB_BASE * 90 / 100); return label; } @@ -12523,581 +13278,1291 @@ ix86_zero_extend_to_Pmode (rtx exp) return r; } -/* Expand string move (memcpy) operation. Use i386 string operations when - profitable. expand_clrmem contains similar code. */ -int -ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp) +/* Divide COUNTREG by SCALE. 
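
The constant-count paths of ix86_split_ashl/ashr/lshr above mask the shift count with single_width * 2 - 1 and then operate on the two single-word halves. A rough scalar C rendering of the logical-left DImode case on a 32-bit target (names invented; the real splitter of course emits RTL):

#include <stdint.h>
#include <stdio.h>

static void
shl64_split (uint32_t *lo, uint32_t *hi, int count)
{
  count &= 63;                          /* count & (single_width * 2 - 1) */
  if (count >= 32)
    {
      *hi = *lo << (count - 32);        /* the low word supplies all bits */
      *lo = 0;
    }
  else if (count != 0)
    {
      *hi = (*hi << count) | (*lo >> (32 - count));
      *lo <<= count;
    }
}

int
main (void)
{
  uint32_t lo = 0x89abcdef, hi = 0x01234567;
  shl64_split (&lo, &hi, 8);
  printf ("%08x%08x\n", (unsigned int) hi, (unsigned int) lo);
  /* 23456789abcdef00 == 0x0123456789abcdef << 8 */
  return 0;
}
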
*/ +static rtx +scale_counter (rtx countreg, int scale) { - rtx srcreg, destreg, countreg, srcexp, destexp; - enum machine_mode counter_mode; - HOST_WIDE_INT align = 0; - unsigned HOST_WIDE_INT count = 0; + rtx sc; + rtx piece_size_mask; - if (GET_CODE (align_exp) == CONST_INT) - align = INTVAL (align_exp); + if (scale == 1) + return countreg; + if (CONST_INT_P (countreg)) + return GEN_INT (INTVAL (countreg) / scale); + gcc_assert (REG_P (countreg)); - /* Can't use any of this if the user has appropriated esi or edi. */ - if (global_regs[4] || global_regs[5]) - return 0; + piece_size_mask = GEN_INT (scale - 1); + sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg, + GEN_INT (exact_log2 (scale)), + NULL, 1, OPTAB_DIRECT); + return sc; +} - /* This simple hack avoids all inlining code and simplifies code below. */ - if (!TARGET_ALIGN_STRINGOPS) - align = 64; +/* When SRCPTR is non-NULL, output simple loop to move memory + pointer to SRCPTR to DESTPTR via chunks of MODE unrolled UNROLL times, + overall size is COUNT specified in bytes. When SRCPTR is NULL, output the + equivalent loop to set memory by VALUE (supposed to be in MODE). + + The size is rounded down to whole number of chunk size moved at once. + SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */ + + +static void +expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, rtx value, + rtx count, enum machine_mode mode, int unroll, + int expected_size) +{ + rtx out_label, top_label, iter, tmp; + enum machine_mode iter_mode; + rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll); + rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1)); + rtx size; + rtx x_addr; + rtx y_addr; + int i; + + iter_mode = GET_MODE (count); + if (iter_mode == VOIDmode) + iter_mode = word_mode; + + top_label = gen_label_rtx (); + out_label = gen_label_rtx (); + iter = gen_reg_rtx (iter_mode); - if (GET_CODE (count_exp) == CONST_INT) + size = expand_simple_binop (iter_mode, AND, count, piece_size_mask, + NULL, 1, OPTAB_DIRECT); + /* Those two should combine. */ + if (piece_size == const1_rtx) { - count = INTVAL (count_exp); - if (!TARGET_INLINE_ALL_STRINGOPS && count > 64) - return 0; + emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode, + true, out_label); + predict_jump (REG_BR_PROB_BASE * 10 / 100); } + emit_move_insn (iter, const0_rtx); - /* Figure out proper mode for counter. For 32bits it is always SImode, - for 64bits use SImode when possible, otherwise DImode. - Set count to number of bytes copied when known at compile time. */ - if (!TARGET_64BIT - || GET_MODE (count_exp) == SImode - || x86_64_zext_immediate_operand (count_exp, VOIDmode)) - counter_mode = SImode; - else - counter_mode = DImode; + emit_label (top_label); - gcc_assert (counter_mode == SImode || counter_mode == DImode); + tmp = convert_modes (Pmode, iter_mode, iter, true); + x_addr = gen_rtx_PLUS (Pmode, destptr, tmp); + destmem = change_address (destmem, mode, x_addr); - destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0)); - if (destreg != XEXP (dst, 0)) - dst = replace_equiv_address_nv (dst, destreg); - srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0)); - if (srcreg != XEXP (src, 0)) - src = replace_equiv_address_nv (src, srcreg); - - /* When optimizing for size emit simple rep ; movsb instruction for - counts not divisible by 4, except when (movsl;)*(movsw;)?(movsb;)? - sequence is shorter than mov{b,l} $count, %{ecx,cl}; rep; movsb. - Sice of (movsl;)*(movsw;)?(movsb;)? 
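
scale_counter above turns a byte count into a chunk count by shifting right by log2 of the chunk size (or by a plain division when the count is a compile-time constant), and the loop emitted next masks the count down to a whole number of chunks. The arithmetic in a few lines of C, purely for illustration:

#include <assert.h>
#include <stdio.h>

int
main (void)
{
  unsigned int count = 1000;    /* bytes to copy */
  unsigned int scale = 16;      /* bytes moved per main-loop iteration */

  unsigned int chunks = count >> __builtin_ctz (scale);  /* scale_counter */
  unsigned int rounded = count & ~(scale - 1);            /* whole chunks  */

  assert (chunks == count / scale && rounded == chunks * scale);
  printf ("%u chunks, %u bytes in the main loop, %u for the epilogue\n",
          chunks, rounded, count & (scale - 1));
  /* 62 chunks, 992 bytes in the main loop, 8 for the epilogue */
  return 0;
}
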
sequence is - count / 4 + (count & 3), the other sequence is either 4 or 7 bytes, - but we don't know whether upper 24 (resp. 56) bits of %ecx will be - known to be zero or not. The rep; movsb sequence causes higher - register pressure though, so take that into account. */ - - if ((!optimize || optimize_size) - && (count == 0 - || ((count & 0x03) - && (!optimize_size - || count > 5 * 4 - || (count & 3) + count / 4 > 6)))) - { - emit_insn (gen_cld ()); - countreg = ix86_zero_extend_to_Pmode (count_exp); - destexp = gen_rtx_PLUS (Pmode, destreg, countreg); - srcexp = gen_rtx_PLUS (Pmode, srcreg, countreg); - emit_insn (gen_rep_mov (destreg, dst, srcreg, src, countreg, - destexp, srcexp)); - } - - /* For constant aligned (or small unaligned) copies use rep movsl - followed by code copying the rest. For PentiumPro ensure 8 byte - alignment to allow rep movsl acceleration. */ - - else if (count != 0 - && (align >= 8 - || (!TARGET_PENTIUMPRO && !TARGET_64BIT && align >= 4) - || optimize_size || count < (unsigned int) 64)) - { - unsigned HOST_WIDE_INT offset = 0; - int size = TARGET_64BIT && !optimize_size ? 8 : 4; - rtx srcmem, dstmem; - - emit_insn (gen_cld ()); - if (count & ~(size - 1)) - { - if ((TARGET_SINGLE_STRINGOP || optimize_size) && count < 5 * 4) - { - enum machine_mode movs_mode = size == 4 ? SImode : DImode; + if (srcmem) + { + y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp)); + srcmem = change_address (srcmem, mode, y_addr); - while (offset < (count & ~(size - 1))) + /* When unrolling for chips that reorder memory reads and writes, + we can save registers by using single temporary. + Also using 4 temporaries is overkill in 32bit mode. */ + if (!TARGET_64BIT && 0) + { + for (i = 0; i < unroll; i++) + { + if (i) { - srcmem = adjust_automodify_address_nv (src, movs_mode, - srcreg, offset); - dstmem = adjust_automodify_address_nv (dst, movs_mode, - destreg, offset); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); - offset += size; + destmem = + adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode)); + srcmem = + adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode)); } + emit_move_insn (destmem, srcmem); } - else - { - countreg = GEN_INT ((count >> (size == 4 ? 2 : 3)) - & (TARGET_64BIT ? -1 : 0x3fffffff)); - countreg = copy_to_mode_reg (counter_mode, countreg); - countreg = ix86_zero_extend_to_Pmode (countreg); - - destexp = gen_rtx_ASHIFT (Pmode, countreg, - GEN_INT (size == 4 ? 
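
The unrolled branch above loads all UNROLL source chunks into temporaries (tmpreg[]) before storing any of them, keeping the loads of one iteration ahead of its stores. A simplified C rendering of that main-loop shape; the function name, word size and unroll factor are merely illustrative:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Copy the whole-chunk part of N bytes, four 8-byte words per iteration,
   loading the group before storing it; the remainder is left to an epilogue.  */
static size_t
copy_unrolled_body (unsigned char *dst, const unsigned char *src, size_t n)
{
  const size_t chunk = 4 * sizeof (uint64_t);
  size_t rounded = n & ~(chunk - 1);
  size_t i;

  for (i = 0; i < rounded; i += chunk)
    {
      uint64_t t0, t1, t2, t3;
      memcpy (&t0, src + i, 8);         /* all loads first ...  */
      memcpy (&t1, src + i + 8, 8);
      memcpy (&t2, src + i + 16, 8);
      memcpy (&t3, src + i + 24, 8);
      memcpy (dst + i, &t0, 8);         /* ... then all stores  */
      memcpy (dst + i + 8, &t1, 8);
      memcpy (dst + i + 16, &t2, 8);
      memcpy (dst + i + 24, &t3, 8);
    }
  return rounded;
}

int
main (void)
{
  unsigned char src[100], dst[100];
  size_t i;
  for (i = 0; i < sizeof src; i++)
    src[i] = (unsigned char) i;
  printf ("%zu of %zu bytes copied by the main loop\n",
          copy_unrolled_body (dst, src, sizeof src), sizeof src);  /* 96 of 100 */
  return 0;
}
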
2 : 3)); - srcexp = gen_rtx_PLUS (Pmode, destexp, srcreg); - destexp = gen_rtx_PLUS (Pmode, destexp, destreg); - - emit_insn (gen_rep_mov (destreg, dst, srcreg, src, - countreg, destexp, srcexp)); - offset = count & ~(size - 1); - } - } - if (size == 8 && (count & 0x04)) - { - srcmem = adjust_automodify_address_nv (src, SImode, srcreg, - offset); - dstmem = adjust_automodify_address_nv (dst, SImode, destreg, - offset); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); - offset += 4; - } - if (count & 0x02) - { - srcmem = adjust_automodify_address_nv (src, HImode, srcreg, - offset); - dstmem = adjust_automodify_address_nv (dst, HImode, destreg, - offset); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); - offset += 2; } - if (count & 0x01) + else { - srcmem = adjust_automodify_address_nv (src, QImode, srcreg, - offset); - dstmem = adjust_automodify_address_nv (dst, QImode, destreg, - offset); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); + rtx tmpreg[4]; + gcc_assert (unroll <= 4); + for (i = 0; i < unroll; i++) + { + tmpreg[i] = gen_reg_rtx (mode); + if (i) + { + srcmem = + adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode)); + } + emit_move_insn (tmpreg[i], srcmem); + } + for (i = 0; i < unroll; i++) + { + if (i) + { + destmem = + adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode)); + } + emit_move_insn (destmem, tmpreg[i]); + } } } - /* The generic code based on the glibc implementation: - - align destination to 4 bytes (8 byte alignment is used for PentiumPro - allowing accelerated copying there) - - copy the data using rep movsl - - copy the rest. */ else - { - rtx countreg2; - rtx label = NULL; - rtx srcmem, dstmem; - int desired_alignment = (TARGET_PENTIUMPRO - && (count == 0 || count >= (unsigned int) 260) - ? 8 : UNITS_PER_WORD); - /* Get rid of MEM_OFFSETs, they won't be accurate. */ - dst = change_address (dst, BLKmode, destreg); - src = change_address (src, BLKmode, srcreg); + for (i = 0; i < unroll; i++) + { + if (i) + destmem = + adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode)); + emit_move_insn (destmem, value); + } - /* In case we don't know anything about the alignment, default to - library version, since it is usually equally fast and result in - shorter code. + tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter, + true, OPTAB_LIB_WIDEN); + if (tmp != iter) + emit_move_insn (iter, tmp); - Also emit call when we know that the count is large and call overhead - will not be important. 
*/ - if (!TARGET_INLINE_ALL_STRINGOPS - && (align < UNITS_PER_WORD || !TARGET_REP_MOVL_OPTIMAL)) - return 0; + emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode, + true, top_label); + if (expected_size != -1) + { + expected_size /= GET_MODE_SIZE (mode) * unroll; + if (expected_size == 0) + predict_jump (0); + else if (expected_size > REG_BR_PROB_BASE) + predict_jump (REG_BR_PROB_BASE - 1); + else + predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size); + } + else + predict_jump (REG_BR_PROB_BASE * 80 / 100); + iter = ix86_zero_extend_to_Pmode (iter); + tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr, + true, OPTAB_LIB_WIDEN); + if (tmp != destptr) + emit_move_insn (destptr, tmp); + if (srcptr) + { + tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr, + true, OPTAB_LIB_WIDEN); + if (tmp != srcptr) + emit_move_insn (srcptr, tmp); + } + emit_label (out_label); +} - if (TARGET_SINGLE_STRINGOP) - emit_insn (gen_cld ()); +/* Output "rep; mov" instruction. + Arguments have same meaning as for previous function */ +static void +expand_movmem_via_rep_mov (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, + rtx count, + enum machine_mode mode) +{ + rtx destexp; + rtx srcexp; + rtx countreg; + + /* If the size is known, it is shorter to use rep movs. */ + if (mode == QImode && CONST_INT_P (count) + && !(INTVAL (count) & 3)) + mode = SImode; + + if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode) + destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0); + if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode) + srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0); + countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode))); + if (mode != QImode) + { + destexp = gen_rtx_ASHIFT (Pmode, countreg, + GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); + destexp = gen_rtx_PLUS (Pmode, destexp, destptr); + srcexp = gen_rtx_ASHIFT (Pmode, countreg, + GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); + srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr); + } + else + { + destexp = gen_rtx_PLUS (Pmode, destptr, countreg); + srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg); + } + emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg, + destexp, srcexp)); +} - countreg2 = gen_reg_rtx (Pmode); - countreg = copy_to_mode_reg (counter_mode, count_exp); +/* Output "rep; stos" instruction. + Arguments have same meaning as for previous function */ +static void +expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value, + rtx count, + enum machine_mode mode) +{ + rtx destexp; + rtx countreg; - /* We don't use loops to align destination and to copy parts smaller - than 4 bytes, because gcc is able to optimize such code better (in - the case the destination or the count really is aligned, gcc is often - able to predict the branches) and also it is friendlier to the - hardware branch prediction. 
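
expand_movmem_via_rep_mov above boils down to a rep-prefixed move with the count scaled to the element size; for the rep_prefix_1_byte strategy the emitted sequence corresponds to something like the following GCC inline-assembly sketch (x86/x86-64 only, standalone, not code from the patch):

#include <stdio.h>

/* rep movsb: the DI/SI/CX registers are consumed and updated by the
   instruction, hence the "+" constraints; "memory" covers the stores.  */
static void
rep_movsb (void *dst, const void *src, unsigned long n)
{
  __asm__ volatile ("rep movsb"
                    : "+D" (dst), "+S" (src), "+c" (n)
                    : : "memory");
}

int
main (void)
{
  char buf[16];
  rep_movsb (buf, "hello, world!", sizeof "hello, world!");
  printf ("%s\n", buf);
  return 0;
}
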
+ if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode) + destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0); + value = force_reg (mode, gen_lowpart (mode, value)); + countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode))); + if (mode != QImode) + { + destexp = gen_rtx_ASHIFT (Pmode, countreg, + GEN_INT (exact_log2 (GET_MODE_SIZE (mode)))); + destexp = gen_rtx_PLUS (Pmode, destexp, destptr); + } + else + destexp = gen_rtx_PLUS (Pmode, destptr, countreg); + emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp)); +} - Using loops is beneficial for generic case, because we can - handle small counts using the loops. Many CPUs (such as Athlon) - have large REP prefix setup costs. +static void +emit_strmov (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, enum machine_mode mode, int offset) +{ + rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset); + rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset); + emit_insn (gen_strmov (destptr, dest, srcptr, src)); +} - This is quite costly. Maybe we can revisit this decision later or - add some customizability to this code. */ +/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */ +static void +expand_movmem_epilogue (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, rtx count, int max_size) +{ + rtx src, dest; + if (CONST_INT_P (count)) + { + HOST_WIDE_INT countval = INTVAL (count); + int offset = 0; - if (count == 0 && align < desired_alignment) + if ((countval & 0x16) && max_size > 16) { - label = gen_label_rtx (); - emit_cmp_and_jump_insns (countreg, GEN_INT (desired_alignment - 1), - LEU, 0, counter_mode, 1, label); + if (TARGET_64BIT) + { + emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset); + emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8); + } + else + gcc_unreachable (); + offset += 16; } - if (align <= 1) + if ((countval & 0x08) && max_size > 8) { - rtx label = ix86_expand_aligntest (destreg, 1); - srcmem = change_address (src, QImode, srcreg); - dstmem = change_address (dst, QImode, destreg); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); - ix86_adjust_counter (countreg, 1); - emit_label (label); - LABEL_NUSES (label) = 1; + if (TARGET_64BIT) + emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset); + else + { + emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset); + emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 4); + } + offset += 8; } - if (align <= 2) + if ((countval & 0x04) && max_size > 4) { - rtx label = ix86_expand_aligntest (destreg, 2); - srcmem = change_address (src, HImode, srcreg); - dstmem = change_address (dst, HImode, destreg); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); - ix86_adjust_counter (countreg, 2); - emit_label (label); - LABEL_NUSES (label) = 1; + emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset); + offset += 4; } - if (align <= 4 && desired_alignment > 4) + if ((countval & 0x02) && max_size > 2) { - rtx label = ix86_expand_aligntest (destreg, 4); - srcmem = change_address (src, SImode, srcreg); - dstmem = change_address (dst, SImode, destreg); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); - ix86_adjust_counter (countreg, 4); - emit_label (label); - LABEL_NUSES (label) = 1; + emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset); + offset += 2; + } + if ((countval & 0x01) && max_size > 1) + { + emit_strmov (destmem, srcmem, destptr, srcptr, 
QImode, offset); + offset += 1; } + return; + } + if (max_size > 8) + { + count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1), + count, 1, OPTAB_DIRECT); + expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL, + count, QImode, 1, 4); + return; + } - if (label && desired_alignment > 4 && !TARGET_64BIT) + /* When there are stringops, we can cheaply increase dest and src pointers. + Otherwise we save code size by maintaining offset (zero is readily + available from preceding rep operation) and using x86 addressing modes. + */ + if (TARGET_SINGLE_STRINGOP) + { + if (max_size > 4) { + rtx label = ix86_expand_aligntest (count, 4, true); + src = change_address (srcmem, SImode, srcptr); + dest = change_address (destmem, SImode, destptr); + emit_insn (gen_strmov (destptr, dest, srcptr, src)); emit_label (label); LABEL_NUSES (label) = 1; - label = NULL_RTX; } - if (!TARGET_SINGLE_STRINGOP) - emit_insn (gen_cld ()); - if (TARGET_64BIT) + if (max_size > 2) { - emit_insn (gen_lshrdi3 (countreg2, ix86_zero_extend_to_Pmode (countreg), - GEN_INT (3))); - destexp = gen_rtx_ASHIFT (Pmode, countreg2, GEN_INT (3)); + rtx label = ix86_expand_aligntest (count, 2, true); + src = change_address (srcmem, HImode, srcptr); + dest = change_address (destmem, HImode, destptr); + emit_insn (gen_strmov (destptr, dest, srcptr, src)); + emit_label (label); + LABEL_NUSES (label) = 1; } - else + if (max_size > 1) { - emit_insn (gen_lshrsi3 (countreg2, countreg, const2_rtx)); - destexp = gen_rtx_ASHIFT (Pmode, countreg2, const2_rtx); + rtx label = ix86_expand_aligntest (count, 1, true); + src = change_address (srcmem, QImode, srcptr); + dest = change_address (destmem, QImode, destptr); + emit_insn (gen_strmov (destptr, dest, srcptr, src)); + emit_label (label); + LABEL_NUSES (label) = 1; } - srcexp = gen_rtx_PLUS (Pmode, destexp, srcreg); - destexp = gen_rtx_PLUS (Pmode, destexp, destreg); - emit_insn (gen_rep_mov (destreg, dst, srcreg, src, - countreg2, destexp, srcexp)); + } + else + { + rtx offset = force_reg (Pmode, const0_rtx); + rtx tmp; - if (label) + if (max_size > 4) { + rtx label = ix86_expand_aligntest (count, 4, true); + src = change_address (srcmem, SImode, srcptr); + dest = change_address (destmem, SImode, destptr); + emit_move_insn (dest, src); + tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL, + true, OPTAB_LIB_WIDEN); + if (tmp != offset) + emit_move_insn (offset, tmp); emit_label (label); LABEL_NUSES (label) = 1; } - if (TARGET_64BIT && align > 4 && count != 0 && (count & 4)) - { - srcmem = change_address (src, SImode, srcreg); - dstmem = change_address (dst, SImode, destreg); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); - } - if ((align <= 4 || count == 0) && TARGET_64BIT) - { - rtx label = ix86_expand_aligntest (countreg, 4); - srcmem = change_address (src, SImode, srcreg); - dstmem = change_address (dst, SImode, destreg); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); + if (max_size > 2) + { + rtx label = ix86_expand_aligntest (count, 2, true); + tmp = gen_rtx_PLUS (Pmode, srcptr, offset); + src = change_address (srcmem, HImode, tmp); + tmp = gen_rtx_PLUS (Pmode, destptr, offset); + dest = change_address (destmem, HImode, tmp); + emit_move_insn (dest, src); + tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp, + true, OPTAB_LIB_WIDEN); + if (tmp != offset) + emit_move_insn (offset, tmp); emit_label (label); LABEL_NUSES (label) = 1; } - if (align > 2 && count != 0 && (count & 2)) - { - srcmem = 
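
In the constant-count case, expand_movmem_epilogue above peels the remaining bytes off with at most one 8-, 4-, 2- and 1-byte move each (incidentally, its 16-byte test uses countval & 0x16, which looks like a typo for 0x10). A plain C sketch of that bit-testing cascade for a tail of fewer than 8 bytes, with invented names:

#include <stdio.h>
#include <string.h>

/* The main loop already copied all whole 8-byte chunks; at most 7 bytes
   remain, so each set bit of REMAINING selects one fixed-size move.  */
static void
copy_tail (unsigned char *dst, const unsigned char *src, unsigned int remaining)
{
  unsigned int offset = 0;

  if (remaining & 4)
    {
      memcpy (dst + offset, src + offset, 4);
      offset += 4;
    }
  if (remaining & 2)
    {
      memcpy (dst + offset, src + offset, 2);
      offset += 2;
    }
  if (remaining & 1)
    dst[offset] = src[offset];
}

int
main (void)
{
  const unsigned char src[7] = { 1, 2, 3, 4, 5, 6, 7 };
  unsigned char dst[7] = { 0 };
  copy_tail (dst, src, 7);              /* one 4-, one 2- and one 1-byte move */
  printf ("%d %d\n", dst[0], dst[6]);   /* 1 7 */
  return 0;
}
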
change_address (src, HImode, srcreg); - dstmem = change_address (dst, HImode, destreg); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); - } - if (align <= 2 || count == 0) + if (max_size > 1) { - rtx label = ix86_expand_aligntest (countreg, 2); - srcmem = change_address (src, HImode, srcreg); - dstmem = change_address (dst, HImode, destreg); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); + rtx label = ix86_expand_aligntest (count, 1, true); + tmp = gen_rtx_PLUS (Pmode, srcptr, offset); + src = change_address (srcmem, QImode, tmp); + tmp = gen_rtx_PLUS (Pmode, destptr, offset); + dest = change_address (destmem, QImode, tmp); + emit_move_insn (dest, src); emit_label (label); LABEL_NUSES (label) = 1; } - if (align > 1 && count != 0 && (count & 1)) - { - srcmem = change_address (src, QImode, srcreg); - dstmem = change_address (dst, QImode, destreg); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); + } +} + +/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */ +static void +expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value, + rtx count, int max_size) +{ + count = + expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1), + count, 1, OPTAB_DIRECT); + expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL, + gen_lowpart (QImode, value), count, QImode, + 1, max_size / 2); +} + +/* Output code to set at most count & (max_size - 1) bytes starting by DEST. */ +static void +expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size) +{ + rtx dest; + + if (CONST_INT_P (count)) + { + HOST_WIDE_INT countval = INTVAL (count); + int offset = 0; + + if ((countval & 0x16) && max_size > 16) + { + if (TARGET_64BIT) + { + dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset); + emit_insn (gen_strset (destptr, dest, value)); + dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8); + emit_insn (gen_strset (destptr, dest, value)); + } + else + gcc_unreachable (); + offset += 16; } - if (align <= 1 || count == 0) + if ((countval & 0x08) && max_size > 8) { - rtx label = ix86_expand_aligntest (countreg, 1); - srcmem = change_address (src, QImode, srcreg); - dstmem = change_address (dst, QImode, destreg); - emit_insn (gen_strmov (destreg, dstmem, srcreg, srcmem)); - emit_label (label); - LABEL_NUSES (label) = 1; + if (TARGET_64BIT) + { + dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset); + emit_insn (gen_strset (destptr, dest, value)); + } + else + { + dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset); + emit_insn (gen_strset (destptr, dest, value)); + dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4); + emit_insn (gen_strset (destptr, dest, value)); + } + offset += 8; + } + if ((countval & 0x04) && max_size > 4) + { + dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset); + emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value))); + offset += 4; + } + if ((countval & 0x02) && max_size > 2) + { + dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset); + emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value))); + offset += 2; + } + if ((countval & 0x01) && max_size > 1) + { + dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset); + emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value))); + offset += 1; } + return; + } + if (max_size > 32) + { + expand_setmem_epilogue_via_loop (destmem, destptr, value, 
count, max_size); + return; + } + if (max_size > 16) + { + rtx label = ix86_expand_aligntest (count, 16, true); + if (TARGET_64BIT) + { + dest = change_address (destmem, DImode, destptr); + emit_insn (gen_strset (destptr, dest, value)); + emit_insn (gen_strset (destptr, dest, value)); + } + else + { + dest = change_address (destmem, SImode, destptr); + emit_insn (gen_strset (destptr, dest, value)); + emit_insn (gen_strset (destptr, dest, value)); + emit_insn (gen_strset (destptr, dest, value)); + emit_insn (gen_strset (destptr, dest, value)); + } + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 8) + { + rtx label = ix86_expand_aligntest (count, 8, true); + if (TARGET_64BIT) + { + dest = change_address (destmem, DImode, destptr); + emit_insn (gen_strset (destptr, dest, value)); + } + else + { + dest = change_address (destmem, SImode, destptr); + emit_insn (gen_strset (destptr, dest, value)); + emit_insn (gen_strset (destptr, dest, value)); + } + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 4) + { + rtx label = ix86_expand_aligntest (count, 4, true); + dest = change_address (destmem, SImode, destptr); + emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 2) + { + rtx label = ix86_expand_aligntest (count, 2, true); + dest = change_address (destmem, HImode, destptr); + emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value))); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (max_size > 1) + { + rtx label = ix86_expand_aligntest (count, 1, true); + dest = change_address (destmem, QImode, destptr); + emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value))); + emit_label (label); + LABEL_NUSES (label) = 1; } +} - return 1; +/* Copy enough from DEST to SRC to align DEST known to by aligned by ALIGN to + DESIRED_ALIGNMENT. */ +static void +expand_movmem_prologue (rtx destmem, rtx srcmem, + rtx destptr, rtx srcptr, rtx count, + int align, int desired_alignment) +{ + if (align <= 1 && desired_alignment > 1) + { + rtx label = ix86_expand_aligntest (destptr, 1, false); + srcmem = change_address (srcmem, QImode, srcptr); + destmem = change_address (destmem, QImode, destptr); + emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem)); + ix86_adjust_counter (count, 1); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align <= 2 && desired_alignment > 2) + { + rtx label = ix86_expand_aligntest (destptr, 2, false); + srcmem = change_address (srcmem, HImode, srcptr); + destmem = change_address (destmem, HImode, destptr); + emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem)); + ix86_adjust_counter (count, 2); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align <= 4 && desired_alignment > 4) + { + rtx label = ix86_expand_aligntest (destptr, 4, false); + srcmem = change_address (srcmem, SImode, srcptr); + destmem = change_address (destmem, SImode, destptr); + emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem)); + ix86_adjust_counter (count, 4); + emit_label (label); + LABEL_NUSES (label) = 1; + } + gcc_assert (desired_alignment <= 8); } -/* Expand string clear operation (bzero). Use i386 string operations when - profitable. expand_movmem contains similar code. */ +/* Set enough from DEST to align DEST known to by aligned by ALIGN to + DESIRED_ALIGNMENT. 
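
expand_movmem_prologue above copies a 1-, 2- and 4-byte piece, each guarded by an alignment test on the destination, until the destination reaches DESIRED_ALIGNMENT (asserted to be at most 8). The same control flow written as plain C, purely as an illustration of the jump tree (helper name and calling convention invented):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Copy just enough bytes to align *DST to DESIRED_ALIGN (<= 8), adjusting
   the pointers and the remaining count, as the prologue above does.  */
static void
align_prologue (unsigned char **dst, const unsigned char **src,
                size_t *count, unsigned int align, unsigned int desired_align)
{
  if (align <= 1 && desired_align > 1 && ((uintptr_t) *dst & 1))
    {
      **dst = **src;
      (*dst)++, (*src)++, (*count)--;
    }
  if (align <= 2 && desired_align > 2 && ((uintptr_t) *dst & 2))
    {
      memcpy (*dst, *src, 2);
      *dst += 2, *src += 2, *count -= 2;
    }
  if (align <= 4 && desired_align > 4 && ((uintptr_t) *dst & 4))
    {
      memcpy (*dst, *src, 4);
      *dst += 4, *src += 4, *count -= 4;
    }
}

int
main (void)
{
  static unsigned char srcbuf[32] __attribute__ ((aligned (16)));
  static unsigned char dstbuf[32] __attribute__ ((aligned (16)));
  unsigned char *d = dstbuf + 3;                /* deliberately misaligned */
  const unsigned char *s = srcbuf + 3;
  size_t n = 20;

  align_prologue (&d, &s, &n, 1, 8);
  printf ("%u bytes copied by the prologue, %zu remain\n",
          (unsigned int) (d - (dstbuf + 3)), n);        /* 5 ... 15 */
  return 0;
}
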
*/ +static void +expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count, + int align, int desired_alignment) +{ + if (align <= 1 && desired_alignment > 1) + { + rtx label = ix86_expand_aligntest (destptr, 1, false); + destmem = change_address (destmem, QImode, destptr); + emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value))); + ix86_adjust_counter (count, 1); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align <= 2 && desired_alignment > 2) + { + rtx label = ix86_expand_aligntest (destptr, 2, false); + destmem = change_address (destmem, HImode, destptr); + emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value))); + ix86_adjust_counter (count, 2); + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (align <= 4 && desired_alignment > 4) + { + rtx label = ix86_expand_aligntest (destptr, 4, false); + destmem = change_address (destmem, SImode, destptr); + emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value))); + ix86_adjust_counter (count, 4); + emit_label (label); + LABEL_NUSES (label) = 1; + } + gcc_assert (desired_alignment <= 8); +} + +/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */ +static enum stringop_alg +decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset, + int *dynamic_check) +{ + const struct stringop_algs * algs; + + *dynamic_check = -1; + if (memset) + algs = &ix86_cost->memset[TARGET_64BIT != 0]; + else + algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; + if (stringop_alg != no_stringop) + return stringop_alg; + /* rep; movq or rep; movl is the smallest variant. */ + else if (optimize_size) + { + if (!count || (count & 3)) + return rep_prefix_1_byte; + else + return rep_prefix_4_byte; + } + /* Very tiny blocks are best handled via the loop, REP is expensive to setup. + */ + else if (expected_size != -1 && expected_size < 4) + return loop_1_byte; + else if (expected_size != -1) + { + unsigned int i; + enum stringop_alg alg = libcall; + for (i = 0; i < NAX_STRINGOP_ALGS; i++) + { + gcc_assert (algs->size[i].max); + if (algs->size[i].max >= expected_size || algs->size[i].max == -1) + { + if (algs->size[i].alg != libcall) + alg = algs->size[i].alg; + /* Honor TARGET_INLINE_ALL_STRINGOPS by picking + last non-libcall inline algorithm. */ + if (TARGET_INLINE_ALL_STRINGOPS) + { + /* When the current size is best to be copied by a libcall, + but we are still forced to inline, run the heuristic bellow + that will pick code for medium sized blocks. */ + if (alg != libcall) + return alg; + break; + } + else + return algs->size[i].alg; + } + } + gcc_assert (TARGET_INLINE_ALL_STRINGOPS); + } + /* When asked to inline the call anyway, try to pick meaningful choice. + We look for maximal size of block that is faster to copy by hand and + take blocks of at most of that size guessing that average size will + be roughly half of the block. + + If this turns out to be bad, we might simply specify the preferred + choice in ix86_costs. 
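
decide_alg above consults the per-processor stringop_algs tables added at the top of this patch: each entry pairs a maximum block size with an algorithm, and the first entry whose max covers the expected size wins, with unknown_size as the fallback. A toy C model of that lookup; the table contents below are invented and do not describe any real CPU's tuning:

#include <stdio.h>

enum alg { alg_libcall, alg_loop, alg_unrolled_loop, alg_rep_byte, alg_rep_word };

struct entry { int max; enum alg alg; };        /* max == -1 means "no limit" */

static enum alg
pick_alg (const struct entry *table, int n, int expected_size, enum alg unknown)
{
  int i;
  if (expected_size < 0)
    return unknown;                     /* size completely unknown */
  for (i = 0; i < n; i++)
    if (table[i].max >= expected_size || table[i].max == -1)
      return table[i].alg;
  return unknown;
}

int
main (void)
{
  /* Hypothetical tuning: loop for tiny blocks, unrolled loop up to 1K,
     rep-prefixed word moves up to 8K, library call beyond that.  */
  static const struct entry tune[] =
    { { 128, alg_loop }, { 1024, alg_unrolled_loop }, { 8192, alg_rep_word } };

  printf ("%d %d %d\n",
          pick_alg (tune, 3, 64, alg_libcall),      /* 1 = alg_loop     */
          pick_alg (tune, 3, 4000, alg_libcall),    /* 4 = alg_rep_word */
          pick_alg (tune, 3, 100000, alg_libcall)); /* 0 = alg_libcall  */
  return 0;
}
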
*/ + if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY) + && algs->unknown_size == libcall) + { + int max = -1; + enum stringop_alg alg; + int i; + + for (i = 0; i < NAX_STRINGOP_ALGS; i++) + if (algs->size[i].alg != libcall && algs->size[i].alg) + max = algs->size[i].max; + if (max == -1) + max = 4096; + alg = decide_alg (count, max / 2, memset, dynamic_check); + gcc_assert (*dynamic_check == -1); + gcc_assert (alg != libcall); + if (TARGET_INLINE_STRINGOPS_DYNAMICALLY) + *dynamic_check = max; + return alg; + } + return algs->unknown_size; +} + +/* Decide on alignment. We know that the operand is already aligned to ALIGN + (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */ +static int +decide_alignment (int align, + enum stringop_alg alg, + int expected_size) +{ + int desired_align = 0; + switch (alg) + { + case no_stringop: + gcc_unreachable (); + case loop: + case unrolled_loop: + desired_align = GET_MODE_SIZE (Pmode); + break; + case rep_prefix_8_byte: + desired_align = 8; + break; + case rep_prefix_4_byte: + /* PentiumPro has special logic triggering for 8 byte aligned blocks. + copying whole cacheline at once. */ + if (TARGET_PENTIUMPRO) + desired_align = 8; + else + desired_align = 4; + break; + case rep_prefix_1_byte: + /* PentiumPro has special logic triggering for 8 byte aligned blocks. + copying whole cacheline at once. */ + if (TARGET_PENTIUMPRO) + desired_align = 8; + else + desired_align = 1; + break; + case loop_1_byte: + desired_align = 1; + break; + case libcall: + return 0; + } + + if (optimize_size) + desired_align = 1; + if (desired_align < align) + desired_align = align; + if (expected_size != -1 && expected_size < 4) + desired_align = align; + return desired_align; +} + +/* Return the smallest power of 2 greater than VAL. */ +static int +smallest_pow2_greater_than (int val) +{ + int ret = 1; + while (ret <= val) + ret <<= 1; + return ret; +} + +/* Expand string move (memcpy) operation. Use i386 string operations when + profitable. expand_clrmem contains similar code. The code depends upon + architecture, block size and alignment, but always has the same + overall structure: + + 1) Prologue guard: Conditional that jumps up to epilogues for small + blocks that can be handled by epilogue alone. This is faster but + also needed for correctness, since prologue assume the block is larger + than the desired alignment. + + Optional dynamic check for size and libcall for large + blocks is emitted here too, with -minline-stringops-dynamically. + + 2) Prologue: copy first few bytes in order to get destination aligned + to DESIRED_ALIGN. It is emitted only when ALIGN is less than + DESIRED_ALIGN and and up to DESIRED_ALIGN - ALIGN bytes can be copied. + We emit either a jump tree on power of two sized blocks, or a byte loop. + + 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks + with specified algorithm. + + 4) Epilogue: code copying tail of the block that is too small to be + handled by main body (or up to size guarded by prologue guard). 
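
The four steps enumerated in the comment above (prologue guard, alignment prologue, main body, epilogue) map directly onto an ordinary scalar memcpy skeleton. A compressed, purely illustrative C version; the real expander emits RTL for whichever algorithm decide_alg picked:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define CHUNK 16        /* stands in for size_needed */

static void
copy_block (unsigned char *dst, const unsigned char *src, size_t count)
{
  /* 1) Prologue guard: small blocks go straight to the epilogue, which
        also keeps the alignment prologue from running past the block.  */
  if (count >= CHUNK)
    {
      /* 2) Alignment prologue: byte copies until dst is CHUNK-aligned.  */
      while ((uintptr_t) dst & (CHUNK - 1))
        *dst++ = *src++, count--;

      /* 3) Main body: whole CHUNK-sized pieces.  */
      for (; count >= CHUNK; count -= CHUNK, dst += CHUNK, src += CHUNK)
        memcpy (dst, src, CHUNK);
    }

  /* 4) Epilogue: the remaining tail, fewer than CHUNK bytes.  */
  while (count--)
    *dst++ = *src++;
}

int
main (void)
{
  unsigned char a[100], b[100];
  memset (a, 7, sizeof a);
  copy_block (b, a, sizeof a);
  return memcmp (a, b, sizeof a) != 0;
}
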
*/ + int -ix86_expand_clrmem (rtx dst, rtx count_exp, rtx align_exp) +ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, + rtx expected_align_exp, rtx expected_size_exp) { - rtx destreg, zeroreg, countreg, destexp; - enum machine_mode counter_mode; - HOST_WIDE_INT align = 0; + rtx destreg; + rtx srcreg; + rtx label = NULL; + rtx tmp; + rtx jump_around_label = NULL; + HOST_WIDE_INT align = 1; unsigned HOST_WIDE_INT count = 0; + HOST_WIDE_INT expected_size = -1; + int size_needed = 0, epilogue_size_needed; + int desired_align = 0; + enum stringop_alg alg; + int dynamic_check; - if (GET_CODE (align_exp) == CONST_INT) + if (CONST_INT_P (align_exp)) align = INTVAL (align_exp); + /* i386 can do misaligned access on reasonably increased cost. */ + if (CONST_INT_P (expected_align_exp) + && INTVAL (expected_align_exp) > align) + align = INTVAL (expected_align_exp); + if (CONST_INT_P (count_exp)) + count = expected_size = INTVAL (count_exp); + if (CONST_INT_P (expected_size_exp) && count == 0) + expected_size = INTVAL (expected_size_exp); - /* Can't use any of this if the user has appropriated esi. */ - if (global_regs[4]) - return 0; + /* Step 0: Decide on preferred algorithm, desired alignment and + size of chunks to be copied by main loop. */ + + alg = decide_alg (count, expected_size, false, &dynamic_check); + desired_align = decide_alignment (align, alg, expected_size); - /* This simple hack avoids all inlining code and simplifies code below. */ if (!TARGET_ALIGN_STRINGOPS) - align = 32; + align = desired_align; - if (GET_CODE (count_exp) == CONST_INT) + if (alg == libcall) + return 0; + gcc_assert (alg != no_stringop); + if (!count) + count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp); + destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0)); + srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0)); + switch (alg) { - count = INTVAL (count_exp); - if (!TARGET_INLINE_ALL_STRINGOPS && count > 64) - return 0; + case libcall: + case no_stringop: + gcc_unreachable (); + case loop: + size_needed = GET_MODE_SIZE (Pmode); + break; + case unrolled_loop: + size_needed = GET_MODE_SIZE (Pmode) * (TARGET_64BIT ? 4 : 2); + break; + case rep_prefix_8_byte: + size_needed = 8; + break; + case rep_prefix_4_byte: + size_needed = 4; + break; + case rep_prefix_1_byte: + case loop_1_byte: + size_needed = 1; + break; } - /* Figure out proper mode for counter. For 32bits it is always SImode, - for 64bits use SImode when possible, otherwise DImode. - Set count to number of bytes copied when known at compile time. */ - if (!TARGET_64BIT - || GET_MODE (count_exp) == SImode - || x86_64_zext_immediate_operand (count_exp, VOIDmode)) - counter_mode = SImode; - else - counter_mode = DImode; - destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0)); - if (destreg != XEXP (dst, 0)) - dst = replace_equiv_address_nv (dst, destreg); + epilogue_size_needed = size_needed; + /* Step 1: Prologue guard. */ - /* When optimizing for size emit simple rep ; movsb instruction for - counts not divisible by 4. The movl $N, %ecx; rep; stosb - sequence is 7 bytes long, so if optimizing for size and count is - small enough that some stosl, stosw and stosb instructions without - rep are shorter, fall back into the next if. */ + /* Alignment code needs count to be in register. 
*/ + if (CONST_INT_P (count_exp) && desired_align > align) + { + enum machine_mode mode = SImode; + if (TARGET_64BIT && (count & ~0xffffffff)) + mode = DImode; + count_exp = force_reg (mode, count_exp); + } + gcc_assert (desired_align >= 1 && align >= 1); - if ((!optimize || optimize_size) - && (count == 0 - || ((count & 0x03) - && (!optimize_size || (count & 0x03) + (count >> 2) > 7)))) + /* Ensure that alignment prologue won't copy past end of block. */ + if ((size_needed > 1 || (desired_align > 1 && desired_align > align)) + && !count) { - emit_insn (gen_cld ()); + epilogue_size_needed = MAX (size_needed - 1, desired_align - align); - countreg = ix86_zero_extend_to_Pmode (count_exp); - zeroreg = copy_to_mode_reg (QImode, const0_rtx); - destexp = gen_rtx_PLUS (Pmode, destreg, countreg); - emit_insn (gen_rep_stos (destreg, countreg, dst, zeroreg, destexp)); + /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes. + Make sure it is power of 2. */ + epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed); + + label = gen_label_rtx (); + emit_cmp_and_jump_insns (count_exp, + GEN_INT (epilogue_size_needed), + LTU, 0, GET_MODE (count_exp), 1, label); + if (expected_size == -1 || expected_size < epilogue_size_needed) + predict_jump (REG_BR_PROB_BASE * 60 / 100); + else + predict_jump (REG_BR_PROB_BASE * 20 / 100); } - else if (count != 0 - && (align >= 8 - || (!TARGET_PENTIUMPRO && !TARGET_64BIT && align >= 4) - || optimize_size || count < (unsigned int) 64)) + /* Emit code to decide on runtime whether library call or inline should be + used. */ + if (dynamic_check != -1) { - int size = TARGET_64BIT && !optimize_size ? 8 : 4; - unsigned HOST_WIDE_INT offset = 0; + rtx hot_label = gen_label_rtx (); + jump_around_label = gen_label_rtx (); + emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1), + LEU, 0, GET_MODE (count_exp), 1, hot_label); + predict_jump (REG_BR_PROB_BASE * 90 / 100); + emit_block_move_via_libcall (dst, src, count_exp, false); + emit_jump (jump_around_label); + emit_label (hot_label); + } - emit_insn (gen_cld ()); + /* Step 2: Alignment prologue. */ - zeroreg = copy_to_mode_reg (size == 4 ? SImode : DImode, const0_rtx); - if (count & ~(size - 1)) - { - unsigned HOST_WIDE_INT repcount; - unsigned int max_nonrep; + if (desired_align > align) + { + /* Except for the first move in epilogue, we no longer know + constant offset in aliasing info. It don't seems to worth + the pain to maintain it for the first move, so throw away + the info early. */ + src = change_address (src, BLKmode, srcreg); + dst = change_address (dst, BLKmode, destreg); + expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align, + desired_align); + } + if (label && size_needed == 1) + { + emit_label (label); + LABEL_NUSES (label) = 1; + label = NULL; + } - repcount = count >> (size == 4 ? 2 : 3); - if (!TARGET_64BIT) - repcount &= 0x3fffffff; + /* Step 3: Main loop. */ - /* movl $N, %ecx; rep; stosl is 7 bytes, while N x stosl is N bytes. - movl $N, %ecx; rep; stosq is 8 bytes, while N x stosq is 2xN - bytes. In both cases the latter seems to be faster for small - values of N. */ - max_nonrep = size == 4 ? 
7 : 4; - if (!optimize_size) - switch (ix86_tune) - { - case PROCESSOR_PENTIUM4: - case PROCESSOR_NOCONA: - max_nonrep = 3; - break; - default: - break; - } + switch (alg) + { + case libcall: + case no_stringop: + gcc_unreachable (); + case loop_1_byte: + expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL, + count_exp, QImode, 1, expected_size); + break; + case loop: + expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL, + count_exp, Pmode, 1, expected_size); + break; + case unrolled_loop: + /* Unroll only by factor of 2 in 32bit mode, since we don't have enough + registers for 4 temporaries anyway. */ + expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL, + count_exp, Pmode, TARGET_64BIT ? 4 : 2, + expected_size); + break; + case rep_prefix_8_byte: + expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp, + DImode); + break; + case rep_prefix_4_byte: + expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp, + SImode); + break; + case rep_prefix_1_byte: + expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp, + QImode); + break; + } + /* Adjust properly the offset of src and dest memory for aliasing. */ + if (CONST_INT_P (count_exp)) + { + src = adjust_automodify_address_nv (src, BLKmode, srcreg, + (count / size_needed) * size_needed); + dst = adjust_automodify_address_nv (dst, BLKmode, destreg, + (count / size_needed) * size_needed); + } + else + { + src = change_address (src, BLKmode, srcreg); + dst = change_address (dst, BLKmode, destreg); + } - if (repcount <= max_nonrep) - while (repcount-- > 0) - { - rtx mem = adjust_automodify_address_nv (dst, - GET_MODE (zeroreg), - destreg, offset); - emit_insn (gen_strset (destreg, mem, zeroreg)); - offset += size; - } - else - { - countreg = copy_to_mode_reg (counter_mode, GEN_INT (repcount)); - countreg = ix86_zero_extend_to_Pmode (countreg); - destexp = gen_rtx_ASHIFT (Pmode, countreg, - GEN_INT (size == 4 ? 2 : 3)); - destexp = gen_rtx_PLUS (Pmode, destexp, destreg); - emit_insn (gen_rep_stos (destreg, countreg, dst, zeroreg, - destexp)); - offset = count & ~(size - 1); - } - } - if (size == 8 && (count & 0x04)) - { - rtx mem = adjust_automodify_address_nv (dst, SImode, destreg, - offset); - emit_insn (gen_strset (destreg, mem, - gen_rtx_SUBREG (SImode, zeroreg, 0))); - offset += 4; - } - if (count & 0x02) + /* Step 4: Epilogue to copy the remaining bytes. */ + + if (label) + { + /* When the main loop is done, COUNT_EXP might hold original count, + while we want to copy only COUNT_EXP & SIZE_NEEDED bytes. + Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED + bytes. Compensate if needed. */ + + if (size_needed < epilogue_size_needed) { - rtx mem = adjust_automodify_address_nv (dst, HImode, destreg, - offset); - emit_insn (gen_strset (destreg, mem, - gen_rtx_SUBREG (HImode, zeroreg, 0))); - offset += 2; + tmp = + expand_simple_binop (GET_MODE (count_exp), AND, count_exp, + GEN_INT (size_needed - 1), count_exp, 1, + OPTAB_DIRECT); + if (tmp != count_exp) + emit_move_insn (count_exp, tmp); } - if (count & 0x01) + emit_label (label); + LABEL_NUSES (label) = 1; + } + + if (count_exp != const0_rtx && epilogue_size_needed > 1) + expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp, + epilogue_size_needed); + if (jump_around_label) + emit_label (jump_around_label); + return 1; +} + +/* Helper function for memcpy. For QImode value 0xXY produce + 0xXYXYXYXY of wide specified by MODE. 
This is essentially + a * 0x10101010, but we can do slightly better than + synth_mult by unwinding the sequence by hand on CPUs with + slow multiply. */ +static rtx +promote_duplicated_reg (enum machine_mode mode, rtx val) +{ + enum machine_mode valmode = GET_MODE (val); + rtx tmp; + int nops = mode == DImode ? 3 : 2; + + gcc_assert (mode == SImode || mode == DImode); + if (val == const0_rtx) + return copy_to_mode_reg (mode, const0_rtx); + if (CONST_INT_P (val)) + { + HOST_WIDE_INT v = INTVAL (val) & 255; + + v |= v << 8; + v |= v << 16; + if (mode == DImode) + v |= (v << 16) << 16; + return copy_to_mode_reg (mode, gen_int_mode (v, mode)); + } + + if (valmode == VOIDmode) + valmode = QImode; + if (valmode != QImode) + val = gen_lowpart (QImode, val); + if (mode == QImode) + return val; + if (!TARGET_PARTIAL_REG_STALL) + nops--; + if (ix86_cost->mult_init[mode == DImode ? 3 : 2] + + ix86_cost->mult_bit * (mode == DImode ? 8 : 4) + <= (ix86_cost->shift_const + ix86_cost->add) * nops + + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0))) + { + rtx reg = convert_modes (mode, QImode, val, true); + tmp = promote_duplicated_reg (mode, const1_rtx); + return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1, + OPTAB_DIRECT); + } + else + { + rtx reg = convert_modes (mode, QImode, val, true); + + if (!TARGET_PARTIAL_REG_STALL) + if (mode == SImode) + emit_insn (gen_movsi_insv_1 (reg, reg)); + else + emit_insn (gen_movdi_insv_1_rex64 (reg, reg)); + else { - rtx mem = adjust_automodify_address_nv (dst, QImode, destreg, - offset); - emit_insn (gen_strset (destreg, mem, - gen_rtx_SUBREG (QImode, zeroreg, 0))); + tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8), + NULL, 1, OPTAB_DIRECT); + reg = + expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); } + tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16), + NULL, 1, OPTAB_DIRECT); + reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); + if (mode == SImode) + return reg; + tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32), + NULL, 1, OPTAB_DIRECT); + reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT); + return reg; + } +} + +/* Duplicate value VAL using promote_duplicated_reg into maximal size that will + be needed by main loop copying SIZE_NEEDED chunks and prologue getting + alignment from ALIGN to DESIRED_ALIGN. */ +static rtx +promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align) +{ + rtx promoted_val; + + if (TARGET_64BIT + && (size_needed > 4 || (desired_align > align && desired_align > 4))) + promoted_val = promote_duplicated_reg (DImode, val); + else if (size_needed > 2 || (desired_align > align && desired_align > 2)) + promoted_val = promote_duplicated_reg (SImode, val); + else if (size_needed > 1 || (desired_align > align && desired_align > 1)) + promoted_val = promote_duplicated_reg (HImode, val); + else + promoted_val = val; + + return promoted_val; +} + +/* Expand string clear operation (bzero). Use i386 string operations when + profitable. See expand_movmem comment for explanation of individual + steps performed. 
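
promote_duplicated_reg above broadcasts a QImode value 0xXY into 0xXYXYXYXY (or its 64-bit counterpart) either by multiplying by 0x010101... or, when multiplies are slow, by the unwound shift/or sequence its comment describes. Both variants in plain C, side by side, for illustration only:

#include <stdint.h>
#include <stdio.h>

static uint64_t
promote_by_mult (uint8_t byte)          /* the synth_mult-style variant */
{
  return (uint64_t) byte * 0x0101010101010101ULL;
}

static uint64_t
promote_by_shifts (uint8_t byte)        /* the unwound shift/or sequence */
{
  uint64_t v = byte;
  v |= v << 8;
  v |= v << 16;
  v |= v << 32;
  return v;
}

int
main (void)
{
  printf ("%016llx\n%016llx\n",
          (unsigned long long) promote_by_mult (0xab),
          (unsigned long long) promote_by_shifts (0xab));
  /* both print abababababababab */
  return 0;
}
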
*/ +int +ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp, + rtx expected_align_exp, rtx expected_size_exp) +{ + rtx destreg; + rtx label = NULL; + rtx tmp; + rtx jump_around_label = NULL; + HOST_WIDE_INT align = 1; + unsigned HOST_WIDE_INT count = 0; + HOST_WIDE_INT expected_size = -1; + int size_needed = 0, epilogue_size_needed; + int desired_align = 0; + enum stringop_alg alg; + rtx promoted_val = NULL; + bool force_loopy_epilogue = false; + int dynamic_check; + + if (CONST_INT_P (align_exp)) + align = INTVAL (align_exp); + /* i386 can do misaligned access on reasonably increased cost. */ + if (CONST_INT_P (expected_align_exp) + && INTVAL (expected_align_exp) > align) + align = INTVAL (expected_align_exp); + if (CONST_INT_P (count_exp)) + count = expected_size = INTVAL (count_exp); + if (CONST_INT_P (expected_size_exp) && count == 0) + expected_size = INTVAL (expected_size_exp); + + /* Step 0: Decide on preferred algorithm, desired alignment and + size of chunks to be copied by main loop. */ + + alg = decide_alg (count, expected_size, true, &dynamic_check); + desired_align = decide_alignment (align, alg, expected_size); + + if (!TARGET_ALIGN_STRINGOPS) + align = desired_align; + + if (alg == libcall) + return 0; + gcc_assert (alg != no_stringop); + if (!count) + count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp); + destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0)); + switch (alg) + { + case libcall: + case no_stringop: + gcc_unreachable (); + case loop: + size_needed = GET_MODE_SIZE (Pmode); + break; + case unrolled_loop: + size_needed = GET_MODE_SIZE (Pmode) * 4; + break; + case rep_prefix_8_byte: + size_needed = 8; + break; + case rep_prefix_4_byte: + size_needed = 4; + break; + case rep_prefix_1_byte: + case loop_1_byte: + size_needed = 1; + break; + } + epilogue_size_needed = size_needed; + + /* Step 1: Prologue guard. */ + + /* Alignment code needs count to be in register. */ + if (CONST_INT_P (count_exp) && desired_align > align) + { + enum machine_mode mode = SImode; + if (TARGET_64BIT && (count & ~0xffffffff)) + mode = DImode; + count_exp = force_reg (mode, count_exp); + } + /* Do the cheap promotion to allow better CSE across the + main loop and epilogue (ie one load of the big constant in the + front of all code. */ + if (CONST_INT_P (val_exp)) + promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, + desired_align, align); + /* Ensure that alignment prologue won't copy past end of block. */ + if ((size_needed > 1 || (desired_align > 1 && desired_align > align)) + && !count) + { + epilogue_size_needed = MAX (size_needed - 1, desired_align - align); + + /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes. + Make sure it is power of 2. */ + epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed); + + /* To improve performance of small blocks, we jump around the VAL + promoting mode. This mean that if the promoted VAL is not constant, + we might not use it in the epilogue and have to use byte + loop variant. 
*/ + if (epilogue_size_needed > 2 && !promoted_val) + force_loopy_epilogue = true; + label = gen_label_rtx (); + emit_cmp_and_jump_insns (count_exp, + GEN_INT (epilogue_size_needed), + LTU, 0, GET_MODE (count_exp), 1, label); + if (expected_size == -1 || expected_size <= epilogue_size_needed) + predict_jump (REG_BR_PROB_BASE * 60 / 100); + else + predict_jump (REG_BR_PROB_BASE * 20 / 100); + } + if (dynamic_check != -1) + { + rtx hot_label = gen_label_rtx (); + jump_around_label = gen_label_rtx (); + emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1), + LEU, 0, GET_MODE (count_exp), 1, hot_label); + predict_jump (REG_BR_PROB_BASE * 90 / 100); + set_storage_via_libcall (dst, count_exp, val_exp, false); + emit_jump (jump_around_label); + emit_label (hot_label); + } + + /* Step 2: Alignment prologue. */ + + /* Do the expensive promotion once we branched off the small blocks. */ + if (!promoted_val) + promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed, + desired_align, align); + gcc_assert (desired_align >= 1 && align >= 1); + + if (desired_align > align) + { + /* Except for the first move in epilogue, we no longer know + constant offset in aliasing info. It don't seems to worth + the pain to maintain it for the first move, so throw away + the info early. */ + dst = change_address (dst, BLKmode, destreg); + expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align, + desired_align); + } + if (label && size_needed == 1) + { + emit_label (label); + LABEL_NUSES (label) = 1; + label = NULL; + } + + /* Step 3: Main loop. */ + + switch (alg) + { + case libcall: + case no_stringop: + gcc_unreachable (); + case loop_1_byte: + expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val, + count_exp, QImode, 1, expected_size); + break; + case loop: + expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val, + count_exp, Pmode, 1, expected_size); + break; + case unrolled_loop: + expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val, + count_exp, Pmode, 4, expected_size); + break; + case rep_prefix_8_byte: + expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp, + DImode); + break; + case rep_prefix_4_byte: + expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp, + SImode); + break; + case rep_prefix_1_byte: + expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp, + QImode); + break; } + /* Adjust properly the offset of src and dest memory for aliasing. */ + if (CONST_INT_P (count_exp)) + dst = adjust_automodify_address_nv (dst, BLKmode, destreg, + (count / size_needed) * size_needed); else - { - rtx countreg2; - rtx label = NULL; - /* Compute desired alignment of the string operation. */ - int desired_alignment = (TARGET_PENTIUMPRO - && (count == 0 || count >= (unsigned int) 260) - ? 8 : UNITS_PER_WORD); - - /* In case we don't know anything about the alignment, default to - library version, since it is usually equally fast and result in - shorter code. - - Also emit call when we know that the count is large and call overhead - will not be important. */ - if (!TARGET_INLINE_ALL_STRINGOPS - && (align < UNITS_PER_WORD || !TARGET_REP_MOVL_OPTIMAL)) - return 0; - - if (TARGET_SINGLE_STRINGOP) - emit_insn (gen_cld ()); - - countreg2 = gen_reg_rtx (Pmode); - countreg = copy_to_mode_reg (counter_mode, count_exp); - zeroreg = copy_to_mode_reg (Pmode, const0_rtx); - /* Get rid of MEM_OFFSET, it won't be accurate. 
*/ - dst = change_address (dst, BLKmode, destreg); + dst = change_address (dst, BLKmode, destreg); - if (count == 0 && align < desired_alignment) - { - label = gen_label_rtx (); - emit_cmp_and_jump_insns (countreg, GEN_INT (desired_alignment - 1), - LEU, 0, counter_mode, 1, label); - } - if (align <= 1) - { - rtx label = ix86_expand_aligntest (destreg, 1); - emit_insn (gen_strset (destreg, dst, - gen_rtx_SUBREG (QImode, zeroreg, 0))); - ix86_adjust_counter (countreg, 1); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (align <= 2) - { - rtx label = ix86_expand_aligntest (destreg, 2); - emit_insn (gen_strset (destreg, dst, - gen_rtx_SUBREG (HImode, zeroreg, 0))); - ix86_adjust_counter (countreg, 2); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (align <= 4 && desired_alignment > 4) - { - rtx label = ix86_expand_aligntest (destreg, 4); - emit_insn (gen_strset (destreg, dst, - (TARGET_64BIT - ? gen_rtx_SUBREG (SImode, zeroreg, 0) - : zeroreg))); - ix86_adjust_counter (countreg, 4); - emit_label (label); - LABEL_NUSES (label) = 1; - } + /* Step 4: Epilogue to copy the remaining bytes. */ - if (label && desired_alignment > 4 && !TARGET_64BIT) - { - emit_label (label); - LABEL_NUSES (label) = 1; - label = NULL_RTX; - } + if (label) + { + /* When the main loop is done, COUNT_EXP might hold original count, + while we want to copy only COUNT_EXP & SIZE_NEEDED bytes. + Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED + bytes. Compensate if needed. */ - if (!TARGET_SINGLE_STRINGOP) - emit_insn (gen_cld ()); - if (TARGET_64BIT) + if (size_needed < desired_align - align) { - emit_insn (gen_lshrdi3 (countreg2, ix86_zero_extend_to_Pmode (countreg), - GEN_INT (3))); - destexp = gen_rtx_ASHIFT (Pmode, countreg2, GEN_INT (3)); + tmp = + expand_simple_binop (GET_MODE (count_exp), AND, count_exp, + GEN_INT (size_needed - 1), count_exp, 1, + OPTAB_DIRECT); + size_needed = desired_align - align + 1; + if (tmp != count_exp) + emit_move_insn (count_exp, tmp); } + emit_label (label); + LABEL_NUSES (label) = 1; + } + if (count_exp != const0_rtx && epilogue_size_needed > 1) + { + if (force_loopy_epilogue) + expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp, + size_needed); else - { - emit_insn (gen_lshrsi3 (countreg2, countreg, const2_rtx)); - destexp = gen_rtx_ASHIFT (Pmode, countreg2, const2_rtx); - } - destexp = gen_rtx_PLUS (Pmode, destexp, destreg); - emit_insn (gen_rep_stos (destreg, countreg2, dst, zeroreg, destexp)); - - if (label) - { - emit_label (label); - LABEL_NUSES (label) = 1; - } - - if (TARGET_64BIT && align > 4 && count != 0 && (count & 4)) - emit_insn (gen_strset (destreg, dst, - gen_rtx_SUBREG (SImode, zeroreg, 0))); - if (TARGET_64BIT && (align <= 4 || count == 0)) - { - rtx label = ix86_expand_aligntest (countreg, 4); - emit_insn (gen_strset (destreg, dst, - gen_rtx_SUBREG (SImode, zeroreg, 0))); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (align > 2 && count != 0 && (count & 2)) - emit_insn (gen_strset (destreg, dst, - gen_rtx_SUBREG (HImode, zeroreg, 0))); - if (align <= 2 || count == 0) - { - rtx label = ix86_expand_aligntest (countreg, 2); - emit_insn (gen_strset (destreg, dst, - gen_rtx_SUBREG (HImode, zeroreg, 0))); - emit_label (label); - LABEL_NUSES (label) = 1; - } - if (align > 1 && count != 0 && (count & 1)) - emit_insn (gen_strset (destreg, dst, - gen_rtx_SUBREG (QImode, zeroreg, 0))); - if (align <= 1 || count == 0) - { - rtx label = ix86_expand_aligntest (countreg, 1); - emit_insn (gen_strset (destreg, dst, - 
gen_rtx_SUBREG (QImode, zeroreg, 0))); - emit_label (label); - LABEL_NUSES (label) = 1; - } + expand_setmem_epilogue (dst, destreg, promoted_val, count_exp, + size_needed); } + if (jump_around_label) + emit_label (jump_around_label); return 1; } @@ -13113,7 +14578,7 @@ ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align) if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1 && !TARGET_INLINE_ALL_STRINGOPS && !optimize_size - && (GET_CODE (align) != CONST_INT || INTVAL (align) < 4)) + && (!CONST_INT_P (align) || INTVAL (align) < 4)) return 0; addr = force_reg (Pmode, XEXP (src, 0)); @@ -13152,7 +14617,6 @@ ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align) emit_move_insn (scratch3, addr); eoschar = force_reg (QImode, eoschar); - emit_insn (gen_cld ()); src = replace_equiv_address_nv (src, scratch3); /* If .md starts supporting :P, this can be done in .md. */ @@ -13199,7 +14663,7 @@ ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx) rtx cmp; align = 0; - if (GET_CODE (align_rtx) == CONST_INT) + if (CONST_INT_P (align_rtx)) align = INTVAL (align_rtx); /* Loop to check 1..3 bytes for null to get an aligned pointer. */ @@ -13408,7 +14872,7 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, { rtx addr; addr = copy_to_mode_reg (Pmode, XEXP (fnaddr, 0)); - fnaddr = gen_rtx_REG (Pmode, FIRST_REX_INT_REG + 3 /* R11 */); + fnaddr = gen_rtx_REG (Pmode, R11_REG); emit_move_insn (fnaddr, addr); fnaddr = gen_rtx_MEM (QImode, fnaddr); } @@ -13460,7 +14924,7 @@ assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n) for (s = ix86_stack_locals; s; s = s->next) if (s->mode == mode && s->n == n) - return s->rtl; + return copy_rtx (s->rtl); s = (struct stack_local_entry *) ggc_alloc (sizeof (struct stack_local_entry)); @@ -13644,7 +15108,7 @@ ix86_attr_length_address_default (rtx insn) extract_insn_cached (insn); for (i = recog_data.n_operands - 1; i >= 0; --i) - if (GET_CODE (recog_data.operand[i]) == MEM) + if (MEM_P (recog_data.operand[i])) { return memory_address_length (XEXP (recog_data.operand[i], 0)); break; @@ -13667,11 +15131,15 @@ ix86_issue_rate (void) case PROCESSOR_PENTIUM4: case PROCESSOR_ATHLON: case PROCESSOR_K8: + case PROCESSOR_AMDFAM10: case PROCESSOR_NOCONA: case PROCESSOR_GENERIC32: case PROCESSOR_GENERIC64: return 3; + case PROCESSOR_CORE2: + return 4; + default: return 1; } @@ -13708,7 +15176,7 @@ ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type) else return 0; - if (GET_CODE (set) != REG || REGNO (set) != FLAGS_REG) + if (!REG_P (set) || REGNO (set) != FLAGS_REG) return 0; /* This test is true if the dependent insn reads the flags but @@ -13747,7 +15215,7 @@ ix86_agi_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type) int i; extract_insn_cached (insn); for (i = recog_data.n_operands - 1; i >= 0; --i) - if (GET_CODE (recog_data.operand[i]) == MEM) + if (MEM_P (recog_data.operand[i])) { addr = XEXP (recog_data.operand[i], 0); goto found; @@ -13810,7 +15278,7 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) && (set = single_set (dep_insn)) != NULL_RTX && (set2 = single_set (insn)) != NULL_RTX && rtx_equal_p (SET_DEST (set), SET_SRC (set2)) - && GET_CODE (SET_DEST (set2)) == MEM) + && MEM_P (SET_DEST (set2))) cost += 1; /* Show ability of reorder buffer to hide latency of load by executing @@ -13862,6 +15330,7 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) case PROCESSOR_ATHLON: case PROCESSOR_K8: + case PROCESSOR_AMDFAM10: case PROCESSOR_GENERIC32: case 
PROCESSOR_GENERIC64: memory = get_attr_memory (insn); @@ -13948,7 +15417,7 @@ ix86_constant_alignment (tree exp, int align) int ix86_data_alignment (tree type, int align) { - int max_align = optimize_size ? BITS_PER_WORD : 256; + int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT); if (AGGREGATE_TYPE_P (type) && TYPE_SIZE (type) @@ -14574,6 +16043,14 @@ enum ix86_builtins IX86_BUILTIN_PABSW128, IX86_BUILTIN_PABSD128, + /* AMDFAM10 - SSE4A New Instructions. */ + IX86_BUILTIN_MOVNTSD, + IX86_BUILTIN_MOVNTSS, + IX86_BUILTIN_EXTRQI, + IX86_BUILTIN_EXTRQ, + IX86_BUILTIN_INSERTQI, + IX86_BUILTIN_INSERTQ, + IX86_BUILTIN_VEC_INIT_V2SI, IX86_BUILTIN_VEC_INIT_V4HI, IX86_BUILTIN_VEC_INIT_V8QI, @@ -14590,13 +16067,41 @@ enum ix86_builtins IX86_BUILTIN_MAX }; -#define def_builtin(MASK, NAME, TYPE, CODE) \ -do { \ - if ((MASK) & target_flags \ - && (!((MASK) & MASK_64BIT) || TARGET_64BIT)) \ - add_builtin_function ((NAME), (TYPE), (CODE), BUILT_IN_MD, \ - NULL, NULL_TREE); \ -} while (0) +/* Table for the ix86 builtin decls. */ +static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX]; + +/* Add a ix86 target builtin function with CODE, NAME and TYPE. Do so, + * if the target_flags include one of MASK. Stores the function decl + * in the ix86_builtins array. + * Returns the function decl or NULL_TREE, if the builtin was not added. */ + +static inline tree +def_builtin (int mask, const char *name, tree type, enum ix86_builtins code) +{ + tree decl = NULL_TREE; + + if (mask & target_flags + && (!(mask & MASK_64BIT) || TARGET_64BIT)) + { + decl = add_builtin_function (name, type, code, BUILT_IN_MD, + NULL, NULL_TREE); + ix86_builtins[(int) code] = decl; + } + + return decl; +} + +/* Like def_builtin, but also marks the function decl "const". */ + +static inline tree +def_builtin_const (int mask, const char *name, tree type, + enum ix86_builtins code) +{ + tree decl = def_builtin (mask, name, type, code); + if (decl) + TREE_READONLY (decl) = 1; + return decl; +} /* Bits for builtin_description.flag. 
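Because def_builtin_const above marks the returned decl TREE_READONLY, repeated calls to such a builtin with the same argument can be combined by later optimizers. A hedged usage sketch, assuming an SSE target and the __builtin_ia32_sqrtps and __builtin_ia32_addps builtins registered elsewhere in this file:

    typedef float v4sf __attribute__ ((vector_size (16)));

    v4sf
    twice_sqrt (v4sf x)
    {
      v4sf a = __builtin_ia32_sqrtps (x);
      v4sf b = __builtin_ia32_sqrtps (x);  // same input: the first result can be reused
      return __builtin_ia32_addps (a, b);
    }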
*/ @@ -14849,7 +16354,7 @@ static const struct builtin_description bdesc_2arg[] = { MASK_MMX, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, 0, 0 }, { MASK_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, 0, 0 }, - { MASK_SSE2, CODE_FOR_sse2_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, 0, 0 }, + { MASK_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, 0, 0 }, { MASK_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, 0, 0 }, { MASK_SSE2, CODE_FOR_sse2_nandv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, 0, 0 }, @@ -14884,7 +16389,7 @@ static const struct builtin_description bdesc_2arg[] = { MASK_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, 0, 0 }, { MASK_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, 0, 0 }, - { MASK_SSE2, CODE_FOR_sse2_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, 0, 0 }, + { MASK_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, 0, 0 }, { MASK_SSE2, CODE_FOR_sse2_psadbw, 0, IX86_BUILTIN_PSADBW128, 0, 0 }, { MASK_SSE2, CODE_FOR_sse2_umulsidi3, 0, IX86_BUILTIN_PMULUDQ, 0, 0 }, @@ -14985,8 +16490,8 @@ static const struct builtin_description bdesc_1arg[] = { MASK_SSE2, CODE_FOR_sse2_cvttps2dq, 0, IX86_BUILTIN_CVTTPS2DQ, 0, 0 }, /* SSE3 */ - { MASK_SSE3, CODE_FOR_sse3_movshdup, 0, IX86_BUILTIN_MOVSHDUP, 0, 0 }, - { MASK_SSE3, CODE_FOR_sse3_movsldup, 0, IX86_BUILTIN_MOVSLDUP, 0, 0 }, + { MASK_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, 0, 0 }, + { MASK_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, 0, 0 }, /* SSSE3 */ { MASK_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, 0, 0 }, @@ -15013,7 +16518,7 @@ ix86_init_mmx_sse_builtins (void) const struct builtin_description * d; size_t i; - tree V16QI_type_node = build_vector_type_for_mode (intQI_type_node, V16QImode); + tree V16QI_type_node = build_vector_type_for_mode (char_type_node, V16QImode); tree V2SI_type_node = build_vector_type_for_mode (intSI_type_node, V2SImode); tree V2SF_type_node = build_vector_type_for_mode (float_type_node, V2SFmode); tree V2DI_type_node @@ -15022,7 +16527,7 @@ ix86_init_mmx_sse_builtins (void) tree V4SF_type_node = build_vector_type_for_mode (float_type_node, V4SFmode); tree V4SI_type_node = build_vector_type_for_mode (intSI_type_node, V4SImode); tree V4HI_type_node = build_vector_type_for_mode (intHI_type_node, V4HImode); - tree V8QI_type_node = build_vector_type_for_mode (intQI_type_node, V8QImode); + tree V8QI_type_node = build_vector_type_for_mode (char_type_node, V8QImode); tree V8HI_type_node = build_vector_type_for_mode (intHI_type_node, V8HImode); tree pchar_type_node = build_pointer_type (char_type_node); @@ -15303,6 +16808,18 @@ ix86_init_mmx_sse_builtins (void) = build_function_type_list (void_type_node, pchar_type_node, V16QI_type_node, NULL_TREE); + tree v2di_ftype_v2di_unsigned_unsigned + = build_function_type_list (V2DI_type_node, V2DI_type_node, + unsigned_type_node, unsigned_type_node, + NULL_TREE); + tree v2di_ftype_v2di_v2di_unsigned_unsigned + = build_function_type_list (V2DI_type_node, V2DI_type_node, V2DI_type_node, + unsigned_type_node, unsigned_type_node, + NULL_TREE); + tree v2di_ftype_v2di_v16qi + = build_function_type_list (V2DI_type_node, V2DI_type_node, V16QI_type_node, + 
NULL_TREE); + tree float80_type; tree float128_type; tree ftype; @@ -15463,15 +16980,15 @@ ix86_init_mmx_sse_builtins (void) def_builtin (MASK_SSE, "__builtin_ia32_ldmxcsr", void_ftype_unsigned, IX86_BUILTIN_LDMXCSR); def_builtin (MASK_SSE, "__builtin_ia32_stmxcsr", unsigned_ftype_void, IX86_BUILTIN_STMXCSR); - def_builtin (MASK_SSE, "__builtin_ia32_cvtpi2ps", v4sf_ftype_v4sf_v2si, IX86_BUILTIN_CVTPI2PS); - def_builtin (MASK_SSE, "__builtin_ia32_cvtps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTPS2PI); - def_builtin (MASK_SSE, "__builtin_ia32_cvtsi2ss", v4sf_ftype_v4sf_int, IX86_BUILTIN_CVTSI2SS); - def_builtin (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvtsi642ss", v4sf_ftype_v4sf_int64, IX86_BUILTIN_CVTSI642SS); - def_builtin (MASK_SSE, "__builtin_ia32_cvtss2si", int_ftype_v4sf, IX86_BUILTIN_CVTSS2SI); - def_builtin (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvtss2si64", int64_ftype_v4sf, IX86_BUILTIN_CVTSS2SI64); - def_builtin (MASK_SSE, "__builtin_ia32_cvttps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTTPS2PI); - def_builtin (MASK_SSE, "__builtin_ia32_cvttss2si", int_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI); - def_builtin (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvttss2si64", int64_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI64); + def_builtin_const (MASK_SSE, "__builtin_ia32_cvtpi2ps", v4sf_ftype_v4sf_v2si, IX86_BUILTIN_CVTPI2PS); + def_builtin_const (MASK_SSE, "__builtin_ia32_cvtps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTPS2PI); + def_builtin_const (MASK_SSE, "__builtin_ia32_cvtsi2ss", v4sf_ftype_v4sf_int, IX86_BUILTIN_CVTSI2SS); + def_builtin_const (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvtsi642ss", v4sf_ftype_v4sf_int64, IX86_BUILTIN_CVTSI642SS); + def_builtin_const (MASK_SSE, "__builtin_ia32_cvtss2si", int_ftype_v4sf, IX86_BUILTIN_CVTSS2SI); + def_builtin_const (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvtss2si64", int64_ftype_v4sf, IX86_BUILTIN_CVTSS2SI64); + def_builtin_const (MASK_SSE, "__builtin_ia32_cvttps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTTPS2PI); + def_builtin_const (MASK_SSE, "__builtin_ia32_cvttss2si", int_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI); + def_builtin_const (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvttss2si64", int64_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI64); def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_maskmovq", void_ftype_v8qi_v8qi_pchar, IX86_BUILTIN_MASKMOVQ); @@ -15496,8 +17013,8 @@ ix86_init_mmx_sse_builtins (void) def_builtin (MASK_SSE, "__builtin_ia32_rcpss", v4sf_ftype_v4sf, IX86_BUILTIN_RCPSS); def_builtin (MASK_SSE, "__builtin_ia32_rsqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTPS); def_builtin (MASK_SSE, "__builtin_ia32_rsqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTSS); - def_builtin (MASK_SSE, "__builtin_ia32_sqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTPS); - def_builtin (MASK_SSE, "__builtin_ia32_sqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTSS); + def_builtin_const (MASK_SSE, "__builtin_ia32_sqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTPS); + def_builtin_const (MASK_SSE, "__builtin_ia32_sqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTSS); def_builtin (MASK_SSE, "__builtin_ia32_shufps", v4sf_ftype_v4sf_v4sf_int, IX86_BUILTIN_SHUFPS); @@ -15551,35 +17068,35 @@ ix86_init_mmx_sse_builtins (void) def_builtin (MASK_SSE2, "__builtin_ia32_pshufhw", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSHUFHW); def_builtin (MASK_SSE2, "__builtin_ia32_psadbw128", v2di_ftype_v16qi_v16qi, IX86_BUILTIN_PSADBW128); - def_builtin (MASK_SSE2, "__builtin_ia32_sqrtpd", v2df_ftype_v2df, IX86_BUILTIN_SQRTPD); - def_builtin (MASK_SSE2, "__builtin_ia32_sqrtsd", v2df_ftype_v2df, IX86_BUILTIN_SQRTSD); + def_builtin_const (MASK_SSE2, 
"__builtin_ia32_sqrtpd", v2df_ftype_v2df, IX86_BUILTIN_SQRTPD); + def_builtin_const (MASK_SSE2, "__builtin_ia32_sqrtsd", v2df_ftype_v2df, IX86_BUILTIN_SQRTSD); def_builtin (MASK_SSE2, "__builtin_ia32_shufpd", v2df_ftype_v2df_v2df_int, IX86_BUILTIN_SHUFPD); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtdq2pd", v2df_ftype_v4si, IX86_BUILTIN_CVTDQ2PD); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtdq2ps", v4sf_ftype_v4si, IX86_BUILTIN_CVTDQ2PS); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtdq2pd", v2df_ftype_v4si, IX86_BUILTIN_CVTDQ2PD); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtdq2ps", v4sf_ftype_v4si, IX86_BUILTIN_CVTDQ2PS); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtpd2dq", v4si_ftype_v2df, IX86_BUILTIN_CVTPD2DQ); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtpd2pi", v2si_ftype_v2df, IX86_BUILTIN_CVTPD2PI); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtpd2ps", v4sf_ftype_v2df, IX86_BUILTIN_CVTPD2PS); - def_builtin (MASK_SSE2, "__builtin_ia32_cvttpd2dq", v4si_ftype_v2df, IX86_BUILTIN_CVTTPD2DQ); - def_builtin (MASK_SSE2, "__builtin_ia32_cvttpd2pi", v2si_ftype_v2df, IX86_BUILTIN_CVTTPD2PI); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpd2dq", v4si_ftype_v2df, IX86_BUILTIN_CVTPD2DQ); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpd2pi", v2si_ftype_v2df, IX86_BUILTIN_CVTPD2PI); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpd2ps", v4sf_ftype_v2df, IX86_BUILTIN_CVTPD2PS); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttpd2dq", v4si_ftype_v2df, IX86_BUILTIN_CVTTPD2DQ); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttpd2pi", v2si_ftype_v2df, IX86_BUILTIN_CVTTPD2PI); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtpi2pd", v2df_ftype_v2si, IX86_BUILTIN_CVTPI2PD); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpi2pd", v2df_ftype_v2si, IX86_BUILTIN_CVTPI2PD); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtsd2si", int_ftype_v2df, IX86_BUILTIN_CVTSD2SI); - def_builtin (MASK_SSE2, "__builtin_ia32_cvttsd2si", int_ftype_v2df, IX86_BUILTIN_CVTTSD2SI); - def_builtin (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvtsd2si64", int64_ftype_v2df, IX86_BUILTIN_CVTSD2SI64); - def_builtin (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvttsd2si64", int64_ftype_v2df, IX86_BUILTIN_CVTTSD2SI64); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtsd2si", int_ftype_v2df, IX86_BUILTIN_CVTSD2SI); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttsd2si", int_ftype_v2df, IX86_BUILTIN_CVTTSD2SI); + def_builtin_const (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvtsd2si64", int64_ftype_v2df, IX86_BUILTIN_CVTSD2SI64); + def_builtin_const (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvttsd2si64", int64_ftype_v2df, IX86_BUILTIN_CVTTSD2SI64); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtps2dq", v4si_ftype_v4sf, IX86_BUILTIN_CVTPS2DQ); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtps2pd", v2df_ftype_v4sf, IX86_BUILTIN_CVTPS2PD); - def_builtin (MASK_SSE2, "__builtin_ia32_cvttps2dq", v4si_ftype_v4sf, IX86_BUILTIN_CVTTPS2DQ); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtps2dq", v4si_ftype_v4sf, IX86_BUILTIN_CVTPS2DQ); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtps2pd", v2df_ftype_v4sf, IX86_BUILTIN_CVTPS2PD); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttps2dq", v4si_ftype_v4sf, IX86_BUILTIN_CVTTPS2DQ); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtsi2sd", v2df_ftype_v2df_int, IX86_BUILTIN_CVTSI2SD); - def_builtin (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvtsi642sd", v2df_ftype_v2df_int64, IX86_BUILTIN_CVTSI642SD); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtsd2ss", v4sf_ftype_v4sf_v2df, 
IX86_BUILTIN_CVTSD2SS); - def_builtin (MASK_SSE2, "__builtin_ia32_cvtss2sd", v2df_ftype_v2df_v4sf, IX86_BUILTIN_CVTSS2SD); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtsi2sd", v2df_ftype_v2df_int, IX86_BUILTIN_CVTSI2SD); + def_builtin_const (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvtsi642sd", v2df_ftype_v2df_int64, IX86_BUILTIN_CVTSI642SD); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtsd2ss", v4sf_ftype_v4sf_v2df, IX86_BUILTIN_CVTSD2SS); + def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtss2sd", v2df_ftype_v2df_v4sf, IX86_BUILTIN_CVTSS2SD); def_builtin (MASK_SSE2, "__builtin_ia32_clflush", void_ftype_pcvoid, IX86_BUILTIN_CLFLUSH); def_builtin (MASK_SSE2, "__builtin_ia32_lfence", void_ftype_void, IX86_BUILTIN_LFENCE); @@ -15624,12 +17141,6 @@ ix86_init_mmx_sse_builtins (void) def_builtin (MASK_SSE3, "__builtin_ia32_mwait", void_ftype_unsigned_unsigned, IX86_BUILTIN_MWAIT); - def_builtin (MASK_SSE3, "__builtin_ia32_movshdup", - v4sf_ftype_v4sf, - IX86_BUILTIN_MOVSHDUP); - def_builtin (MASK_SSE3, "__builtin_ia32_movsldup", - v4sf_ftype_v4sf, - IX86_BUILTIN_MOVSLDUP); def_builtin (MASK_SSE3, "__builtin_ia32_lddqu", v16qi_ftype_pcchar, IX86_BUILTIN_LDDQU); @@ -15639,6 +17150,20 @@ ix86_init_mmx_sse_builtins (void) def_builtin (MASK_SSSE3, "__builtin_ia32_palignr", di_ftype_di_di_int, IX86_BUILTIN_PALIGNR); + /* AMDFAM10 SSE4A New built-ins */ + def_builtin (MASK_SSE4A, "__builtin_ia32_movntsd", + void_ftype_pdouble_v2df, IX86_BUILTIN_MOVNTSD); + def_builtin (MASK_SSE4A, "__builtin_ia32_movntss", + void_ftype_pfloat_v4sf, IX86_BUILTIN_MOVNTSS); + def_builtin (MASK_SSE4A, "__builtin_ia32_extrqi", + v2di_ftype_v2di_unsigned_unsigned, IX86_BUILTIN_EXTRQI); + def_builtin (MASK_SSE4A, "__builtin_ia32_extrq", + v2di_ftype_v2di_v16qi, IX86_BUILTIN_EXTRQ); + def_builtin (MASK_SSE4A, "__builtin_ia32_insertqi", + v2di_ftype_v2di_v2di_unsigned_unsigned, IX86_BUILTIN_INSERTQI); + def_builtin (MASK_SSE4A, "__builtin_ia32_insertq", + v2di_ftype_v2di_v2di, IX86_BUILTIN_INSERTQ); + /* Access to the vec_init patterns. */ ftype = build_function_type_list (V2SI_type_node, integer_type_node, integer_type_node, NULL_TREE); @@ -15725,11 +17250,11 @@ safe_vector_operand (rtx x, enum machine_mode mode) /* Subroutine of ix86_expand_builtin to take care of binop insns. */ static rtx -ix86_expand_binop_builtin (enum insn_code icode, tree arglist, rtx target) +ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target) { rtx pat, xops[3]; - tree arg0 = TREE_VALUE (arglist); - tree arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); rtx op0 = expand_normal (arg0); rtx op1 = expand_normal (arg1); enum machine_mode tmode = insn_data[icode].operand[0].mode; @@ -15793,11 +17318,11 @@ ix86_expand_binop_builtin (enum insn_code icode, tree arglist, rtx target) /* Subroutine of ix86_expand_builtin to take care of stores. */ static rtx -ix86_expand_store_builtin (enum insn_code icode, tree arglist) +ix86_expand_store_builtin (enum insn_code icode, tree exp) { rtx pat; - tree arg0 = TREE_VALUE (arglist); - tree arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); rtx op0 = expand_normal (arg0); rtx op1 = expand_normal (arg1); enum machine_mode mode0 = insn_data[icode].operand[0].mode; @@ -15818,11 +17343,11 @@ ix86_expand_store_builtin (enum insn_code icode, tree arglist) /* Subroutine of ix86_expand_builtin to take care of unop insns. 
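The SSE4A builtins registered above map directly onto the new AMDFAM10 instructions; a minimal usage sketch for the non-temporal scalar store, assuming -msse4a (the function and parameter names here are illustrative only):

    typedef double v2df __attribute__ ((vector_size (16)));

    void
    store_low_double_nt (double *p, v2df x)
    {
      // SSE4A movntsd: non-temporal store of the low element of x to *p.
      __builtin_ia32_movntsd (p, x);
    }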
*/ static rtx -ix86_expand_unop_builtin (enum insn_code icode, tree arglist, +ix86_expand_unop_builtin (enum insn_code icode, tree exp, rtx target, int do_load) { rtx pat; - tree arg0 = TREE_VALUE (arglist); + tree arg0 = CALL_EXPR_ARG (exp, 0); rtx op0 = expand_normal (arg0); enum machine_mode tmode = insn_data[icode].operand[0].mode; enum machine_mode mode0 = insn_data[icode].operand[1].mode; @@ -15854,10 +17379,10 @@ ix86_expand_unop_builtin (enum insn_code icode, tree arglist, sqrtss, rsqrtss, rcpss. */ static rtx -ix86_expand_unop1_builtin (enum insn_code icode, tree arglist, rtx target) +ix86_expand_unop1_builtin (enum insn_code icode, tree exp, rtx target) { rtx pat; - tree arg0 = TREE_VALUE (arglist); + tree arg0 = CALL_EXPR_ARG (exp, 0); rtx op1, op0 = expand_normal (arg0); enum machine_mode tmode = insn_data[icode].operand[0].mode; enum machine_mode mode0 = insn_data[icode].operand[1].mode; @@ -15888,12 +17413,12 @@ ix86_expand_unop1_builtin (enum insn_code icode, tree arglist, rtx target) /* Subroutine of ix86_expand_builtin to take care of comparison insns. */ static rtx -ix86_expand_sse_compare (const struct builtin_description *d, tree arglist, +ix86_expand_sse_compare (const struct builtin_description *d, tree exp, rtx target) { rtx pat; - tree arg0 = TREE_VALUE (arglist); - tree arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); rtx op0 = expand_normal (arg0); rtx op1 = expand_normal (arg1); rtx op2; @@ -15940,12 +17465,12 @@ ix86_expand_sse_compare (const struct builtin_description *d, tree arglist, /* Subroutine of ix86_expand_builtin to take care of comi insns. */ static rtx -ix86_expand_sse_comi (const struct builtin_description *d, tree arglist, +ix86_expand_sse_comi (const struct builtin_description *d, tree exp, rtx target) { rtx pat; - tree arg0 = TREE_VALUE (arglist); - tree arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); rtx op0 = expand_normal (arg0); rtx op1 = expand_normal (arg1); rtx op2; @@ -16020,7 +17545,7 @@ get_element_number (tree vec_type, tree arg) these sorts of instructions. */ static rtx -ix86_expand_vec_init_builtin (tree type, tree arglist, rtx target) +ix86_expand_vec_init_builtin (tree type, tree exp, rtx target) { enum machine_mode tmode = TYPE_MODE (type); enum machine_mode inner_mode = GET_MODE_INNER (tmode); @@ -16028,15 +17553,14 @@ ix86_expand_vec_init_builtin (tree type, tree arglist, rtx target) rtvec v = rtvec_alloc (n_elt); gcc_assert (VECTOR_MODE_P (tmode)); + gcc_assert (call_expr_nargs (exp) == n_elt); - for (i = 0; i < n_elt; ++i, arglist = TREE_CHAIN (arglist)) + for (i = 0; i < n_elt; ++i) { - rtx x = expand_normal (TREE_VALUE (arglist)); + rtx x = expand_normal (CALL_EXPR_ARG (exp, i)); RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x); } - gcc_assert (arglist == NULL); - if (!target || !register_operand (target, tmode)) target = gen_reg_rtx (tmode); @@ -16049,15 +17573,15 @@ ix86_expand_vec_init_builtin (tree type, tree arglist, rtx target) had a language-level syntax for referencing vector elements. 
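The vec_ext expanders below insist on a constant element index (get_element_number rejects anything else); a small usage sketch, assuming the __builtin_ia32_vec_ext_v4sf spelling under which these builtins are registered:

    typedef float v4sf __attribute__ ((vector_size (16)));

    float
    third_element (v4sf x)
    {
      // The element index must be an integer constant; here element 2.
      return __builtin_ia32_vec_ext_v4sf (x, 2);
    }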
*/ static rtx -ix86_expand_vec_ext_builtin (tree arglist, rtx target) +ix86_expand_vec_ext_builtin (tree exp, rtx target) { enum machine_mode tmode, mode0; tree arg0, arg1; int elt; rtx op0; - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); op0 = expand_normal (arg0); elt = get_element_number (TREE_TYPE (arg0), arg1); @@ -16081,16 +17605,16 @@ ix86_expand_vec_ext_builtin (tree arglist, rtx target) a language-level syntax for referencing vector elements. */ static rtx -ix86_expand_vec_set_builtin (tree arglist) +ix86_expand_vec_set_builtin (tree exp) { enum machine_mode tmode, mode1; tree arg0, arg1, arg2; int elt; rtx op0, op1; - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); - arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist))); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); tmode = TYPE_MODE (TREE_TYPE (arg0)); mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0))); @@ -16125,11 +17649,10 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, const struct builtin_description *d; size_t i; enum insn_code icode; - tree fndecl = TREE_OPERAND (TREE_OPERAND (exp, 0), 0); - tree arglist = TREE_OPERAND (exp, 1); - tree arg0, arg1, arg2; - rtx op0, op1, op2, pat; - enum machine_mode tmode, mode0, mode1, mode2, mode3; + tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); + tree arg0, arg1, arg2, arg3; + rtx op0, op1, op2, op3, pat; + enum machine_mode tmode, mode0, mode1, mode2, mode3, mode4; unsigned int fcode = DECL_FUNCTION_CODE (fndecl); switch (fcode) @@ -16148,9 +17671,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, ? CODE_FOR_mmx_maskmovq : CODE_FOR_sse2_maskmovdqu); /* Note the arg order is different from the operand order. */ - arg1 = TREE_VALUE (arglist); - arg2 = TREE_VALUE (TREE_CHAIN (arglist)); - arg0 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist))); + arg1 = CALL_EXPR_ARG (exp, 0); + arg2 = CALL_EXPR_ARG (exp, 1); + arg0 = CALL_EXPR_ARG (exp, 2); op0 = expand_normal (arg0); op1 = expand_normal (arg1); op2 = expand_normal (arg2); @@ -16174,17 +17697,17 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, return 0; case IX86_BUILTIN_SQRTSS: - return ix86_expand_unop1_builtin (CODE_FOR_sse_vmsqrtv4sf2, arglist, target); + return ix86_expand_unop1_builtin (CODE_FOR_sse_vmsqrtv4sf2, exp, target); case IX86_BUILTIN_RSQRTSS: - return ix86_expand_unop1_builtin (CODE_FOR_sse_vmrsqrtv4sf2, arglist, target); + return ix86_expand_unop1_builtin (CODE_FOR_sse_vmrsqrtv4sf2, exp, target); case IX86_BUILTIN_RCPSS: - return ix86_expand_unop1_builtin (CODE_FOR_sse_vmrcpv4sf2, arglist, target); + return ix86_expand_unop1_builtin (CODE_FOR_sse_vmrcpv4sf2, exp, target); case IX86_BUILTIN_LOADUPS: - return ix86_expand_unop_builtin (CODE_FOR_sse_movups, arglist, target, 1); + return ix86_expand_unop_builtin (CODE_FOR_sse_movups, exp, target, 1); case IX86_BUILTIN_STOREUPS: - return ix86_expand_store_builtin (CODE_FOR_sse_movups, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse_movups, exp); case IX86_BUILTIN_LOADHPS: case IX86_BUILTIN_LOADLPS: @@ -16194,8 +17717,8 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, : fcode == IX86_BUILTIN_LOADLPS ? CODE_FOR_sse_loadlps : fcode == IX86_BUILTIN_LOADHPD ? 
CODE_FOR_sse2_loadhpd : CODE_FOR_sse2_loadlpd); - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); op0 = expand_normal (arg0); op1 = expand_normal (arg1); tmode = insn_data[icode].operand[0].mode; @@ -16218,8 +17741,8 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, case IX86_BUILTIN_STORELPS: icode = (fcode == IX86_BUILTIN_STOREHPS ? CODE_FOR_sse_storehps : CODE_FOR_sse_storelps); - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); op0 = expand_normal (arg0); op1 = expand_normal (arg1); mode0 = insn_data[icode].operand[0].mode; @@ -16235,12 +17758,12 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, return const0_rtx; case IX86_BUILTIN_MOVNTPS: - return ix86_expand_store_builtin (CODE_FOR_sse_movntv4sf, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse_movntv4sf, exp); case IX86_BUILTIN_MOVNTQ: - return ix86_expand_store_builtin (CODE_FOR_sse_movntdi, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse_movntdi, exp); case IX86_BUILTIN_LDMXCSR: - op0 = expand_normal (TREE_VALUE (arglist)); + op0 = expand_normal (CALL_EXPR_ARG (exp, 0)); target = assign_386_stack_local (SImode, SLOT_TEMP); emit_move_insn (target, op0); emit_insn (gen_sse_ldmxcsr (target)); @@ -16256,9 +17779,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, icode = (fcode == IX86_BUILTIN_SHUFPS ? CODE_FOR_sse_shufps : CODE_FOR_sse2_shufpd); - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); - arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist))); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); op0 = expand_normal (arg0); op1 = expand_normal (arg1); op2 = expand_normal (arg2); @@ -16296,8 +17819,8 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, : fcode == IX86_BUILTIN_PSHUFLW ? CODE_FOR_sse2_pshuflw : fcode == IX86_BUILTIN_PSHUFD ? CODE_FOR_sse2_pshufd : CODE_FOR_mmx_pshufw); - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); op0 = expand_normal (arg0); op1 = expand_normal (arg1); tmode = insn_data[icode].operand[0].mode; @@ -16326,8 +17849,8 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, case IX86_BUILTIN_PSRLDQI128: icode = ( fcode == IX86_BUILTIN_PSLLDQI128 ? 
CODE_FOR_sse2_ashlti3 : CODE_FOR_sse2_lshrti3); - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); op0 = expand_normal (arg0); op1 = expand_normal (arg1); tmode = insn_data[icode].operand[0].mode; @@ -16356,86 +17879,86 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, return NULL_RTX; case IX86_BUILTIN_PAVGUSB: - return ix86_expand_binop_builtin (CODE_FOR_mmx_uavgv8qi3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_uavgv8qi3, exp, target); case IX86_BUILTIN_PF2ID: - return ix86_expand_unop_builtin (CODE_FOR_mmx_pf2id, arglist, target, 0); + return ix86_expand_unop_builtin (CODE_FOR_mmx_pf2id, exp, target, 0); case IX86_BUILTIN_PFACC: - return ix86_expand_binop_builtin (CODE_FOR_mmx_haddv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_haddv2sf3, exp, target); case IX86_BUILTIN_PFADD: - return ix86_expand_binop_builtin (CODE_FOR_mmx_addv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_addv2sf3, exp, target); case IX86_BUILTIN_PFCMPEQ: - return ix86_expand_binop_builtin (CODE_FOR_mmx_eqv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_eqv2sf3, exp, target); case IX86_BUILTIN_PFCMPGE: - return ix86_expand_binop_builtin (CODE_FOR_mmx_gev2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_gev2sf3, exp, target); case IX86_BUILTIN_PFCMPGT: - return ix86_expand_binop_builtin (CODE_FOR_mmx_gtv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_gtv2sf3, exp, target); case IX86_BUILTIN_PFMAX: - return ix86_expand_binop_builtin (CODE_FOR_mmx_smaxv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_smaxv2sf3, exp, target); case IX86_BUILTIN_PFMIN: - return ix86_expand_binop_builtin (CODE_FOR_mmx_sminv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_sminv2sf3, exp, target); case IX86_BUILTIN_PFMUL: - return ix86_expand_binop_builtin (CODE_FOR_mmx_mulv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_mulv2sf3, exp, target); case IX86_BUILTIN_PFRCP: - return ix86_expand_unop_builtin (CODE_FOR_mmx_rcpv2sf2, arglist, target, 0); + return ix86_expand_unop_builtin (CODE_FOR_mmx_rcpv2sf2, exp, target, 0); case IX86_BUILTIN_PFRCPIT1: - return ix86_expand_binop_builtin (CODE_FOR_mmx_rcpit1v2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_rcpit1v2sf3, exp, target); case IX86_BUILTIN_PFRCPIT2: - return ix86_expand_binop_builtin (CODE_FOR_mmx_rcpit2v2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_rcpit2v2sf3, exp, target); case IX86_BUILTIN_PFRSQIT1: - return ix86_expand_binop_builtin (CODE_FOR_mmx_rsqit1v2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_rsqit1v2sf3, exp, target); case IX86_BUILTIN_PFRSQRT: - return ix86_expand_unop_builtin (CODE_FOR_mmx_rsqrtv2sf2, arglist, target, 0); + return ix86_expand_unop_builtin (CODE_FOR_mmx_rsqrtv2sf2, exp, target, 0); case IX86_BUILTIN_PFSUB: - return ix86_expand_binop_builtin (CODE_FOR_mmx_subv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_subv2sf3, exp, target); case IX86_BUILTIN_PFSUBR: - return ix86_expand_binop_builtin (CODE_FOR_mmx_subrv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_subrv2sf3, exp, target); case IX86_BUILTIN_PI2FD: - return ix86_expand_unop_builtin (CODE_FOR_mmx_floatv2si2, arglist, target, 0); + return 
ix86_expand_unop_builtin (CODE_FOR_mmx_floatv2si2, exp, target, 0); case IX86_BUILTIN_PMULHRW: - return ix86_expand_binop_builtin (CODE_FOR_mmx_pmulhrwv4hi3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_pmulhrwv4hi3, exp, target); case IX86_BUILTIN_PF2IW: - return ix86_expand_unop_builtin (CODE_FOR_mmx_pf2iw, arglist, target, 0); + return ix86_expand_unop_builtin (CODE_FOR_mmx_pf2iw, exp, target, 0); case IX86_BUILTIN_PFNACC: - return ix86_expand_binop_builtin (CODE_FOR_mmx_hsubv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_hsubv2sf3, exp, target); case IX86_BUILTIN_PFPNACC: - return ix86_expand_binop_builtin (CODE_FOR_mmx_addsubv2sf3, arglist, target); + return ix86_expand_binop_builtin (CODE_FOR_mmx_addsubv2sf3, exp, target); case IX86_BUILTIN_PI2FW: - return ix86_expand_unop_builtin (CODE_FOR_mmx_pi2fw, arglist, target, 0); + return ix86_expand_unop_builtin (CODE_FOR_mmx_pi2fw, exp, target, 0); case IX86_BUILTIN_PSWAPDSI: - return ix86_expand_unop_builtin (CODE_FOR_mmx_pswapdv2si2, arglist, target, 0); + return ix86_expand_unop_builtin (CODE_FOR_mmx_pswapdv2si2, exp, target, 0); case IX86_BUILTIN_PSWAPDSF: - return ix86_expand_unop_builtin (CODE_FOR_mmx_pswapdv2sf2, arglist, target, 0); + return ix86_expand_unop_builtin (CODE_FOR_mmx_pswapdv2sf2, exp, target, 0); case IX86_BUILTIN_SQRTSD: - return ix86_expand_unop1_builtin (CODE_FOR_sse2_vmsqrtv2df2, arglist, target); + return ix86_expand_unop1_builtin (CODE_FOR_sse2_vmsqrtv2df2, exp, target); case IX86_BUILTIN_LOADUPD: - return ix86_expand_unop_builtin (CODE_FOR_sse2_movupd, arglist, target, 1); + return ix86_expand_unop_builtin (CODE_FOR_sse2_movupd, exp, target, 1); case IX86_BUILTIN_STOREUPD: - return ix86_expand_store_builtin (CODE_FOR_sse2_movupd, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse2_movupd, exp); case IX86_BUILTIN_MFENCE: emit_insn (gen_sse2_mfence ()); @@ -16445,7 +17968,7 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, return 0; case IX86_BUILTIN_CLFLUSH: - arg0 = TREE_VALUE (arglist); + arg0 = CALL_EXPR_ARG (exp, 0); op0 = expand_normal (arg0); icode = CODE_FOR_sse2_clflush; if (! 
(*insn_data[icode].operand[0].predicate) (op0, Pmode)) @@ -16455,21 +17978,21 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, return 0; case IX86_BUILTIN_MOVNTPD: - return ix86_expand_store_builtin (CODE_FOR_sse2_movntv2df, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse2_movntv2df, exp); case IX86_BUILTIN_MOVNTDQ: - return ix86_expand_store_builtin (CODE_FOR_sse2_movntv2di, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse2_movntv2di, exp); case IX86_BUILTIN_MOVNTI: - return ix86_expand_store_builtin (CODE_FOR_sse2_movntsi, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse2_movntsi, exp); case IX86_BUILTIN_LOADDQU: - return ix86_expand_unop_builtin (CODE_FOR_sse2_movdqu, arglist, target, 1); + return ix86_expand_unop_builtin (CODE_FOR_sse2_movdqu, exp, target, 1); case IX86_BUILTIN_STOREDQU: - return ix86_expand_store_builtin (CODE_FOR_sse2_movdqu, arglist); + return ix86_expand_store_builtin (CODE_FOR_sse2_movdqu, exp); case IX86_BUILTIN_MONITOR: - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); - arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist))); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); op0 = expand_normal (arg0); op1 = expand_normal (arg1); op2 = expand_normal (arg2); @@ -16486,8 +18009,8 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, return 0; case IX86_BUILTIN_MWAIT: - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); op0 = expand_normal (arg0); op1 = expand_normal (arg1); if (!REG_P (op0)) @@ -16498,7 +18021,7 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, return 0; case IX86_BUILTIN_LDDQU: - return ix86_expand_unop_builtin (CODE_FOR_sse3_lddqu, arglist, + return ix86_expand_unop_builtin (CODE_FOR_sse3_lddqu, exp, target, 1); case IX86_BUILTIN_PALIGNR: @@ -16513,9 +18036,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, icode = CODE_FOR_ssse3_palignrti; mode = V2DImode; } - arg0 = TREE_VALUE (arglist); - arg1 = TREE_VALUE (TREE_CHAIN (arglist)); - arg2 = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (arglist))); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); op0 = expand_expr (arg0, NULL_RTX, VOIDmode, 0); op1 = expand_expr (arg1, NULL_RTX, VOIDmode, 0); op2 = expand_expr (arg2, NULL_RTX, VOIDmode, 0); @@ -16547,10 +18070,118 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, emit_insn (pat); return target; + case IX86_BUILTIN_MOVNTSD: + return ix86_expand_store_builtin (CODE_FOR_sse4a_vmmovntv2df, exp); + + case IX86_BUILTIN_MOVNTSS: + return ix86_expand_store_builtin (CODE_FOR_sse4a_vmmovntv4sf, exp); + + case IX86_BUILTIN_INSERTQ: + case IX86_BUILTIN_EXTRQ: + icode = (fcode == IX86_BUILTIN_EXTRQ + ? CODE_FOR_sse4a_extrq + : CODE_FOR_sse4a_insertq); + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + tmode = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[2].mode; + if (! (*insn_data[icode].operand[1].predicate) (op0, mode1)) + op0 = copy_to_mode_reg (mode1, op0); + if (! (*insn_data[icode].operand[2].predicate) (op1, mode2)) + op1 = copy_to_mode_reg (mode2, op1); + if (optimize || target == 0 + || GET_MODE (target) != tmode + || ! 
(*insn_data[icode].operand[0].predicate) (target, tmode)) + target = gen_reg_rtx (tmode); + pat = GEN_FCN (icode) (target, op0, op1); + if (! pat) + return NULL_RTX; + emit_insn (pat); + return target; + + case IX86_BUILTIN_EXTRQI: + icode = CODE_FOR_sse4a_extrqi; + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + tmode = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[2].mode; + mode3 = insn_data[icode].operand[3].mode; + if (! (*insn_data[icode].operand[1].predicate) (op0, mode1)) + op0 = copy_to_mode_reg (mode1, op0); + if (! (*insn_data[icode].operand[2].predicate) (op1, mode2)) + { + error ("index mask must be an immediate"); + return gen_reg_rtx (tmode); + } + if (! (*insn_data[icode].operand[3].predicate) (op2, mode3)) + { + error ("length mask must be an immediate"); + return gen_reg_rtx (tmode); + } + if (optimize || target == 0 + || GET_MODE (target) != tmode + || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) + target = gen_reg_rtx (tmode); + pat = GEN_FCN (icode) (target, op0, op1, op2); + if (! pat) + return NULL_RTX; + emit_insn (pat); + return target; + + case IX86_BUILTIN_INSERTQI: + icode = CODE_FOR_sse4a_insertqi; + arg0 = CALL_EXPR_ARG (exp, 0); + arg1 = CALL_EXPR_ARG (exp, 1); + arg2 = CALL_EXPR_ARG (exp, 2); + arg3 = CALL_EXPR_ARG (exp, 3); + op0 = expand_normal (arg0); + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + op3 = expand_normal (arg3); + tmode = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[2].mode; + mode3 = insn_data[icode].operand[3].mode; + mode4 = insn_data[icode].operand[4].mode; + + if (! (*insn_data[icode].operand[1].predicate) (op0, mode1)) + op0 = copy_to_mode_reg (mode1, op0); + + if (! (*insn_data[icode].operand[2].predicate) (op1, mode2)) + op1 = copy_to_mode_reg (mode2, op1); + + if (! (*insn_data[icode].operand[3].predicate) (op2, mode3)) + { + error ("index mask must be an immediate"); + return gen_reg_rtx (tmode); + } + if (! (*insn_data[icode].operand[4].predicate) (op3, mode4)) + { + error ("length mask must be an immediate"); + return gen_reg_rtx (tmode); + } + if (optimize || target == 0 + || GET_MODE (target) != tmode + || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) + target = gen_reg_rtx (tmode); + pat = GEN_FCN (icode) (target, op0, op1, op2, op3); + if (! 
pat) + return NULL_RTX; + emit_insn (pat); + return target; + case IX86_BUILTIN_VEC_INIT_V2SI: case IX86_BUILTIN_VEC_INIT_V4HI: case IX86_BUILTIN_VEC_INIT_V8QI: - return ix86_expand_vec_init_builtin (TREE_TYPE (exp), arglist, target); + return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target); case IX86_BUILTIN_VEC_EXT_V2DF: case IX86_BUILTIN_VEC_EXT_V2DI: @@ -16559,11 +18190,11 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, case IX86_BUILTIN_VEC_EXT_V8HI: case IX86_BUILTIN_VEC_EXT_V2SI: case IX86_BUILTIN_VEC_EXT_V4HI: - return ix86_expand_vec_ext_builtin (arglist, target); + return ix86_expand_vec_ext_builtin (exp, target); case IX86_BUILTIN_VEC_SET_V8HI: case IX86_BUILTIN_VEC_SET_V4HI: - return ix86_expand_vec_set_builtin (arglist); + return ix86_expand_vec_set_builtin (exp); default: break; @@ -16577,22 +18208,103 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, || d->icode == CODE_FOR_sse_vmmaskcmpv4sf3 || d->icode == CODE_FOR_sse2_maskcmpv2df3 || d->icode == CODE_FOR_sse2_vmmaskcmpv2df3) - return ix86_expand_sse_compare (d, arglist, target); + return ix86_expand_sse_compare (d, exp, target); - return ix86_expand_binop_builtin (d->icode, arglist, target); + return ix86_expand_binop_builtin (d->icode, exp, target); } for (i = 0, d = bdesc_1arg; i < ARRAY_SIZE (bdesc_1arg); i++, d++) if (d->code == fcode) - return ix86_expand_unop_builtin (d->icode, arglist, target, 0); + return ix86_expand_unop_builtin (d->icode, exp, target, 0); for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++) if (d->code == fcode) - return ix86_expand_sse_comi (d, arglist, target); + return ix86_expand_sse_comi (d, exp, target); gcc_unreachable (); } +/* Returns a function decl for a vectorized version of the builtin function + with builtin function code FN and the result vector type TYPE, or NULL_TREE + if it is not available. */ + +static tree +ix86_builtin_vectorized_function (enum built_in_function fn, tree type_out, + tree type_in) +{ + enum machine_mode in_mode, out_mode; + int in_n, out_n; + + if (TREE_CODE (type_out) != VECTOR_TYPE + || TREE_CODE (type_in) != VECTOR_TYPE) + return NULL_TREE; + + out_mode = TYPE_MODE (TREE_TYPE (type_out)); + out_n = TYPE_VECTOR_SUBPARTS (type_out); + in_mode = TYPE_MODE (TREE_TYPE (type_in)); + in_n = TYPE_VECTOR_SUBPARTS (type_in); + + switch (fn) + { + case BUILT_IN_SQRT: + if (out_mode == DFmode && out_n == 2 + && in_mode == DFmode && in_n == 2) + return ix86_builtins[IX86_BUILTIN_SQRTPD]; + return NULL_TREE; + + case BUILT_IN_SQRTF: + if (out_mode == SFmode && out_n == 4 + && in_mode == SFmode && in_n == 4) + return ix86_builtins[IX86_BUILTIN_SQRTPS]; + return NULL_TREE; + + case BUILT_IN_LRINTF: + if (out_mode == SImode && out_n == 4 + && in_mode == SFmode && in_n == 4) + return ix86_builtins[IX86_BUILTIN_CVTPS2DQ]; + return NULL_TREE; + + default: + ; + } + + return NULL_TREE; +} + +/* Returns a decl of a function that implements conversion of the + input vector of type TYPE, or NULL_TREE if it is not available. 
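With ix86_builtin_vectorized_function above handing the recorded ix86_builtins[] decls to the tree vectorizer, a scalar sqrt loop like the following can be turned into packed sqrtpd operations (illustrative only; roughly -O2 -msse2 -ftree-vectorize -fno-math-errno is assumed):

    #include <math.h>

    // Each pair of iterations can become one sqrtpd through the
    // IX86_BUILTIN_SQRTPD decl returned by the new hook.
    void
    vec_sqrt (double *out, const double *in, int n)
    {
      int i;
      for (i = 0; i < n; i++)
        out[i] = sqrt (in[i]);
    }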
*/ + +static tree +ix86_builtin_conversion (enum tree_code code, tree type) +{ + if (TREE_CODE (type) != VECTOR_TYPE) + return NULL_TREE; + + switch (code) + { + case FLOAT_EXPR: + switch (TYPE_MODE (type)) + { + case V4SImode: + return ix86_builtins[IX86_BUILTIN_CVTDQ2PS]; + default: + return NULL_TREE; + } + + case FIX_TRUNC_EXPR: + switch (TYPE_MODE (type)) + { + case V4SFmode: + return ix86_builtins[IX86_BUILTIN_CVTTPS2DQ]; + default: + return NULL_TREE; + } + default: + return NULL_TREE; + + } +} + /* Store OPERAND to the memory after reload is completed. This means that we can't easily use assign_stack_local. */ rtx @@ -16838,18 +18550,12 @@ ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2, /* If the target says that inter-unit moves are more expensive than moving through memory, then don't generate them. */ - if (!TARGET_INTER_UNIT_MOVES && !optimize_size) + if (!TARGET_INTER_UNIT_MOVES) return true; /* Between SSE and general, we have moves no larger than word size. */ if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) return true; - - /* ??? For the cost of one register reformat penalty, we could use - the same instructions to move SFmode and DFmode data, but the - relevant move patterns don't support those alternatives. */ - if (mode == SFmode || mode == DFmode) - return true; } return false; @@ -17222,7 +18928,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) return false; case ASHIFT: - if (GET_CODE (XEXP (x, 1)) == CONST_INT + if (CONST_INT_P (XEXP (x, 1)) && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT)) { HOST_WIDE_INT value = INTVAL (XEXP (x, 1)); @@ -17246,7 +18952,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) case ROTATERT: if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode) { - if (GET_CODE (XEXP (x, 1)) == CONST_INT) + if (CONST_INT_P (XEXP (x, 1))) { if (INTVAL (XEXP (x, 1)) > 32) *total = ix86_cost->shift_const + COSTS_N_INSNS (2); @@ -17263,7 +18969,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) } else { - if (GET_CODE (XEXP (x, 1)) == CONST_INT) + if (CONST_INT_P (XEXP (x, 1))) *total = ix86_cost->shift_const; else *total = ix86_cost->shift_var; @@ -17281,7 +18987,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) rtx op0 = XEXP (x, 0); rtx op1 = XEXP (x, 1); int nbits; - if (GET_CODE (XEXP (x, 1)) == CONST_INT) + if (CONST_INT_P (XEXP (x, 1))) { unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1)); for (nbits = 0; value != 0; value &= value - 1) @@ -17301,7 +19007,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) if (GET_CODE (op0) == GET_CODE (op1)) is_mulwiden = 1, op1 = XEXP (op1, 0); - else if (GET_CODE (op1) == CONST_INT) + else if (CONST_INT_P (op1)) { if (GET_CODE (op0) == SIGN_EXTEND) is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode) @@ -17339,7 +19045,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) { if (GET_CODE (XEXP (x, 0)) == PLUS && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT - && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == CONST_INT + && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1)) && CONSTANT_P (XEXP (x, 1))) { HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1)); @@ -17354,7 +19060,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) } } else if (GET_CODE (XEXP (x, 0)) == MULT - && GET_CODE (XEXP (XEXP (x, 0), 1)) == CONST_INT) + && CONST_INT_P (XEXP (XEXP (x, 0), 1))) { HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1)); if (val == 2 || val == 4 || val == 8) @@ -17416,7 +19122,7 @@ 
ix86_rtx_costs (rtx x, int code, int outer_code, int *total) case COMPARE: if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT && XEXP (XEXP (x, 0), 1) == const1_rtx - && GET_CODE (XEXP (XEXP (x, 0), 2)) == CONST_INT + && CONST_INT_P (XEXP (XEXP (x, 0), 2)) && XEXP (x, 1) == const0_rtx) { /* This kind of construct is implemented using test[bwl]. @@ -17432,22 +19138,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code, int *total) if (!TARGET_SSE_MATH || mode == XFmode || (mode == DFmode && !TARGET_SSE2)) - /* For standard 80387 constants, raise the cost to prevent - compress_float_constant() to generate load from memory. */ - switch (standard_80387_constant_p (XEXP (x, 0))) - { - case -1: - case 0: - *total = 0; - break; - case 1: /* 0.0 */ - *total = 1; - break; - default: - *total = (x86_ext_80387_constants & TUNEMASK - || optimize_size - ? 1 : 0); - } + *total = 0; return false; case ABS: @@ -17737,7 +19428,7 @@ x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED, { if (!x86_64_general_operand (xops[0], DImode)) { - tmp = gen_rtx_REG (DImode, FIRST_REX_INT_REG + 2 /* R10 */); + tmp = gen_rtx_REG (DImode, R10_REG); xops[1] = tmp; output_asm_insn ("mov{q}\t{%1, %0|%0, %1}", xops); xops[0] = tmp; @@ -17753,7 +19444,7 @@ x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED, if (vcall_offset) { if (TARGET_64BIT) - tmp = gen_rtx_REG (DImode, FIRST_REX_INT_REG + 2 /* R10 */); + tmp = gen_rtx_REG (DImode, R10_REG); else { int tmp_regno = 2 /* ECX */; @@ -17774,7 +19465,7 @@ x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED, xops[0] = gen_rtx_MEM (Pmode, plus_constant (tmp, vcall_offset)); if (TARGET_64BIT && !memory_operand (xops[0], Pmode)) { - rtx tmp2 = gen_rtx_REG (DImode, FIRST_REX_INT_REG + 3 /* R11 */); + rtx tmp2 = gen_rtx_REG (DImode, R11_REG); xops[0] = GEN_INT (vcall_offset); xops[1] = tmp2; output_asm_insn ("mov{q}\t{%0, %1|%1, %0}", xops); @@ -17925,14 +19616,14 @@ min_insn_size (rtx insn) if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN) return 0; - if (GET_CODE (insn) == JUMP_INSN + if (JUMP_P (insn) && (GET_CODE (PATTERN (insn)) == ADDR_VEC || GET_CODE (PATTERN (insn)) == ADDR_DIFF_VEC)) return 0; /* Important case - calls are always 5 bytes. It is common to have many calls in the row. */ - if (GET_CODE (insn) == CALL_INSN + if (CALL_P (insn) && symbolic_reference_mentioned_p (PATTERN (insn)) && !SIBLING_CALL_P (insn)) return 5; @@ -17942,7 +19633,7 @@ min_insn_size (rtx insn) /* For normal instructions we may rely on the sizes of addresses and the presence of symbol to require 4 bytes of encoding. This is not the case for jumps where references are PC relative. 
*/ - if (GET_CODE (insn) != JUMP_INSN) + if (!JUMP_P (insn)) { l = get_attr_length_address (insn); if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn))) @@ -17981,10 +19672,10 @@ ix86_avoid_jump_misspredicts (void) if (dump_file) fprintf(dump_file, "Insn %i estimated to %i bytes\n", INSN_UID (insn), min_insn_size (insn)); - if ((GET_CODE (insn) == JUMP_INSN + if ((JUMP_P (insn) && GET_CODE (PATTERN (insn)) != ADDR_VEC && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC) - || GET_CODE (insn) == CALL_INSN) + || CALL_P (insn)) njumps++; else continue; @@ -17992,10 +19683,10 @@ ix86_avoid_jump_misspredicts (void) while (njumps > 3) { start = NEXT_INSN (start); - if ((GET_CODE (start) == JUMP_INSN + if ((JUMP_P (start) && GET_CODE (PATTERN (start)) != ADDR_VEC && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC) - || GET_CODE (start) == CALL_INSN) + || CALL_P (start)) njumps--, isjump = 1; else isjump = 0; @@ -18035,13 +19726,13 @@ ix86_pad_returns (void) rtx prev; bool replace = false; - if (GET_CODE (ret) != JUMP_INSN || GET_CODE (PATTERN (ret)) != RETURN + if (!JUMP_P (ret) || GET_CODE (PATTERN (ret)) != RETURN || !maybe_hot_bb_p (bb)) continue; for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev)) - if (active_insn_p (prev) || GET_CODE (prev) == CODE_LABEL) + if (active_insn_p (prev) || LABEL_P (prev)) break; - if (prev && GET_CODE (prev) == CODE_LABEL) + if (prev && LABEL_P (prev)) { edge e; edge_iterator ei; @@ -18055,8 +19746,8 @@ ix86_pad_returns (void) { prev = prev_active_insn (ret); if (prev - && ((GET_CODE (prev) == JUMP_INSN && any_condjump_p (prev)) - || GET_CODE (prev) == CALL_INSN)) + && ((JUMP_P (prev) && any_condjump_p (prev)) + || CALL_P (prev))) replace = true; /* Empty functions get branch mispredict even when the jump destination is not visible to us. 
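ix86_avoid_jump_misspredicts above walks the insn stream with min_insn_size estimates and, through the njumps > 3 bookkeeping, keeps a fourth jump or call from landing in the same small aligned window as three earlier ones. A minimal standalone sketch of that sliding-window idea follows; the insn_info type, the helper name, and the 16-byte window size are assumptions for illustration, while the real pass works on RTL insns and emits alignment directly.

#include <stddef.h>

struct insn_info { int size; int is_branch; };	/* estimated byte size */

/* Count the places where padding would be needed so that no window of the
   assumed size ever holds more than three branch instructions.  */
static size_t
count_padding_points (const struct insn_info *insn, size_t n)
{
  size_t start = 0, i, npads = 0;
  int nbytes = 0, nbranches = 0;

  for (i = 0; i < n; i++)
    {
      nbytes += insn[i].size;
      nbranches += insn[i].is_branch;

      /* Drop instructions from the left until the window fits in 16 bytes.  */
      while (nbytes > 16 && start < i)
        {
          nbytes -= insn[start].size;
          nbranches -= insn[start].is_branch;
          start++;
        }

      /* A fourth branch inside the window: padding would go here, which
         starts a fresh window at the current instruction.  */
      if (insn[i].is_branch && nbranches > 3)
        {
          npads++;
          start = i;
          nbytes = insn[i].size;
          nbranches = 1;
        }
    }
  return npads;
}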
*/ @@ -18133,21 +19824,25 @@ x86_emit_floatuns (rtx operands[2]) mode = GET_MODE (out); neglab = gen_label_rtx (); donelab = gen_label_rtx (); - i1 = gen_reg_rtx (Pmode); f0 = gen_reg_rtx (mode); - emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, Pmode, 0, neglab); + emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab); + + expand_float (out, in, 0); - emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_FLOAT (mode, in))); emit_jump_insn (gen_jump (donelab)); emit_barrier (); emit_label (neglab); - i0 = expand_simple_binop (Pmode, LSHIFTRT, in, const1_rtx, NULL, 1, OPTAB_DIRECT); - i1 = expand_simple_binop (Pmode, AND, in, const1_rtx, NULL, 1, OPTAB_DIRECT); - i0 = expand_simple_binop (Pmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT); + i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL, + 1, OPTAB_DIRECT); + i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL, + 1, OPTAB_DIRECT); + i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT); + expand_float (f0, i0, 0); + emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0))); emit_label (donelab); @@ -19009,8 +20704,6 @@ ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED, clobbers); clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"), clobbers); - clobbers = tree_cons (NULL_TREE, build_string (7, "dirflag"), - clobbers); return clobbers; } @@ -19136,14 +20829,14 @@ void ix86_emit_i387_log1p (rtx op0, rtx op1) emit_jump_insn (gen_bge (label1)); emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */ - emit_insn (gen_fyl2xp1_xf3 (op0, tmp2, op1)); + emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2)); emit_jump (label2); emit_label (label1); emit_move_insn (tmp, CONST1_RTX (XFmode)); emit_insn (gen_addxf3 (tmp, op1, tmp)); emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */ - emit_insn (gen_fyl2x_xf3 (op0, tmp2, tmp)); + emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2)); emit_label (label2); } @@ -19376,7 +21069,7 @@ ix86_expand_lround (rtx op0, rtx op1) ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX); /* adj = op1 + adj */ - expand_simple_binop (mode, PLUS, adj, op1, adj, 0, OPTAB_DIRECT); + adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT); /* op0 = (imode)adj */ expand_fix (op0, adj, 0); @@ -19394,7 +21087,7 @@ ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor) */ enum machine_mode fmode = GET_MODE (op1); enum machine_mode imode = GET_MODE (op0); - rtx ireg, freg, label; + rtx ireg, freg, label, tmp; /* reg = (long)op1 */ ireg = gen_reg_rtx (imode); @@ -19407,8 +21100,10 @@ ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor) /* ireg = (freg > op1) ? ireg - 1 : ireg */ label = ix86_expand_sse_compare_and_jump (UNLE, freg, op1, !do_floor); - expand_simple_binop (imode, do_floor ? MINUS : PLUS, - ireg, const1_rtx, ireg, 0, OPTAB_DIRECT); + tmp = expand_simple_binop (imode, do_floor ? 
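x86_emit_floatuns above uses the classic unsigned-to-float trick: values with the sign bit clear convert directly, everything else is halved with the shifted-out low bit folded back in, converted as a signed number, and then doubled. A scalar model of the same idea, as a hypothetical helper that is not part of the patch:

#include <stdint.h>

static double
u64_to_double (uint64_t x)
{
  if ((int64_t) x >= 0)
    return (double) (int64_t) x;	/* sign bit clear: direct conversion */

  /* Halve the value, keeping the lost low bit as a sticky bit so the final
     rounding is unaffected, convert as signed, then double the result.  */
  uint64_t half = (x >> 1) | (x & 1);
  double f = (double) (int64_t) half;
  return f + f;
}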
MINUS : PLUS, + ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT); + emit_move_insn (ireg, tmp); + emit_label (label); LABEL_NUSES (label) = 1; @@ -19421,10 +21116,11 @@ void ix86_expand_rint (rtx operand0, rtx operand1) { /* C code for the stuff we're doing below: - if (!isless (fabs (operand1), 2**52)) + xa = fabs (operand1); + if (!isless (xa, 2**52)) return operand1; - tmp = copysign (2**52, operand1); - return operand1 + tmp - tmp; + xa = xa + 2**52 - 2**52; + return copysign (xa, operand1); */ enum machine_mode mode = GET_MODE (operand0); rtx res, xa, label, TWO52, mask; @@ -19439,10 +21135,10 @@ ix86_expand_rint (rtx operand0, rtx operand1) TWO52 = ix86_gen_TWO52 (mode); label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); - ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask); + xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); + xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT); - expand_simple_binop (mode, PLUS, res, TWO52, res, 0, OPTAB_DIRECT); - expand_simple_binop (mode, MINUS, res, TWO52, res, 0, OPTAB_DIRECT); + ix86_sse_copysign_to_positive (res, xa, res, mask); emit_label (label); LABEL_NUSES (label) = 1; @@ -19466,7 +21162,7 @@ ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor) x2 -= 1; Compensate. Ceil: if (x2 < x) - x2 += 1; + x2 -= -1; return x2; */ enum machine_mode mode = GET_MODE (operand0); @@ -19486,21 +21182,25 @@ ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor) label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); /* xa = xa + TWO52 - TWO52; */ - expand_simple_binop (mode, PLUS, xa, TWO52, xa, 0, OPTAB_DIRECT); - expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT); + xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); + xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT); /* xa = copysign (xa, operand1) */ ix86_sse_copysign_to_positive (xa, xa, res, mask); - /* generate 1.0 */ - one = force_reg (mode, const_double_from_real_value (dconst1, mode)); + /* generate 1.0 or -1.0 */ + one = force_reg (mode, + const_double_from_real_value (do_floor + ? dconst1 : dconstm1, mode)); /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */ tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor); emit_insn (gen_rtx_SET (VOIDmode, tmp, gen_rtx_AND (mode, one, tmp))); - expand_simple_binop (mode, do_floor ? MINUS : PLUS, - xa, tmp, res, 0, OPTAB_DIRECT); + /* We always need to subtract here to preserve signed zero. */ + tmp = expand_simple_binop (mode, MINUS, + xa, tmp, NULL_RTX, 0, OPTAB_DIRECT); + emit_move_insn (res, tmp); emit_label (label); LABEL_NUSES (label) = 1; @@ -19524,10 +21224,12 @@ ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor) Compensate. 
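Both ix86_expand_rint and ix86_expand_floorceildf_32 above lean on the 2**52 trick spelled out in their comments: adding and then subtracting 2**52 forces any nonnegative double below 2**52 to an integer in the current rounding mode, and copysign restores the original sign, including -0.0. A scalar sketch under those assumptions (compile without -ffast-math; the volatile keeps the add/subtract pair from being folded away):

#include <math.h>

static double
rint_via_two52 (double x)
{
  const double two52 = 4503599627370496.0;	/* 2**52 */
  double xa = fabs (x);

  if (!(xa < two52))		/* already integral, or NaN/inf */
    return x;

  volatile double t = xa + two52;
  xa = t - two52;		/* rounded in the current rounding mode */
  return copysign (xa, x);	/* restore sign, preserving -0.0 */
}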
Ceil: if (x2 < x) x2 += 1; + if (HONOR_SIGNED_ZEROS (mode)) + return copysign (x2, x); return x2; */ enum machine_mode mode = GET_MODE (operand0); - rtx xa, xi, TWO52, tmp, label, one, res; + rtx xa, xi, TWO52, tmp, label, one, res, mask; TWO52 = ix86_gen_TWO52 (mode); @@ -19537,7 +21239,7 @@ ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor) emit_move_insn (res, operand1); /* xa = abs (operand1) */ - xa = ix86_expand_sse_fabs (res, NULL); + xa = ix86_expand_sse_fabs (res, &mask); /* if (!isless (xa, TWO52)) goto label; */ label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); @@ -19554,8 +21256,12 @@ ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor) tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor); emit_insn (gen_rtx_SET (VOIDmode, tmp, gen_rtx_AND (mode, one, tmp))); - expand_simple_binop (mode, do_floor ? MINUS : PLUS, - xa, tmp, res, 0, OPTAB_DIRECT); + tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS, + xa, tmp, NULL_RTX, 0, OPTAB_DIRECT); + emit_move_insn (res, tmp); + + if (HONOR_SIGNED_ZEROS (mode)) + ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask); emit_label (label); LABEL_NUSES (label) = 1; @@ -19602,20 +21308,17 @@ ix86_expand_rounddf_32 (rtx operand0, rtx operand1) label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); /* xa2 = xa + TWO52 - TWO52; */ - xa2 = gen_reg_rtx (mode); - expand_simple_binop (mode, PLUS, xa, TWO52, xa2, 0, OPTAB_DIRECT); - expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT); + xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); + xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT); /* dxa = xa2 - xa; */ - dxa = gen_reg_rtx (mode); - expand_simple_binop (mode, MINUS, xa2, xa, dxa, 0, OPTAB_DIRECT); + dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT); /* generate 0.5, 1.0 and -0.5 */ half = force_reg (mode, const_double_from_real_value (dconsthalf, mode)); - one = gen_reg_rtx (mode); - expand_simple_binop (mode, PLUS, half, half, one, 0, OPTAB_DIRECT); - mhalf = gen_reg_rtx (mode); - expand_simple_binop (mode, MINUS, half, one, mhalf, 0, OPTAB_DIRECT); + one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT); + mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX, + 0, OPTAB_DIRECT); /* Compensate. */ tmp = gen_reg_rtx (mode); @@ -19623,12 +21326,12 @@ ix86_expand_rounddf_32 (rtx operand0, rtx operand1) tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false); emit_insn (gen_rtx_SET (VOIDmode, tmp, gen_rtx_AND (mode, one, tmp))); - expand_simple_binop (mode, MINUS, xa2, tmp, xa2, 0, OPTAB_DIRECT); + xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT); /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */ tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false); emit_insn (gen_rtx_SET (VOIDmode, tmp, gen_rtx_AND (mode, one, tmp))); - expand_simple_binop (mode, PLUS, xa2, tmp, xa2, 0, OPTAB_DIRECT); + xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT); /* res = copysign (xa2, operand1) */ ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask); @@ -19639,6 +21342,108 @@ ix86_expand_rounddf_32 (rtx operand0, rtx operand1) emit_move_insn (operand0, res); } +/* Expand SSE sequence for computing trunc from OPERAND1 storing + into OPERAND0. */ +void +ix86_expand_trunc (rtx operand0, rtx operand1) +{ + /* C code for SSE variant we expand below. 
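ix86_expand_rounddf_32 above builds round-to-nearest-away-from-zero out of the same 2**52 trick plus two compare-and-adjust steps, so the hardware rounding mode never has to change. A scalar model of that compensation, as a hypothetical helper that assumes the default round-to-nearest-even mode and no -ffast-math:

#include <math.h>

static double
round_away_from_zero (double x)
{
  const double two52 = 4503599627370496.0;	/* 2**52 */
  double xa = fabs (x);

  if (!(xa < two52))		/* large values are already integral */
    return x;

  volatile double t = xa + two52;
  double xa2 = t - two52;	/* nearest-even rounding of xa */
  double dxa = xa2 - xa;	/* stays within [-0.5, 0.5] */

  if (dxa > 0.5)		/* mirrors the expander's UNGT check; cannot
				   trigger under round-to-nearest-even */
    xa2 -= 1.0;
  if (dxa <= -0.5)		/* halfway case was rounded down: push it
				   away from zero */
    xa2 += 1.0;

  return copysign (xa2, x);	/* restore sign, preserving -0.0 */
}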
+ double xa = fabs (x), x2; + if (!isless (xa, TWO52)) + return x; + x2 = (double)(long)x; + if (HONOR_SIGNED_ZEROS (mode)) + return copysign (x2, x); + return x2; + */ + enum machine_mode mode = GET_MODE (operand0); + rtx xa, xi, TWO52, label, res, mask; + + TWO52 = ix86_gen_TWO52 (mode); + + /* Temporary for holding the result, initialized to the input + operand to ease control flow. */ + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + /* xa = abs (operand1) */ + xa = ix86_expand_sse_fabs (res, &mask); + + /* if (!isless (xa, TWO52)) goto label; */ + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + /* x = (double)(long)x */ + xi = gen_reg_rtx (mode == DFmode ? DImode : SImode); + expand_fix (xi, res, 0); + expand_float (res, xi, 0); + + if (HONOR_SIGNED_ZEROS (mode)) + ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + +/* Expand SSE sequence for computing trunc from OPERAND1 storing + into OPERAND0. */ +void +ix86_expand_truncdf_32 (rtx operand0, rtx operand1) +{ + enum machine_mode mode = GET_MODE (operand0); + rtx xa, mask, TWO52, label, one, res, smask, tmp; + + /* C code for SSE variant we expand below. + double xa = fabs (x), x2; + if (!isless (xa, TWO52)) + return x; + xa2 = xa + TWO52 - TWO52; + Compensate: + if (xa2 > xa) + xa2 -= 1.0; + x2 = copysign (xa2, x); + return x2; + */ + + TWO52 = ix86_gen_TWO52 (mode); + + /* Temporary for holding the result, initialized to the input + operand to ease control flow. */ + res = gen_reg_rtx (mode); + emit_move_insn (res, operand1); + + /* xa = abs (operand1) */ + xa = ix86_expand_sse_fabs (res, &smask); + + /* if (!isless (xa, TWO52)) goto label; */ + label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false); + + /* res = xa + TWO52 - TWO52; */ + tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT); + tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT); + emit_move_insn (res, tmp); + + /* generate 1.0 */ + one = force_reg (mode, const_double_from_real_value (dconst1, mode)); + + /* Compensate: res = xa2 - (res > xa ? 1 : 0) */ + mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false); + emit_insn (gen_rtx_SET (VOIDmode, mask, + gen_rtx_AND (mode, mask, one))); + tmp = expand_simple_binop (mode, MINUS, + res, mask, NULL_RTX, 0, OPTAB_DIRECT); + emit_move_insn (res, tmp); + + /* res = copysign (res, operand1) */ + ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask); + + emit_label (label); + LABEL_NUSES (label) = 1; + + emit_move_insn (operand0, res); +} + /* Expand SSE sequence for computing round from OPERAND1 storing into OPERAND0. */ void @@ -19672,7 +21477,7 @@ ix86_expand_round (rtx operand0, rtx operand1) /* xa = xa + 0.5 */ half = force_reg (mode, const_double_from_real_value (pred_half, mode)); - expand_simple_binop (mode, PLUS, xa, half, xa, 0, OPTAB_DIRECT); + xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT); /* xa = (double)(int64_t)xa */ xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
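ix86_expand_trunc above relies on the fact that any double with magnitude below 2**52 fits in a 64-bit integer, so a cast to int64 and back truncates toward zero, and the copysign step only matters for preserving -0.0. A scalar model, as a hypothetical helper that is not part of the patch:

#include <math.h>
#include <stdint.h>

static double
trunc_via_cast (double x)
{
  const double two52 = 4503599627370496.0;	/* 2**52 */
  double xa = fabs (x);

  if (!(xa < two52))		/* already integral, or NaN/inf */
    return x;

  double x2 = (double) (int64_t) x;	/* cvttsd2si followed by cvtsi2sd */
  return copysign (x2, x);		/* keep the sign of -0.something */
}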