X-Git-Url: http://git.sourceforge.jp/view?a=blobdiff_plain;f=gcc%2Fconfig%2Fi386%2Fi386.c;h=8ddfa9f183b28c5df331704dcc57917c48c5bad4;hb=89ba852b50987cc54e46c5bd591582ffbf8f8cc8;hp=0d46436c23957c8e8748115645dc33112e7ca064;hpb=5c9051a4b06d9fdc9370293f2449dda052a7828d;p=pf3gnuchains%2Fgcc-fork.git diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 0d46436c239..8ddfa9f183b 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -1,6 +1,7 @@ /* Subroutines used for code generation on IA-32. Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, - 2002, 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc. + 2002, 2003, 2004, 2005, 2006, 2007, 2008 + Free Software Foundation, Inc. This file is part of GCC. @@ -700,7 +701,7 @@ struct processor_costs k8_cost = { to limit number of prefetches at all, as their execution also takes some time). */ 100, /* number of parallel prefetches */ - 5, /* Branch cost */ + 3, /* Branch cost */ COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ COSTS_N_INSNS (4), /* cost of FMUL instruction. */ COSTS_N_INSNS (19), /* cost of FDIV instruction. */ @@ -724,8 +725,8 @@ struct processor_costs k8_cost = { 2, /* vec_align_load_cost. */ 3, /* vec_unalign_load_cost. */ 3, /* vec_store_cost. */ - 6, /* cond_taken_branch_cost. */ - 1, /* cond_not_taken_branch_cost. */ + 3, /* cond_taken_branch_cost. */ + 2, /* cond_not_taken_branch_cost. */ }; struct processor_costs amdfam10_cost = { @@ -786,7 +787,7 @@ struct processor_costs amdfam10_cost = { to limit number of prefetches at all, as their execution also takes some time). */ 100, /* number of parallel prefetches */ - 5, /* Branch cost */ + 2, /* Branch cost */ COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ COSTS_N_INSNS (4), /* cost of FMUL instruction. */ COSTS_N_INSNS (19), /* cost of FDIV instruction. */ @@ -811,7 +812,7 @@ struct processor_costs amdfam10_cost = { 2, /* vec_align_load_cost. */ 2, /* vec_unalign_load_cost. */ 2, /* vec_store_cost. */ - 6, /* cond_taken_branch_cost. */ + 2, /* cond_taken_branch_cost. */ 1, /* cond_not_taken_branch_cost. */ }; @@ -1429,7 +1430,7 @@ unsigned int ix86_tune_features[X86_TUNE_LAST] = { replacement is long decoded, so this split helps here as well. */ m_K6, - /* X86_TUNE_USE_VECTOR_CONVERTS: Preffer vector packed SSE conversion + /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion from integer to FP. */ m_AMDFAM10, }; @@ -1767,35 +1768,65 @@ int ix86_isa_flags = TARGET_64BIT_DEFAULT | TARGET_SUBTARGET_ISA_DEFAULT; was set or cleared on the command line. */ static int ix86_isa_flags_explicit; -/* Define a set of ISAs which aren't available for a given ISA. MMX - and SSE ISAs are handled separately. */ +/* Define a set of ISAs which are available when a given ISA is + enabled. MMX and SSE ISAs are handled separately. */ + +#define OPTION_MASK_ISA_MMX_SET OPTION_MASK_ISA_MMX +#define OPTION_MASK_ISA_3DNOW_SET \ + (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_MMX_SET) + +#define OPTION_MASK_ISA_SSE_SET OPTION_MASK_ISA_SSE +#define OPTION_MASK_ISA_SSE2_SET \ + (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE_SET) +#define OPTION_MASK_ISA_SSE3_SET \ + (OPTION_MASK_ISA_SSE3 | OPTION_MASK_ISA_SSE2_SET) +#define OPTION_MASK_ISA_SSSE3_SET \ + (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_SSE3_SET) +#define OPTION_MASK_ISA_SSE4_1_SET \ + (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSSE3_SET) +#define OPTION_MASK_ISA_SSE4_2_SET \ + (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_SSE4_1_SET) + +/* SSE4 includes both SSE4.1 and SSE4.2. 
-msse4 should be the same + as -msse4.2. */ +#define OPTION_MASK_ISA_SSE4_SET OPTION_MASK_ISA_SSE4_2_SET + +#define OPTION_MASK_ISA_SSE4A_SET \ + (OPTION_MASK_ISA_SSE4A | OPTION_MASK_ISA_SSE3_SET) +#define OPTION_MASK_ISA_SSE5_SET \ + (OPTION_MASK_ISA_SSE5 | OPTION_MASK_ISA_SSE4A_SET) + +/* Define a set of ISAs which aren't available when a given ISA is + disabled. MMX and SSE ISAs are handled separately. */ #define OPTION_MASK_ISA_MMX_UNSET \ - (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_3DNOW_UNSET) -#define OPTION_MASK_ISA_3DNOW_UNSET OPTION_MASK_ISA_3DNOW_A + (OPTION_MASK_ISA_MMX | OPTION_MASK_ISA_3DNOW_UNSET) +#define OPTION_MASK_ISA_3DNOW_UNSET \ + (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_3DNOW_A_UNSET) +#define OPTION_MASK_ISA_3DNOW_A_UNSET OPTION_MASK_ISA_3DNOW_A #define OPTION_MASK_ISA_SSE_UNSET \ - (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE2_UNSET) + (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_SSE2_UNSET) #define OPTION_MASK_ISA_SSE2_UNSET \ - (OPTION_MASK_ISA_SSE3 | OPTION_MASK_ISA_SSE3_UNSET) + (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE3_UNSET) #define OPTION_MASK_ISA_SSE3_UNSET \ - (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_SSSE3_UNSET) + (OPTION_MASK_ISA_SSE3 \ + | OPTION_MASK_ISA_SSSE3_UNSET \ + | OPTION_MASK_ISA_SSE4A_UNSET ) #define OPTION_MASK_ISA_SSSE3_UNSET \ - (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSE4_1_UNSET) + (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_SSE4_1_UNSET) #define OPTION_MASK_ISA_SSE4_1_UNSET \ - (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_SSE4_2_UNSET) -#define OPTION_MASK_ISA_SSE4_2_UNSET OPTION_MASK_ISA_SSE4A + (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSE4_2_UNSET) +#define OPTION_MASK_ISA_SSE4_2_UNSET OPTION_MASK_ISA_SSE4_2 -/* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same - as -msse4.1 -msse4.2. -mno-sse4 should the same as -mno-sse4.1. */ -#define OPTION_MASK_ISA_SSE4 \ - (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSE4_2) +/* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same + as -mno-sse4.1. */ #define OPTION_MASK_ISA_SSE4_UNSET OPTION_MASK_ISA_SSE4_1_UNSET -#define OPTION_MASK_ISA_SSE4A_UNSET OPTION_MASK_ISA_SSE4 +#define OPTION_MASK_ISA_SSE4A_UNSET \ + (OPTION_MASK_ISA_SSE4A | OPTION_MASK_ISA_SSE5_UNSET) -#define OPTION_MASK_ISA_SSE5_UNSET \ - (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_3DNOW_UNSET) +#define OPTION_MASK_ISA_SSE5_UNSET OPTION_MASK_ISA_SSE5 /* Vectorization library interface and handlers. 
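As an illustration (not part of the patch): the cascading _SET/_UNSET masks above are easiest to see in a tiny, self-contained sketch. The mask names and bit values below are invented stand-ins, not the real OPTION_MASK_ISA_* values; the point is only that enabling one ISA pulls in everything it implies, while disabling one also clears everything stacked on top of it.

#include <stdio.h>

#define ISA_SSE    0x01u
#define ISA_SSE2   0x02u
#define ISA_SSE3   0x04u
#define ISA_SSSE3  0x08u

#define ISA_SSE_SET    ISA_SSE
#define ISA_SSE2_SET   (ISA_SSE2 | ISA_SSE_SET)
#define ISA_SSE3_SET   (ISA_SSE3 | ISA_SSE2_SET)
#define ISA_SSSE3_SET  (ISA_SSSE3 | ISA_SSE3_SET)

#define ISA_SSSE3_UNSET ISA_SSSE3
#define ISA_SSE3_UNSET  (ISA_SSE3 | ISA_SSSE3_UNSET)
#define ISA_SSE2_UNSET  (ISA_SSE2 | ISA_SSE3_UNSET)
#define ISA_SSE_UNSET   (ISA_SSE | ISA_SSE2_UNSET)

int main (void)
{
  unsigned isa = 0;

  isa |= ISA_SSSE3_SET;        /* -mssse3 turns on SSSE3, SSE3, SSE2 and SSE.  */
  printf ("after -mssse3:   %#x\n", isa);   /* prints 0xf */

  isa &= ~ISA_SSE3_UNSET;      /* -mno-sse3 clears SSE3 and SSSE3, keeps SSE2/SSE.  */
  printf ("after -mno-sse3: %#x\n", isa);   /* prints 0x3 */
  return 0;
}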
*/ tree (*ix86_veclib_handler)(enum built_in_function, tree, tree) = NULL; @@ -1809,8 +1840,12 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) switch (code) { case OPT_mmmx: - ix86_isa_flags_explicit |= OPTION_MASK_ISA_MMX; - if (!value) + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_MMX_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_MMX_SET; + } + else { ix86_isa_flags &= ~OPTION_MASK_ISA_MMX_UNSET; ix86_isa_flags_explicit |= OPTION_MASK_ISA_MMX_UNSET; @@ -1818,8 +1853,12 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) return true; case OPT_m3dnow: - ix86_isa_flags_explicit |= OPTION_MASK_ISA_3DNOW; - if (!value) + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_3DNOW_SET; + } + else { ix86_isa_flags &= ~OPTION_MASK_ISA_3DNOW_UNSET; ix86_isa_flags_explicit |= OPTION_MASK_ISA_3DNOW_UNSET; @@ -1830,8 +1869,12 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) return false; case OPT_msse: - ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE; - if (!value) + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_SSE_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE_SET; + } + else { ix86_isa_flags &= ~OPTION_MASK_ISA_SSE_UNSET; ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE_UNSET; @@ -1839,8 +1882,12 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) return true; case OPT_msse2: - ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2; - if (!value) + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_SSE2_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2_SET; + } + else { ix86_isa_flags &= ~OPTION_MASK_ISA_SSE2_UNSET; ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2_UNSET; @@ -1848,8 +1895,12 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) return true; case OPT_msse3: - ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE3; - if (!value) + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_SSE3_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE3_SET; + } + else { ix86_isa_flags &= ~OPTION_MASK_ISA_SSE3_UNSET; ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE3_UNSET; @@ -1857,8 +1908,12 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) return true; case OPT_mssse3: - ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSSE3; - if (!value) + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_SSSE3_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSSE3_SET; + } + else { ix86_isa_flags &= ~OPTION_MASK_ISA_SSSE3_UNSET; ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSSE3_UNSET; @@ -1866,8 +1921,12 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) return true; case OPT_msse4_1: - ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_1; - if (!value) + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_1_SET; + } + else { ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_1_UNSET; ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_1_UNSET; @@ -1875,8 +1934,12 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) return true; case OPT_msse4_2: - ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_2; - if (!value) + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_2_SET; + } + else { ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_2_UNSET; ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_2_UNSET; @@ -1884,8 +1947,8 @@ 
ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) return true; case OPT_msse4: - ix86_isa_flags |= OPTION_MASK_ISA_SSE4; - ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4; + ix86_isa_flags |= OPTION_MASK_ISA_SSE4_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_SET; return true; case OPT_mno_sse4: @@ -1894,8 +1957,12 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) return true; case OPT_msse4a: - ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4A; - if (!value) + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_SSE4A_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4A_SET; + } + else { ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4A_UNSET; ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4A_UNSET; @@ -1903,8 +1970,12 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) return true; case OPT_msse5: - ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE5; - if (!value) + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_SSE5_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE5_SET; + } + else { ix86_isa_flags &= ~OPTION_MASK_ISA_SSE5_UNSET; ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE5_UNSET; @@ -1963,7 +2034,31 @@ override_options (void) {&amdfam10_cost, 32, 24, 32, 7, 32} }; - static const char * const cpu_names[] = TARGET_CPU_DEFAULT_NAMES; + static const char *const cpu_names[TARGET_CPU_DEFAULT_max] = + { + "generic", + "i386", + "i486", + "pentium", + "pentium-mmx", + "pentiumpro", + "pentium2", + "pentium3", + "pentium4", + "pentium-m", + "prescott", + "nocona", + "core2", + "geode", + "k6", + "k6-2", + "k6-3", + "athlon", + "athlon-4", + "k8", + "amdfam10" + }; + enum pta_flags { PTA_SSE = 1 << 0, @@ -2011,7 +2106,7 @@ override_options (void) {"pentium4", PROCESSOR_PENTIUM4, PTA_MMX |PTA_SSE | PTA_SSE2}, {"pentium4m", PROCESSOR_PENTIUM4, PTA_MMX | PTA_SSE | PTA_SSE2}, {"prescott", PROCESSOR_NOCONA, PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3}, - {"nocona", PROCESSOR_NOCONA, (PTA_64BIT + {"nocona", PROCESSOR_NOCONA, (PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_CX16 | PTA_NO_SAHF)}, {"core2", PROCESSOR_CORE2, (PTA_64BIT @@ -2138,7 +2233,7 @@ override_options (void) ix86_tune_string = ix86_arch_string; if (!ix86_tune_string) { - ix86_tune_string = cpu_names [TARGET_CPU_DEFAULT]; + ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT]; ix86_tune_defaulted = 1; } @@ -2181,7 +2276,7 @@ override_options (void) ix86_arch_string = TARGET_64BIT ? "x86-64" : "i386"; else ix86_arch_specified = 1; - + if (!strcmp (ix86_arch_string, "generic")) error ("generic CPU can be used only for -mtune= switch"); if (!strncmp (ix86_arch_string, "generic", 7)) @@ -2505,34 +2600,6 @@ override_options (void) if (!TARGET_80387) target_flags |= MASK_NO_FANCY_MATH_387; - /* Turn on SSE4A bultins for -msse5. */ - if (TARGET_SSE5) - ix86_isa_flags |= OPTION_MASK_ISA_SSE4A; - - /* Turn on SSE4.1 builtins for -msse4.2. */ - if (TARGET_SSE4_2) - ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1; - - /* Turn on SSSE3 builtins for -msse4.1. */ - if (TARGET_SSE4_1) - ix86_isa_flags |= OPTION_MASK_ISA_SSSE3; - - /* Turn on SSE3 builtins for -mssse3. */ - if (TARGET_SSSE3) - ix86_isa_flags |= OPTION_MASK_ISA_SSE3; - - /* Turn on SSE3 builtins for -msse4a. */ - if (TARGET_SSE4A) - ix86_isa_flags |= OPTION_MASK_ISA_SSE3; - - /* Turn on SSE2 builtins for -msse3. */ - if (TARGET_SSE3) - ix86_isa_flags |= OPTION_MASK_ISA_SSE2; - - /* Turn on SSE builtins for -msse2. 
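As an illustration (not part of the patch): every per-option case above follows the same shape, sketched here with placeholder names (handle_isa_option, isa_flags and isa_flags_explicit are not the real GCC identifiers). -mfoo ORs in the whole implied set, -mno-foo clears the whole dependent set, and both record what the user asked for explicitly so that override_options can respect it later.

static unsigned isa_flags;            /* ISAs currently enabled */
static unsigned isa_flags_explicit;   /* bits the user touched on the command line */

static void
handle_isa_option (int value, unsigned set_mask, unsigned unset_mask)
{
  if (value)
    {
      /* -mfoo: enable FOO plus everything it implies.  */
      isa_flags |= set_mask;
      isa_flags_explicit |= set_mask;
    }
  else
    {
      /* -mno-foo: disable FOO plus everything that depends on it.  */
      isa_flags &= ~unset_mask;
      isa_flags_explicit |= unset_mask;
    }
}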
*/ - if (TARGET_SSE2) - ix86_isa_flags |= OPTION_MASK_ISA_SSE; - /* Turn on MMX builtins for -msse. */ if (TARGET_SSE) { @@ -2540,10 +2607,6 @@ override_options (void) x86_prefetch_sse = true; } - /* Turn on MMX builtins for 3Dnow. */ - if (TARGET_3DNOW) - ix86_isa_flags |= OPTION_MASK_ISA_MMX; - /* Turn on popcnt instruction for -msse4.2 or -mabm. */ if (TARGET_SSE4_2 || TARGET_ABM) x86_popcnt = true; @@ -2637,6 +2700,18 @@ override_options (void) target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; } + /* If stack probes are required, the space used for large function + arguments on the stack must also be probed, so enable + -maccumulate-outgoing-args so this happens in the prologue. */ + if (TARGET_STACK_PROBE + && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) + { + if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS) + warning (0, "stack probing requires -maccumulate-outgoing-args " + "for correctness"); + target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; + } + /* For sane SSE instruction set generation we need fcomi instruction. It is safe to enable all CMOVE instructions. */ if (TARGET_SSE) @@ -2665,6 +2740,11 @@ override_options (void) set_param_value ("l1-cache-size", ix86_cost->l1_cache_size); if (!PARAM_SET_P (PARAM_L2_CACHE_SIZE)) set_param_value ("l2-cache-size", ix86_cost->l2_cache_size); + + /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0) + can be optimized to ap = __builtin_next_arg (0). */ + if (!TARGET_64BIT || TARGET_64BIT_MS_ABI) + targetm.expand_builtin_va_start = NULL; } /* Return true if this goes in large data/bss. */ @@ -2904,6 +2984,7 @@ optimization_options (int level, int size ATTRIBUTE_UNUSED) flag_omit_frame_pointer = 2; flag_pcc_struct_return = 2; flag_asynchronous_unwind_tables = 2; + flag_vect_cost_model = 1; #ifdef SUBTARGET_OPTIMIZATION_OPTIONS SUBTARGET_OPTIMIZATION_OPTIONS; #endif @@ -3118,7 +3199,8 @@ ix86_comp_type_attributes (const_tree type1, const_tree type2) /* Check for mismatch of non-default calling convention. */ const char *const rtdstr = TARGET_RTD ? "cdecl" : "stdcall"; - if (TREE_CODE (type1) != FUNCTION_TYPE) + if (TREE_CODE (type1) != FUNCTION_TYPE + && TREE_CODE (type1) != METHOD_TYPE) return 1; /* Check for mismatched fastcall/regparm types. */ @@ -3173,9 +3255,9 @@ ix86_function_regparm (const_tree type, const_tree decl) struct function *f; /* Make sure no regparm register is taken by a - global register variable. */ - for (local_regparm = 0; local_regparm < 3; local_regparm++) - if (global_regs[local_regparm]) + fixed register variable. */ + for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++) + if (fixed_regs[local_regparm]) break; /* We can't use regparm(3) for nested functions as these use @@ -3197,12 +3279,14 @@ ix86_function_regparm (const_tree type, const_tree decl) TYPE_ATTRIBUTES (TREE_TYPE (decl))))) local_regparm = 2; - /* Each global register variable increases register preassure, - so the more global reg vars there are, the smaller regparm - optimization use, unless requested by the user explicitly. */ - for (regno = 0; regno < 6; regno++) - if (global_regs[regno]) + /* Each fixed register usage increases register pressure, + so less registers should be used for argument passing. + This functionality can be overriden by an explicit + regparm value. */ + for (regno = 0; regno <= DI_REG; regno++) + if (fixed_regs[regno]) globals++; + local_regparm = globals < local_regparm ? 
local_regparm - globals : 0; @@ -3220,7 +3304,7 @@ ix86_function_regparm (const_tree type, const_tree decl) indirectly or considering a libcall. Otherwise return 0. */ static int -ix86_function_sseregparm (const_tree type, const_tree decl) +ix86_function_sseregparm (const_tree type, const_tree decl, bool warn) { gcc_assert (!TARGET_64BIT); @@ -3231,12 +3315,15 @@ ix86_function_sseregparm (const_tree type, const_tree decl) { if (!TARGET_SSE) { - if (decl) - error ("Calling %qD with attribute sseregparm without " - "SSE/SSE2 enabled", decl); - else - error ("Calling %qT with attribute sseregparm without " - "SSE/SSE2 enabled", type); + if (warn) + { + if (decl) + error ("Calling %qD with attribute sseregparm without " + "SSE/SSE2 enabled", decl); + else + error ("Calling %qT with attribute sseregparm without " + "SSE/SSE2 enabled", type); + } return 0; } @@ -3360,7 +3447,7 @@ ix86_function_arg_regno_p (int regno) } /* RAX is used as hidden argument to va_arg functions. */ - if (!TARGET_64BIT_MS_ABI && regno == 0) + if (!TARGET_64BIT_MS_ABI && regno == AX_REG) return true; if (TARGET_64BIT_MS_ABI) @@ -3398,6 +3485,7 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ rtx libname, /* SYMBOL_REF of library name or 0 */ tree fndecl) { + struct cgraph_local_info *i = fndecl ? cgraph_local_info (fndecl) : NULL; memset (cum, 0, sizeof (*cum)); /* Set up the number of registers to use for passing arguments. */ @@ -3408,6 +3496,15 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ cum->mmx_nregs = MMX_REGPARM_MAX; cum->warn_sse = true; cum->warn_mmx = true; + + /* Because type might mismatch in between caller and callee, we need to + use actual type of function for local calls. + FIXME: cgraph_analyze can be told to actually record if function uses + va_start so for local functions maybe_vaarg can be made aggressive + helping K&R code. + FIXME: once typesytem is fixed, we won't need this code anymore. */ + if (i && i->local) + fntype = TREE_TYPE (fndecl); cum->maybe_vaarg = (fntype ? (!prototype_p (fntype) || stdarg_p (fntype)) : !libname); @@ -3441,7 +3538,7 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ /* Set up the number of SSE registers used for passing SFmode and DFmode arguments. Warn for mismatching ABI. */ - cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl); + cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true); } } @@ -3741,7 +3838,7 @@ classify_argument (enum machine_mode mode, const_tree type, } /* for V1xx modes, just use the base mode */ - if (VECTOR_MODE_P (mode) + if (VECTOR_MODE_P (mode) && mode != V1DImode && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes) mode = GET_MODE_INNER (mode); @@ -3813,6 +3910,7 @@ classify_argument (enum machine_mode mode, const_tree type, classes[0] = X86_64_SSE_CLASS; classes[1] = X86_64_SSEUP_CLASS; return 2; + case V1DImode: case V2SFmode: case V2SImode: case V4HImode: @@ -4114,6 +4212,7 @@ function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode, case V4HImode: case V2SImode: case V2SFmode: + case V1DImode: if (!type || !AGGREGATE_TYPE_P (type)) { cum->mmx_words += words; @@ -4228,15 +4327,18 @@ function_arg_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode, int regno = cum->regno; /* Fastcall allocates the first two DWORD (SImode) or - smaller arguments to ECX and EDX. */ + smaller arguments to ECX and EDX if it isn't an + aggregate type . 
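As an illustration (not part of the patch): the regparm adjustment above boils down to the following, written with invented plain-C names (usable_regparm and its parameters are not GCC code). Each register the user has made fixed, for example with -ffixed-ecx, is unavailable for argument passing, so the effective regparm count drops by the number of fixed registers and bottoms out at zero.

static int
usable_regparm (int wanted, const int *reg_is_fixed, int nregs)
{
  int fixed_count = 0, i;

  for (i = 0; i < nregs; i++)
    if (reg_is_fixed[i])
      fixed_count++;

  /* Mirrors: local_regparm = globals < local_regparm
                              ? local_regparm - globals : 0;  */
  return fixed_count < wanted ? wanted - fixed_count : 0;
}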
*/ if (cum->fastcall) { - if (mode == BLKmode || mode == DImode) + if (mode == BLKmode + || mode == DImode + || (type && AGGREGATE_TYPE_P (type))) break; /* ECX not EAX is the first allocated register. */ - if (regno == 0) - regno = 2; + if (regno == AX_REG) + regno = CX_REG; } return gen_rtx_REG (mode, regno); } @@ -4274,6 +4376,7 @@ function_arg_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode, case V4HImode: case V2SImode: case V2SFmode: + case V1DImode: if (!type || !AGGREGATE_TYPE_P (type)) { if (!TARGET_MMX && !warnedmmx && cum->warn_mmx) @@ -4557,13 +4660,13 @@ function_value_32 (enum machine_mode orig_mode, enum machine_mode mode, regno = FIRST_FLOAT_REG; else /* Most things go in %eax. */ - regno = 0; - + regno = AX_REG; + /* Override FP return register with %xmm0 for local functions when SSE math is enabled or for functions with sseregparm attribute. */ if ((fn || fntype) && (mode == SFmode || mode == DFmode)) { - int sse_level = ix86_function_sseregparm (fntype, fn); + int sse_level = ix86_function_sseregparm (fntype, fn, false); if ((sse_level >= 1 && mode == SFmode) || (sse_level == 2 && mode == DFmode)) regno = FIRST_SSE_REG; @@ -4598,7 +4701,7 @@ function_value_64 (enum machine_mode orig_mode, enum machine_mode mode, case TCmode: return NULL; default: - return gen_rtx_REG (mode, 0); + return gen_rtx_REG (mode, AX_REG); } } @@ -4609,7 +4712,7 @@ function_value_64 (enum machine_mode orig_mode, enum machine_mode mode, /* For zero sized structures, construct_container returns NULL, but we need to keep rest of compiler happy by returning meaningful value. */ if (!ret) - ret = gen_rtx_REG (orig_mode, 0); + ret = gen_rtx_REG (orig_mode, AX_REG); return ret; } @@ -4617,7 +4720,7 @@ function_value_64 (enum machine_mode orig_mode, enum machine_mode mode, static rtx function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode) { - unsigned int regno = 0; + unsigned int regno = AX_REG; if (TARGET_SSE) { @@ -4721,11 +4824,12 @@ return_in_memory_ms_64 (const_tree type, enum machine_mode mode) HOST_WIDE_INT size = int_size_in_bytes (type); /* __m128 and friends are returned in xmm0. */ - if (size == 16 && VECTOR_MODE_P (mode)) + if (!COMPLEX_MODE_P (mode) && size == 16 && VECTOR_MODE_P (mode)) return 0; - /* Otherwise, the size must be exactly in [1248]. */ - return (size != 1 && size != 2 && size != 4 && size != 8); + /* Otherwise, the size must be exactly in [1248]. But not for complex. */ + return (size != 1 && size != 2 && size != 4 && size != 8) + || COMPLEX_MODE_P (mode); } int @@ -4746,7 +4850,7 @@ ix86_return_in_memory (const_tree type) but differs notably in that when MMX is available, 8-byte vectors are returned in memory, rather than in MMX registers. */ -int +int ix86_sol10_return_in_memory (const_tree type) { int size; @@ -4889,7 +4993,7 @@ setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum) ix86_save_varrargs_registers = 1; /* We need 16-byte stack alignment to save SSE registers. If user asked for lower preferred_stack_boundary, lets just hope that he knows - what he is doing and won't varargs SSE values. + what he is doing and won't varargs SSE values. We also may end up assuming that only 64bit values are stored in SSE register let some floating point program work. 
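As an illustration (not part of the patch): a small model of the fastcall rule implemented above. fastcall_arg_location and its arguments are invented for the sketch; the rule itself is that the first two DWORD-or-smaller, non-aggregate arguments go in ECX and EDX, and everything else, including aggregates, goes on the stack.

enum fastcall_loc { IN_ECX, IN_EDX, ON_STACK };

static enum fastcall_loc
fastcall_arg_location (int int_arg_index, int size_in_bytes, int is_aggregate)
{
  if (is_aggregate || size_in_bytes > 4)
    return ON_STACK;
  if (int_arg_index == 0)
    return IN_ECX;
  if (int_arg_index == 1)
    return IN_EDX;
  return ON_STACK;
}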
*/ @@ -4926,7 +5030,7 @@ setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum) label - 5*eax + nnamed_sse_arguments*5 */ tmp_reg = gen_reg_rtx (Pmode); nsse_reg = gen_reg_rtx (Pmode); - emit_insn (gen_zero_extendqidi2 (nsse_reg, gen_rtx_REG (QImode, 0))); + emit_insn (gen_zero_extendqidi2 (nsse_reg, gen_rtx_REG (QImode, AX_REG))); emit_insn (gen_rtx_SET (VOIDmode, tmp_reg, gen_rtx_MULT (Pmode, nsse_reg, GEN_INT (4)))); @@ -5011,7 +5115,7 @@ ix86_setup_incoming_varargs (CUMULATIVE_ARGS *cum, enum machine_mode mode, /* Implement va_start. */ -void +static void ix86_va_start (tree valist, rtx nextarg) { HOST_WIDE_INT words, n_gpr, n_fpr; @@ -6172,7 +6276,7 @@ ix86_internal_arg_pointer (void) ix86_force_align_arg_pointer_string); return virtual_incoming_args_rtx; } - cfun->machine->force_align_arg_pointer = gen_rtx_REG (Pmode, 2); + cfun->machine->force_align_arg_pointer = gen_rtx_REG (Pmode, CX_REG); return copy_to_reg (cfun->machine->force_align_arg_pointer); } else @@ -6281,8 +6385,12 @@ ix86_expand_prologue (void) allocate += frame.nregs * UNITS_PER_WORD; /* When using red zone we may start register saving before allocating - the stack frame saving one cycle of the prologue. */ - if (TARGET_RED_ZONE && frame.save_regs_using_mov) + the stack frame saving one cycle of the prologue. However I will + avoid doing this if I am going to have to probe the stack since + at least on x86_64 the stack probe can turn into a call that clobbers + a red zone location */ + if (TARGET_RED_ZONE && frame.save_regs_using_mov + && (! TARGET_STACK_PROBE || allocate < CHECK_STACK_LIMIT)) ix86_emit_save_regs_using_mov (frame_pointer_needed ? hard_frame_pointer_rtx : stack_pointer_rtx, -frame.nregs * UNITS_PER_WORD); @@ -6295,7 +6403,7 @@ ix86_expand_prologue (void) else { /* Only valid for Win32. */ - rtx eax = gen_rtx_REG (Pmode, 0); + rtx eax = gen_rtx_REG (Pmode, AX_REG); bool eax_live; rtx t; @@ -6338,7 +6446,9 @@ ix86_expand_prologue (void) } } - if (frame.save_regs_using_mov && !TARGET_RED_ZONE) + if (frame.save_regs_using_mov + && !(TARGET_RED_ZONE + && (! TARGET_STACK_PROBE || allocate < CHECK_STACK_LIMIT))) { if (!frame_pointer_needed || !frame.to_allocate) ix86_emit_save_regs_using_mov (stack_pointer_rtx, frame.to_allocate); @@ -6366,8 +6476,7 @@ ix86_expand_prologue (void) { if (ix86_cmodel == CM_LARGE_PIC) { - rtx tmp_reg = gen_rtx_REG (DImode, - FIRST_REX_INT_REG + 3 /* R11 */); + rtx tmp_reg = gen_rtx_REG (DImode, R11_REG); rtx label = gen_label_rtx (); emit_label (label); LABEL_PRESERVE_P (label) = 1; @@ -6580,7 +6689,7 @@ ix86_expand_epilogue (int style) if (current_function_pops_args >= 65536) { - rtx ecx = gen_rtx_REG (SImode, 2); + rtx ecx = gen_rtx_REG (SImode, CX_REG); /* There is no "pascal" calling convention in any 64bit ABI. */ gcc_assert (!TARGET_64BIT); @@ -6756,7 +6865,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) /* Special case: on K6, [%esi] makes the instruction vector decoded. Avoid this by transforming to [%esi+0]. */ - if (ix86_tune == PROCESSOR_K6 && !optimize_size + if (TARGET_K6 && !optimize_size && base_reg && !index_reg && !disp && REG_P (base_reg) && REGNO_REG_CLASS (REGNO (base_reg)) == SIREG) @@ -7643,7 +7752,7 @@ legitimize_tls_address (rtx x, enum tls_model model, int for_mov) if (TARGET_64BIT && ! 
TARGET_GNU2_TLS) { - rtx rax = gen_rtx_REG (Pmode, 0), insns; + rtx rax = gen_rtx_REG (Pmode, AX_REG), insns; start_sequence (); emit_call_insn (gen_tls_global_dynamic_64 (rax, x)); @@ -7672,7 +7781,7 @@ legitimize_tls_address (rtx x, enum tls_model model, int for_mov) if (TARGET_64BIT && ! TARGET_GNU2_TLS) { - rtx rax = gen_rtx_REG (Pmode, 0), insns, note; + rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, note; start_sequence (); emit_call_insn (gen_tls_local_dynamic_base_64 (rax)); @@ -7838,6 +7947,7 @@ get_dllimport_decl (tree decl) set_mem_alias_set (rtl, ix86_GOT_alias_set ()); SET_DECL_RTL (to, rtl); + SET_DECL_ASSEMBLER_NAME (to, get_identifier (name)); return to; } @@ -8183,7 +8293,8 @@ output_pic_addr_const (FILE *file, rtx x, int code) fputs ("@PLTOFF", file); break; case UNSPEC_GOTPCREL: - fputs ("@GOTPCREL(%rip)", file); + fputs (ASSEMBLER_DIALECT == ASM_ATT ? + "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file); break; case UNSPEC_GOTTPOFF: /* FIXME: This might be @TPOFF in Sun ld too. */ @@ -8203,7 +8314,8 @@ output_pic_addr_const (FILE *file, rtx x, int code) break; case UNSPEC_GOTNTPOFF: if (TARGET_64BIT) - fputs ("@GOTTPOFF(%rip)", file); + fputs (ASSEMBLER_DIALECT == ASM_ATT ? + "@GOTTPOFF(%rip)": "@GOTTPOFF[rip]", file); else fputs ("@GOTNTPOFF", file); break; @@ -8529,15 +8641,23 @@ put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse, void print_reg (rtx x, int code, FILE *file) { - gcc_assert (REGNO (x) != ARG_POINTER_REGNUM - && REGNO (x) != FRAME_POINTER_REGNUM - && REGNO (x) != FLAGS_REG - && REGNO (x) != FPSR_REG - && REGNO (x) != FPCR_REG); + gcc_assert (x == pc_rtx + || (REGNO (x) != ARG_POINTER_REGNUM + && REGNO (x) != FRAME_POINTER_REGNUM + && REGNO (x) != FLAGS_REG + && REGNO (x) != FPSR_REG + && REGNO (x) != FPCR_REG)); - if (ASSEMBLER_DIALECT == ASM_ATT || USER_LABEL_PREFIX[0] == 0) + if (ASSEMBLER_DIALECT == ASM_ATT) putc ('%', file); + if (x == pc_rtx) + { + gcc_assert (TARGET_64BIT); + fputs ("rip", file); + return; + } + if (code == 'w' || MMX_REG_P (x)) code = 2; else if (code == 'b') @@ -9034,8 +9154,9 @@ print_operand (FILE *file, rtx x, int code) else if (MEM_P (x)) { - /* No `byte ptr' prefix for call instructions. */ - if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P') + /* No `byte ptr' prefix for call instructions or BLKmode operands. */ + if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P' + && GET_MODE (x) != BLKmode) { const char * size; switch (GET_MODE_SIZE (GET_MODE (x))) @@ -9045,7 +9166,12 @@ print_operand (FILE *file, rtx x, int code) case 4: size = "DWORD"; break; case 8: size = "QWORD"; break; case 12: size = "XWORD"; break; - case 16: size = "XMMWORD"; break; + case 16: + if (GET_MODE (x) == XFmode) + size = "XWORD"; + else + size = "XMMWORD"; + break; default: gcc_unreachable (); } @@ -9163,7 +9289,7 @@ print_operand_address (FILE *file, rtx addr) break; case SEG_FS: case SEG_GS: - if (USER_LABEL_PREFIX[0] == 0) + if (ASSEMBLER_DIALECT == ASM_ATT) putc ('%', file); fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file); break; @@ -9171,6 +9297,21 @@ print_operand_address (FILE *file, rtx addr) gcc_unreachable (); } + /* Use one byte shorter RIP relative addressing for 64bit mode. 
*/ + if (TARGET_64BIT && !base && !index) + { + rtx symbol = disp; + + if (GET_CODE (disp) == CONST + && GET_CODE (XEXP (disp, 0)) == PLUS + && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) + symbol = XEXP (XEXP (disp, 0), 0); + + if (GET_CODE (symbol) == LABEL_REF + || (GET_CODE (symbol) == SYMBOL_REF + && SYMBOL_REF_TLS_MODEL (symbol) == 0)) + base = pc_rtx; + } if (!base && !index) { /* Displacement only requires special attention. */ @@ -9178,30 +9319,13 @@ print_operand_address (FILE *file, rtx addr) if (CONST_INT_P (disp)) { if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT) - { - if (USER_LABEL_PREFIX[0] == 0) - putc ('%', file); - fputs ("ds:", file); - } + fputs ("ds:", file); fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp)); } else if (flag_pic) output_pic_addr_const (file, disp, 0); else output_addr_const (file, disp); - - /* Use one byte shorter RIP relative addressing for 64bit mode. */ - if (TARGET_64BIT) - { - if (GET_CODE (disp) == CONST - && GET_CODE (XEXP (disp, 0)) == PLUS - && CONST_INT_P (XEXP (XEXP (disp, 0), 1))) - disp = XEXP (XEXP (disp, 0), 0); - if (GET_CODE (disp) == LABEL_REF - || (GET_CODE (disp) == SYMBOL_REF - && SYMBOL_REF_TLS_MODEL (disp) == 0)) - fputs ("(%rip)", file); - } } else { @@ -9317,7 +9441,8 @@ output_addr_const_extra (FILE *file, rtx x) case UNSPEC_GOTNTPOFF: output_addr_const (file, op); if (TARGET_64BIT) - fputs ("@GOTTPOFF(%rip)", file); + fputs (ASSEMBLER_DIALECT == ASM_ATT ? + "@GOTTPOFF(%rip)" : "@GOTTPOFF[rip]", file); else fputs ("@GOTNTPOFF", file); break; @@ -10028,7 +10153,6 @@ maybe_get_pool_constant (rtx x) void ix86_expand_move (enum machine_mode mode, rtx operands[]) { - int strict = (reload_in_progress || reload_completed); rtx op0, op1; enum tls_model model; @@ -10122,31 +10246,29 @@ ix86_expand_move (enum machine_mode mode, rtx operands[]) /* Force large constants in 64bit compilation into register to get them CSEed. */ - if (TARGET_64BIT && mode == DImode + if (can_create_pseudo_p () + && (mode == DImode) && TARGET_64BIT && immediate_operand (op1, mode) && !x86_64_zext_immediate_operand (op1, VOIDmode) && !register_operand (op0, mode) - && optimize && !reload_completed && !reload_in_progress) + && optimize) op1 = copy_to_mode_reg (mode, op1); - if (FLOAT_MODE_P (mode)) + if (can_create_pseudo_p () + && FLOAT_MODE_P (mode) + && GET_CODE (op1) == CONST_DOUBLE) { /* If we are loading a floating point constant to a register, force the value to memory now, since we'll get better code out the back end. */ - if (strict) - ; - else if (GET_CODE (op1) == CONST_DOUBLE) + op1 = validize_mem (force_const_mem (mode, op1)); + if (!register_operand (op0, mode)) { - op1 = validize_mem (force_const_mem (mode, op1)); - if (!register_operand (op0, mode)) - { - rtx temp = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (VOIDmode, temp, op1)); - emit_move_insn (op0, temp); - return; - } + rtx temp = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (VOIDmode, temp, op1)); + emit_move_insn (op0, temp); + return; } } } @@ -10164,7 +10286,7 @@ ix86_expand_vector_move (enum machine_mode mode, rtx operands[]) the instructions used to build constants modify the upper 64 bits of the register, once we have that information we may be able to handle some of them more efficiently. 
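As an illustration (not part of the patch): the condition under which the new code picks RIP-relative addressing can be restated in plain C. The struct and predicate below are stand-ins for the real RTL tests; the idea is that in 64-bit mode a memory reference with no base and no index whose displacement is a non-TLS symbol or label is emitted RIP-relative, which is one byte shorter than an absolute 32-bit address.

struct addr_parts
{
  int has_base;
  int has_index;
  int disp_is_symbol_or_label;
  int disp_is_tls;
};

static int
use_rip_relative (int target_64bit, const struct addr_parts *a)
{
  return target_64bit
         && !a->has_base
         && !a->has_index
         && a->disp_is_symbol_or_label
         && !a->disp_is_tls;
}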
*/ - if ((reload_in_progress | reload_completed) == 0 + if (can_create_pseudo_p () && register_operand (op0, mode) && (CONSTANT_P (op1) || (GET_CODE (op1) == SUBREG @@ -10172,7 +10294,7 @@ ix86_expand_vector_move (enum machine_mode mode, rtx operands[]) && standard_sse_constant_p (op1) <= 0) op1 = validize_mem (force_const_mem (mode, op1)); - /* TDmode values are passed as TImode on the stack. Timode values + /* TDmode values are passed as TImode on the stack. TImode values are moved via xmm registers, and moving them to stack can result in unaligned memory access. Use ix86_expand_vector_move_misalign() if memory operand is not aligned correctly. */ @@ -10781,6 +10903,14 @@ ix86_expand_convert_uns_didf_sse (rtx target, rtx input) ix86_expand_vector_extract (false, target, fp_xmm, 0); } +/* Not used, but eases macroization of patterns. */ +void +ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED, + rtx input ATTRIBUTE_UNUSED) +{ + gcc_unreachable (); +} + /* Convert an unsigned SImode value into a DFmode. Only currently used for SSE, but applicable anywhere. */ @@ -10964,7 +11094,6 @@ ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode, rtx operands[]) { rtx mask, set, use, clob, dst, src; - bool matching_memory; bool use_sse = false; bool vector_mode = VECTOR_MODE_P (mode); enum machine_mode elt_mode = mode; @@ -10989,19 +11118,6 @@ ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode, dst = operands[0]; src = operands[1]; - /* If the destination is memory, and we don't have matching source - operands or we're using the x87, do things in registers. */ - matching_memory = false; - if (MEM_P (dst)) - { - if (use_sse && rtx_equal_p (dst, src)) - matching_memory = true; - else - dst = gen_reg_rtx (mode); - } - if (MEM_P (src) && !matching_memory) - src = force_reg (mode, src); - if (vector_mode) { set = gen_rtx_fmt_ee (code == NEG ? XOR : AND, mode, src, mask); @@ -11022,9 +11138,6 @@ ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode, else emit_insn (set); } - - if (dst != operands[0]) - emit_move_insn (operands[0], dst); } /* Expand a copysign operation. Special case operand 0 being a constant. */ @@ -12044,16 +12157,30 @@ ix86_expand_branch (enum rtx_code code, rtx label) /* Otherwise, if we are doing less-than or greater-or-equal-than, op1 is a constant and the low word is zero, then we can just - examine the high word. */ + examine the high word. Similarly for low word -1 and + less-or-equal-than or greater-than. 
*/ - if (CONST_INT_P (hi[1]) && lo[1] == const0_rtx) + if (CONST_INT_P (hi[1])) switch (code) { case LT: case LTU: case GE: case GEU: - ix86_compare_op0 = hi[0]; - ix86_compare_op1 = hi[1]; - ix86_expand_branch (code, label); - return; + if (lo[1] == const0_rtx) + { + ix86_compare_op0 = hi[0]; + ix86_compare_op1 = hi[1]; + ix86_expand_branch (code, label); + return; + } + break; + case LE: case LEU: case GT: case GTU: + if (lo[1] == constm1_rtx) + { + ix86_compare_op0 = hi[0]; + ix86_compare_op1 = hi[1]; + ix86_expand_branch (code, label); + return; + } + break; default: break; } @@ -13443,8 +13570,8 @@ ix86_expand_sse4_unpack (rtx operands[2], bool unsigned_p, bool high_p) #define PPERM_REV_INV 0x60 /* bit reverse & invert src */ #define PPERM_ZERO 0x80 /* all 0's */ #define PPERM_ONES 0xa0 /* all 1's */ -#define PPERM_SIGN 0xc0 /* propigate sign bit */ -#define PPERM_INV_SIGN 0xe0 /* invert & propigate sign */ +#define PPERM_SIGN 0xc0 /* propagate sign bit */ +#define PPERM_INV_SIGN 0xe0 /* invert & propagate sign */ #define PPERM_SRC1 0x00 /* use first source byte */ #define PPERM_SRC2 0x10 /* use second source byte */ @@ -13539,15 +13666,15 @@ ix86_expand_sse5_unpack (rtx operands[2], bool unsigned_p, bool high_p) for (i = 0; i < 16; i++) RTVEC_ELT (v, i) = GEN_INT (pperm_bytes[i]); - for (i = 0; i < 4; i++) + for (i = 0; i < 2; i++) RTVEC_ELT (vs, i) = GEN_INT (i + h2); p = gen_rtx_PARALLEL (VOIDmode, vs); x = force_reg (V16QImode, gen_rtx_CONST_VECTOR (V16QImode, v)); if (unsigned_p) - emit_insn (gen_sse5_pperm_zero_v8hi_v4si (op0, op1, p, x)); + emit_insn (gen_sse5_pperm_zero_v4si_v2di (op0, op1, p, x)); else - emit_insn (gen_sse5_pperm_sign_v8hi_v4si (op0, op1, p, x)); + emit_insn (gen_sse5_pperm_sign_v4si_v2di (op0, op1, p, x)); break; default: @@ -15057,21 +15184,33 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset, int *dynamic_check) { const struct stringop_algs * algs; + /* Algorithms using the rep prefix want at least edi and ecx; + additionally, memset wants eax and memcpy wants esi. Don't + consider such algorithms if the user has appropriated those + registers for their own purposes. */ + bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG] + || (memset + ? fixed_regs[AX_REG] : fixed_regs[SI_REG])); + +#define ALG_USABLE_P(alg) (rep_prefix_usable \ + || (alg != rep_prefix_1_byte \ + && alg != rep_prefix_4_byte \ + && alg != rep_prefix_8_byte)) *dynamic_check = -1; if (memset) algs = &ix86_cost->memset[TARGET_64BIT != 0]; else algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; - if (stringop_alg != no_stringop) + if (stringop_alg != no_stringop && ALG_USABLE_P (stringop_alg)) return stringop_alg; /* rep; movq or rep; movl is the smallest variant. */ else if (optimize_size) { if (!count || (count & 3)) - return rep_prefix_1_byte; + return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte; else - return rep_prefix_4_byte; + return rep_prefix_usable ? rep_prefix_4_byte : loop; } /* Very tiny blocks are best handled via the loop, REP is expensive to setup. */ @@ -15083,27 +15222,34 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset, enum stringop_alg alg = libcall; for (i = 0; i < NAX_STRINGOP_ALGS; i++) { - gcc_assert (algs->size[i].max); + /* We get here if the algorithms that were not libcall-based + were rep-prefix based and we are unable to use rep prefixes + based on global register usage. Break out of the loop and + use the heuristic below. 
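As an illustration (not part of the patch): the comment's claim can be checked directly in C, shown here for the unsigned case. When the constant's low word is all zeros, below/above-or-equal depend only on the high words; when its low word is all ones, below-or-equal/above depend only on the high words. The helper names are invented for the demonstration.

#include <stdint.h>
#include <stdio.h>

/* x < c when the low 32 bits of c are zero: only the high words matter.  */
static int lt_via_high_words (uint64_t x, uint64_t c)
{
  return (uint32_t)(x >> 32) < (uint32_t)(c >> 32);
}

/* x <= c when the low 32 bits of c are all ones: only the high words matter.  */
static int le_via_high_words (uint64_t x, uint64_t c)
{
  return (uint32_t)(x >> 32) <= (uint32_t)(c >> 32);
}

int main (void)
{
  uint64_t x   = 0x12345678abcdef00ULL;
  uint64_t c0  = 0x2000000000000000ULL;   /* low word == 0  */
  uint64_t cm1 = 0x12345678ffffffffULL;   /* low word == ~0 */

  printf ("%d %d\n", x < c0,   lt_via_high_words (x, c0));   /* 1 1 */
  printf ("%d %d\n", x <= cm1, le_via_high_words (x, cm1));  /* 1 1 */
  return 0;
}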
*/ + if (algs->size[i].max == 0) + break; if (algs->size[i].max >= expected_size || algs->size[i].max == -1) { - if (algs->size[i].alg != libcall) - alg = algs->size[i].alg; + enum stringop_alg candidate = algs->size[i].alg; + + if (candidate != libcall && ALG_USABLE_P (candidate)) + alg = candidate; /* Honor TARGET_INLINE_ALL_STRINGOPS by picking - last non-libcall inline algorithm. */ + last non-libcall inline algorithm. */ if (TARGET_INLINE_ALL_STRINGOPS) { /* When the current size is best to be copied by a libcall, - but we are still forced to inline, run the heuristic bellow + but we are still forced to inline, run the heuristic below that will pick code for medium sized blocks. */ if (alg != libcall) return alg; break; } - else - return algs->size[i].alg; + else if (ALG_USABLE_P (candidate)) + return candidate; } } - gcc_assert (TARGET_INLINE_ALL_STRINGOPS); + gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable); } /* When asked to inline the call anyway, try to pick meaningful choice. We look for maximal size of block that is faster to copy by hand and @@ -15113,15 +15259,32 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset, If this turns out to be bad, we might simply specify the preferred choice in ix86_costs. */ if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY) - && algs->unknown_size == libcall) + && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size))) { int max = -1; enum stringop_alg alg; int i; + bool any_alg_usable_p = true; for (i = 0; i < NAX_STRINGOP_ALGS; i++) - if (algs->size[i].alg != libcall && algs->size[i].alg) - max = algs->size[i].max; + { + enum stringop_alg candidate = algs->size[i].alg; + any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate); + + if (candidate != libcall && candidate + && ALG_USABLE_P (candidate)) + max = algs->size[i].max; + } + /* If there aren't any usable algorithms, then recursing on + smaller sizes isn't going to find anything. Just return the + simple byte-at-a-time copy loop. */ + if (!any_alg_usable_p) + { + /* Pick something reasonable. */ + if (TARGET_INLINE_STRINGOPS_DYNAMICALLY) + *dynamic_check = 128; + return loop_1_byte; + } if (max == -1) max = 4096; alg = decide_alg (count, max / 2, memset, dynamic_check); @@ -15131,7 +15294,8 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset, *dynamic_check = max; return alg; } - return algs->unknown_size; + return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall; +#undef ALG_USABLE_P } /* Decide on alignment. We know that the operand is already aligned to ALIGN @@ -15196,7 +15360,7 @@ smallest_pow2_greater_than (int val) } /* Expand string move (memcpy) operation. Use i386 string operations when - profitable. expand_clrmem contains similar code. The code depends upon + profitable. expand_setmem contains similar code. The code depends upon architecture, block size and alignment, but always has the same overall structure: @@ -15247,6 +15411,10 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, if (CONST_INT_P (expected_size_exp) && count == 0) expected_size = INTVAL (expected_size_exp); + /* Make sure we don't need to care about overflow later on. */ + if (count > ((unsigned HOST_WIDE_INT) 1 << 30)) + return 0; + /* Step 0: Decide on preferred algorithm, desired alignment and size of chunks to be copied by main loop. 
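As an illustration (not part of the patch): a simplified model of the usability filter added to decide_alg. The enum and function below are invented, not the real GCC types; the point is that rep-prefixed variants need ECX and EDI (plus ESI for memcpy or EAX for memset), so when the user has fixed any of those registers the -Os fast path falls back to a plain loop.

enum strategy { REP_BYTE, REP_DWORD, LOOP_BYTE, LOOP_DWORD };

static enum strategy
pick_size_optimized_strategy (int rep_regs_available, unsigned long count,
                              int count_known)
{
  /* Mirrors the optimize_size branch above: byte variant unless the
     count is a known multiple of four.  */
  if (!count_known || (count & 3))
    return rep_regs_available ? REP_BYTE : LOOP_BYTE;
  return rep_regs_available ? REP_DWORD : LOOP_DWORD;
}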
*/ @@ -15292,12 +15460,7 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, /* Alignment code needs count to be in register. */ if (CONST_INT_P (count_exp) && desired_align > align) - { - enum machine_mode mode = SImode; - if (TARGET_64BIT && (count & ~0xffffffff)) - mode = DImode; - count_exp = force_reg (mode, count_exp); - } + count_exp = force_reg (counter_mode (count_exp), count_exp); gcc_assert (desired_align >= 1 && align >= 1); /* Ensure that alignment prologue won't copy past end of block. */ @@ -15308,29 +15471,48 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, Make sure it is power of 2. */ epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed); - label = gen_label_rtx (); - emit_cmp_and_jump_insns (count_exp, - GEN_INT (epilogue_size_needed), - LTU, 0, counter_mode (count_exp), 1, label); - if (GET_CODE (count_exp) == CONST_INT) - ; - else if (expected_size == -1 || expected_size < epilogue_size_needed) - predict_jump (REG_BR_PROB_BASE * 60 / 100); + if (CONST_INT_P (count_exp)) + { + if (UINTVAL (count_exp) < (unsigned HOST_WIDE_INT)epilogue_size_needed) + goto epilogue; + } else - predict_jump (REG_BR_PROB_BASE * 20 / 100); + { + label = gen_label_rtx (); + emit_cmp_and_jump_insns (count_exp, + GEN_INT (epilogue_size_needed), + LTU, 0, counter_mode (count_exp), 1, label); + if (expected_size == -1 || expected_size < epilogue_size_needed) + predict_jump (REG_BR_PROB_BASE * 60 / 100); + else + predict_jump (REG_BR_PROB_BASE * 20 / 100); + } } + /* Emit code to decide on runtime whether library call or inline should be used. */ if (dynamic_check != -1) { - rtx hot_label = gen_label_rtx (); - jump_around_label = gen_label_rtx (); - emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1), - LEU, 0, GET_MODE (count_exp), 1, hot_label); - predict_jump (REG_BR_PROB_BASE * 90 / 100); - emit_block_move_via_libcall (dst, src, count_exp, false); - emit_jump (jump_around_label); - emit_label (hot_label); + if (CONST_INT_P (count_exp)) + { + if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check) + { + emit_block_move_via_libcall (dst, src, count_exp, false); + count_exp = const0_rtx; + goto epilogue; + } + } + else + { + rtx hot_label = gen_label_rtx (); + jump_around_label = gen_label_rtx (); + emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1), + LEU, 0, GET_MODE (count_exp), 1, hot_label); + predict_jump (REG_BR_PROB_BASE * 90 / 100); + emit_block_move_via_libcall (dst, src, count_exp, false); + emit_jump (jump_around_label); + emit_label (hot_label); + } } /* Step 2: Alignment prologue. */ @@ -15403,7 +15585,7 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, } /* Step 4: Epilogue to copy the remaining bytes. */ - + epilogue: if (label) { /* When the main loop is done, COUNT_EXP might hold original count, @@ -15557,6 +15739,10 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp, if (CONST_INT_P (expected_size_exp) && count == 0) expected_size = INTVAL (expected_size_exp); + /* Make sure we don't need to care about overflow later on. */ + if (count > ((unsigned HOST_WIDE_INT) 1 << 30)) + return 0; + /* Step 0: Decide on preferred algorithm, desired alignment and size of chunks to be copied by main loop. */ @@ -15985,6 +16171,11 @@ ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align) else { rtx unspec; + + /* Can't use this if the user has appropriated eax, ecx, or edi. 
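As an illustration (not part of the patch): a hedged sketch of the constant-count handling added to ix86_expand_movmem, with use_library_call as an invented helper. When the byte count is a compile-time constant, both the "is the block at least as large as the epilogue?" test and the inline-versus-libcall test can be resolved statically, so no runtime compare-and-branch is emitted; only an unknown count needs the branch.

/* Returns 1 for a library call, 0 for an inline copy, and -1 when a
   runtime check must be emitted because the count is not known at
   compile time.  */
static int
use_library_call (unsigned long count, int count_is_constant,
                  unsigned long dynamic_check_threshold)
{
  if (count_is_constant)
    return count >= dynamic_check_threshold;
  return -1;
}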
*/ + if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG]) + return false; + scratch2 = gen_reg_rtx (Pmode); scratch3 = gen_reg_rtx (Pmode); scratch4 = force_reg (Pmode, constm1_rtx); @@ -16057,7 +16248,7 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, if (TARGET_64BIT && INTVAL (callarg2) >= 0) { - rtx al = gen_rtx_REG (QImode, 0); + rtx al = gen_rtx_REG (QImode, AX_REG); emit_move_insn (al, callarg2); use_reg (&use, al); } @@ -16582,15 +16773,18 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) static int ia32_multipass_dfa_lookahead (void) { - if (ix86_tune == PROCESSOR_PENTIUM) - return 2; + switch (ix86_tune) + { + case PROCESSOR_PENTIUM: + return 2; - if (ix86_tune == PROCESSOR_PENTIUMPRO - || ix86_tune == PROCESSOR_K6) - return 1; + case PROCESSOR_PENTIUMPRO: + case PROCESSOR_K6: + return 1; - else - return 0; + default: + return 0; + } } @@ -16603,7 +16797,8 @@ ia32_multipass_dfa_lookahead (void) int ix86_constant_alignment (tree exp, int align) { - if (TREE_CODE (exp) == REAL_CST) + if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST + || TREE_CODE (exp) == INTEGER_CST) { if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64) return 64; @@ -16965,9 +17160,11 @@ enum ix86_builtins IX86_BUILTIN_RCPPS, IX86_BUILTIN_RCPSS, IX86_BUILTIN_RSQRTPS, + IX86_BUILTIN_RSQRTPS_NR, IX86_BUILTIN_RSQRTSS, IX86_BUILTIN_RSQRTF, IX86_BUILTIN_SQRTPS, + IX86_BUILTIN_SQRTPS_NR, IX86_BUILTIN_SQRTSS, IX86_BUILTIN_UNPCKHPS, @@ -17721,7 +17918,7 @@ static const struct builtin_description bdesc_2arg[] = { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE, CODE_FOR_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, 0 }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, 0 }, @@ -17770,11 +17967,11 @@ static const struct builtin_description bdesc_2arg[] = { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, 0 }, { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, 0 }, { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_adddi3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, 0 }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, 0 }, { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, 0 }, { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, 0 }, { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subdi3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, 0 }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, 0 }, { OPTION_MASK_ISA_MMX, 
CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, 0 }, { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, 0 }, @@ -17825,25 +18022,6 @@ static const struct builtin_description bdesc_2arg[] = { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, 0, IX86_BUILTIN_CVTSI2SS, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, 0, IX86_BUILTIN_CVTSI642SS, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, 0, IX86_BUILTIN_PSLLW, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, 0, IX86_BUILTIN_PSLLWI, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, 0, IX86_BUILTIN_PSLLD, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, 0, IX86_BUILTIN_PSLLDI, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashldi3, 0, IX86_BUILTIN_PSLLQ, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashldi3, 0, IX86_BUILTIN_PSLLQI, UNKNOWN, 0 }, - - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, 0, IX86_BUILTIN_PSRLW, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, 0, IX86_BUILTIN_PSRLWI, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, 0, IX86_BUILTIN_PSRLD, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, 0, IX86_BUILTIN_PSRLDI, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrdi3, 0, IX86_BUILTIN_PSRLQ, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrdi3, 0, IX86_BUILTIN_PSRLQI, UNKNOWN, 0 }, - - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, 0, IX86_BUILTIN_PSRAW, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, 0, IX86_BUILTIN_PSRAWI, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, 0, IX86_BUILTIN_PSRAD, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, 0, IX86_BUILTIN_PSRADI, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, 0, IX86_BUILTIN_PSADBW, UNKNOWN, 0 }, { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, 0, IX86_BUILTIN_PMADDWD, UNKNOWN, 0 }, @@ -17952,20 +18130,9 @@ static const struct builtin_description bdesc_2arg[] = { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, 0, IX86_BUILTIN_PSADBW128, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulsidi3, 0, IX86_BUILTIN_PMULUDQ, UNKNOWN, 0 }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, 0, IX86_BUILTIN_PMULUDQ, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, 0, IX86_BUILTIN_PMULUDQ128, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, 0, IX86_BUILTIN_PSLLWI128, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, 0, IX86_BUILTIN_PSLLDI128, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, 0, IX86_BUILTIN_PSLLQI128, UNKNOWN, 0 }, - - { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, 0, IX86_BUILTIN_PSRLWI128, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, 0, IX86_BUILTIN_PSRLDI128, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, 0, IX86_BUILTIN_PSRLQI128, UNKNOWN, 0 }, - - { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, 0, IX86_BUILTIN_PSRAWI128, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, 0, IX86_BUILTIN_PSRADI128, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, 0, IX86_BUILTIN_PMADDWD128, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, 0, IX86_BUILTIN_CVTSI2SD, UNKNOWN, 0 }, @@ -18027,11 +18194,14 @@ static const struct 
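As an illustration (not part of the patch): the table entries above retype the 64-bit MMX operations (paddq, psubq, pmuludq) to V1DImode patterns. A hedged usage sketch through the standard <emmintrin.h> wrappers rather than the raw __builtin_ia32_* names, assuming an SSE2-capable x86 target.

#include <emmintrin.h>

void
mmx_di_demo (__m64 a, __m64 b, __m64 *out)
{
  __m64 sum  = _mm_add_si64 (a, b);   /* paddq   */
  __m64 diff = _mm_sub_si64 (a, b);   /* psubq   */
  __m64 prod = _mm_mul_su32 (a, b);   /* pmuludq: unsigned low 32x32 -> 64 */

  *out = _mm_add_si64 (_mm_add_si64 (sum, diff), prod);
  _mm_empty ();                       /* emms before any x87 code runs */
}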
builtin_description bdesc_2arg[] = static const struct builtin_description bdesc_1arg[] = { + /* SSE */ { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, 0, IX86_BUILTIN_PMOVMSKB, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, 0, IX86_BUILTIN_MOVMSKPS, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, 0, IX86_BUILTIN_SQRTPS, UNKNOWN, 0 }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, 0, IX86_BUILTIN_SQRTPS, UNKNOWN, 0 }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, 0, IX86_BUILTIN_SQRTPS_NR, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, 0, IX86_BUILTIN_RSQRTPS, UNKNOWN, 0 }, + { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, 0, IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, 0, IX86_BUILTIN_RCPPS, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, 0, IX86_BUILTIN_CVTPS2PI, UNKNOWN, 0 }, @@ -18041,6 +18211,7 @@ static const struct builtin_description bdesc_1arg[] = { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, 0, IX86_BUILTIN_CVTTSS2SI, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, 0, IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, 0 }, + /* SSE2 */ { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, 0, IX86_BUILTIN_PMOVMSKB128, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, 0, IX86_BUILTIN_MOVMSKPD, UNKNOWN, 0 }, @@ -18398,6 +18569,8 @@ ix86_init_mmx_sse_builtins (void) tree V16QI_type_node = build_vector_type_for_mode (char_type_node, V16QImode); tree V2SI_type_node = build_vector_type_for_mode (intSI_type_node, V2SImode); + tree V1DI_type_node + = build_vector_type_for_mode (long_long_integer_type_node, V1DImode); tree V2SF_type_node = build_vector_type_for_mode (float_type_node, V2SFmode); tree V2DI_type_node = build_vector_type_for_mode (long_long_integer_type_node, V2DImode); @@ -18462,14 +18635,13 @@ ix86_init_mmx_sse_builtins (void) tree v4hi_ftype_v4hi_int = build_function_type_list (V4HI_type_node, V4HI_type_node, integer_type_node, NULL_TREE); - tree v4hi_ftype_v4hi_di - = build_function_type_list (V4HI_type_node, - V4HI_type_node, long_long_unsigned_type_node, - NULL_TREE); - tree v2si_ftype_v2si_di + tree v2si_ftype_v2si_int = build_function_type_list (V2SI_type_node, - V2SI_type_node, long_long_unsigned_type_node, - NULL_TREE); + V2SI_type_node, integer_type_node, NULL_TREE); + tree v1di_ftype_v1di_int + = build_function_type_list (V1DI_type_node, + V1DI_type_node, integer_type_node, NULL_TREE); + tree void_ftype_void = build_function_type (void_type_node, void_list_node); tree void_ftype_unsigned @@ -18536,10 +18708,9 @@ ix86_init_mmx_sse_builtins (void) tree v2si_ftype_v2si_v2si = build_function_type_list (V2SI_type_node, V2SI_type_node, V2SI_type_node, NULL_TREE); - tree di_ftype_di_di - = build_function_type_list (long_long_unsigned_type_node, - long_long_unsigned_type_node, - long_long_unsigned_type_node, NULL_TREE); + tree v1di_ftype_v1di_v1di + = build_function_type_list (V1DI_type_node, + V1DI_type_node, V1DI_type_node, NULL_TREE); tree di_ftype_di_di_int = build_function_type_list (long_long_unsigned_type_node, @@ -18663,11 +18834,11 @@ ix86_init_mmx_sse_builtins (void) tree v4si_ftype_v8hi_v8hi = build_function_type_list (V4SI_type_node, V8HI_type_node, V8HI_type_node, NULL_TREE); - tree di_ftype_v8qi_v8qi - = build_function_type_list (long_long_unsigned_type_node, + tree v1di_ftype_v8qi_v8qi + = build_function_type_list (V1DI_type_node, V8QI_type_node, V8QI_type_node, NULL_TREE); - tree di_ftype_v2si_v2si - = 
build_function_type_list (long_long_unsigned_type_node, + tree v1di_ftype_v2si_v2si + = build_function_type_list (V1DI_type_node, V2SI_type_node, V2SI_type_node, NULL_TREE); tree v2di_ftype_v16qi_v16qi = build_function_type_list (V2DI_type_node, @@ -18995,8 +19166,8 @@ ix86_init_mmx_sse_builtins (void) case V2SImode: type = v2si_ftype_v2si_v2si; break; - case DImode: - type = di_ftype_di_di; + case V1DImode: + type = v1di_ftype_v1di_v1di; break; default: @@ -19088,16 +19259,25 @@ ix86_init_mmx_sse_builtins (void) /* Add the remaining MMX insns with somewhat more complicated types. */ def_builtin (OPTION_MASK_ISA_MMX, "__builtin_ia32_emms", void_ftype_void, IX86_BUILTIN_EMMS); - def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSLLW); - def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_pslld", v2si_ftype_v2si_di, IX86_BUILTIN_PSLLD); - def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllq", di_ftype_di_di, IX86_BUILTIN_PSLLQ); - - def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSRLW); - def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrld", v2si_ftype_v2si_di, IX86_BUILTIN_PSRLD); - def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlq", di_ftype_di_di, IX86_BUILTIN_PSRLQ); - def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psraw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSRAW); - def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrad", v2si_ftype_v2si_di, IX86_BUILTIN_PSRAD); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllwi", v4hi_ftype_v4hi_int, IX86_BUILTIN_PSLLWI); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_pslldi", v2si_ftype_v2si_int, IX86_BUILTIN_PSLLDI); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllqi", v1di_ftype_v1di_int, IX86_BUILTIN_PSLLQI); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllw", v4hi_ftype_v4hi_v4hi, IX86_BUILTIN_PSLLW); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_pslld", v2si_ftype_v2si_v2si, IX86_BUILTIN_PSLLD); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllq", v1di_ftype_v1di_v1di, IX86_BUILTIN_PSLLQ); + + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlwi", v4hi_ftype_v4hi_int, IX86_BUILTIN_PSRLWI); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrldi", v2si_ftype_v2si_int, IX86_BUILTIN_PSRLDI); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlqi", v1di_ftype_v1di_int, IX86_BUILTIN_PSRLQI); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlw", v4hi_ftype_v4hi_v4hi, IX86_BUILTIN_PSRLW); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrld", v2si_ftype_v2si_v2si, IX86_BUILTIN_PSRLD); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlq", v1di_ftype_v1di_v1di, IX86_BUILTIN_PSRLQ); + + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrawi", v4hi_ftype_v4hi_int, IX86_BUILTIN_PSRAWI); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psradi", v2si_ftype_v2si_int, IX86_BUILTIN_PSRADI); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psraw", v4hi_ftype_v4hi_v4hi, IX86_BUILTIN_PSRAW); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrad", v2si_ftype_v2si_v2si, IX86_BUILTIN_PSRAD); def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, "__builtin_ia32_pshufw", v4hi_ftype_v4hi_int, IX86_BUILTIN_PSHUFW); def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_pmaddwd", v2si_ftype_v4hi_v4hi, IX86_BUILTIN_PMADDWD); @@ -19146,17 
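As an illustration (not part of the patch): the definitions above split each MMX shift into an immediate-count form (the ...i builtins taking an int) and a register-count form (taking a V1DImode count). A hedged usage sketch via the standard <mmintrin.h> wrappers rather than the raw __builtin_ia32_* names.

#include <mmintrin.h>

void
mmx_shift_demo (__m64 x, __m64 count, __m64 *out)
{
  __m64 by_imm = _mm_slli_pi16 (x, 3);     /* psllw with an immediate count      */
  __m64 by_reg = _mm_sll_pi16 (x, count);  /* psllw with the count in a register */

  *out = _mm_add_pi16 (by_imm, by_reg);
  _mm_empty ();                            /* emms before any x87 code runs */
}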
@@ -19146,17 +19326,19 @@ ix86_init_mmx_sse_builtins (void)
   def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, "__builtin_ia32_sfence", void_ftype_void, IX86_BUILTIN_SFENCE);
 
-  def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, "__builtin_ia32_psadbw", di_ftype_v8qi_v8qi, IX86_BUILTIN_PSADBW);
+  def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, "__builtin_ia32_psadbw", v1di_ftype_v8qi_v8qi, IX86_BUILTIN_PSADBW);
 
   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rcpps", v4sf_ftype_v4sf, IX86_BUILTIN_RCPPS);
   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rcpss", v4sf_ftype_v4sf, IX86_BUILTIN_RCPSS);
   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTPS);
+  def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtps_nr", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTPS_NR);
   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTSS);
   ftype = build_function_type_list (float_type_node,
                                     float_type_node,
                                     NULL_TREE);
   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtf", ftype, IX86_BUILTIN_RSQRTF);
 
   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_sqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTPS);
+  def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_sqrtps_nr", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTPS_NR);
   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_sqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTSS);
 
   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_shufps", v4sf_ftype_v4sf_v4sf_int, IX86_BUILTIN_SHUFPS);
@@ -19248,7 +19430,7 @@ ix86_init_mmx_sse_builtins (void)
   def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_loaddqu", v16qi_ftype_pcchar, IX86_BUILTIN_LOADDQU);
   def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_storedqu", void_ftype_pchar_v16qi, IX86_BUILTIN_STOREDQU);
 
-  def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_pmuludq", di_ftype_v2si_v2si, IX86_BUILTIN_PMULUDQ);
+  def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_pmuludq", v1di_ftype_v2si_v2si, IX86_BUILTIN_PMULUDQ);
   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_pmuludq128", v2di_ftype_v4si_v4si, IX86_BUILTIN_PMULUDQ128);
 
   def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_pslldqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSLLDQI128);
@@ -19794,7 +19976,7 @@ ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
       gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
 
-      if (optimize
+      if (optimize || ! (*insn_data[icode].operand[i+adjust+1].predicate) (op, mode)
           || num_memory > 1)
         op = force_reg (mode, op);
 
@@ -20640,80 +20822,71 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
       emit_insn (pat);
       return target;
 
-    case IX86_BUILTIN_PSLLWI128:
-      icode = CODE_FOR_ashlv8hi3;
-      goto do_pshifti;
-    case IX86_BUILTIN_PSLLDI128:
-      icode = CODE_FOR_ashlv4si3;
-      goto do_pshifti;
-    case IX86_BUILTIN_PSLLQI128:
-      icode = CODE_FOR_ashlv2di3;
-      goto do_pshifti;
-    case IX86_BUILTIN_PSRAWI128:
-      icode = CODE_FOR_ashrv8hi3;
-      goto do_pshifti;
-    case IX86_BUILTIN_PSRADI128:
-      icode = CODE_FOR_ashrv4si3;
-      goto do_pshifti;
-    case IX86_BUILTIN_PSRLWI128:
-      icode = CODE_FOR_lshrv8hi3;
-      goto do_pshifti;
-    case IX86_BUILTIN_PSRLDI128:
-      icode = CODE_FOR_lshrv4si3;
-      goto do_pshifti;
-    case IX86_BUILTIN_PSRLQI128:
-      icode = CODE_FOR_lshrv2di3;
-      goto do_pshifti;
-    do_pshifti:
-      arg0 = CALL_EXPR_ARG (exp, 0);
-      arg1 = CALL_EXPR_ARG (exp, 1);
-      op0 = expand_normal (arg0);
-      op1 = expand_normal (arg1);
-
-      if (!CONST_INT_P (op1))
-        {
-          error ("shift must be an immediate");
-          return const0_rtx;
-        }
-      if (INTVAL (op1) < 0 || INTVAL (op1) > 255)
-        op1 = GEN_INT (255);
-
-      tmode = insn_data[icode].operand[0].mode;
-      mode1 = insn_data[icode].operand[1].mode;
-      if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
-        op0 = copy_to_reg (op0);
-
-      target = gen_reg_rtx (tmode);
-      pat = GEN_FCN (icode) (target, op0, op1);
-      if (!pat)
-        return 0;
-      emit_insn (pat);
-      return target;
+    case IX86_BUILTIN_PSLLW:
+    case IX86_BUILTIN_PSLLWI:
+      icode = CODE_FOR_mmx_ashlv4hi3;
+      goto do_pshift;
+    case IX86_BUILTIN_PSLLD:
+    case IX86_BUILTIN_PSLLDI:
+      icode = CODE_FOR_mmx_ashlv2si3;
+      goto do_pshift;
+    case IX86_BUILTIN_PSLLQ:
+    case IX86_BUILTIN_PSLLQI:
+      icode = CODE_FOR_mmx_ashlv1di3;
+      goto do_pshift;
+    case IX86_BUILTIN_PSRAW:
+    case IX86_BUILTIN_PSRAWI:
+      icode = CODE_FOR_mmx_ashrv4hi3;
+      goto do_pshift;
+    case IX86_BUILTIN_PSRAD:
+    case IX86_BUILTIN_PSRADI:
+      icode = CODE_FOR_mmx_ashrv2si3;
+      goto do_pshift;
+    case IX86_BUILTIN_PSRLW:
+    case IX86_BUILTIN_PSRLWI:
+      icode = CODE_FOR_mmx_lshrv4hi3;
+      goto do_pshift;
+    case IX86_BUILTIN_PSRLD:
+    case IX86_BUILTIN_PSRLDI:
+      icode = CODE_FOR_mmx_lshrv2si3;
+      goto do_pshift;
+    case IX86_BUILTIN_PSRLQ:
+    case IX86_BUILTIN_PSRLQI:
+      icode = CODE_FOR_mmx_lshrv1di3;
+      goto do_pshift;
 
     case IX86_BUILTIN_PSLLW128:
+    case IX86_BUILTIN_PSLLWI128:
      icode = CODE_FOR_ashlv8hi3;
      goto do_pshift;
    case IX86_BUILTIN_PSLLD128:
+    case IX86_BUILTIN_PSLLDI128:
      icode = CODE_FOR_ashlv4si3;
      goto do_pshift;
    case IX86_BUILTIN_PSLLQ128:
+    case IX86_BUILTIN_PSLLQI128:
      icode = CODE_FOR_ashlv2di3;
      goto do_pshift;
    case IX86_BUILTIN_PSRAW128:
+    case IX86_BUILTIN_PSRAWI128:
      icode = CODE_FOR_ashrv8hi3;
      goto do_pshift;
    case IX86_BUILTIN_PSRAD128:
+    case IX86_BUILTIN_PSRADI128:
      icode = CODE_FOR_ashrv4si3;
      goto do_pshift;
    case IX86_BUILTIN_PSRLW128:
+    case IX86_BUILTIN_PSRLWI128:
      icode = CODE_FOR_lshrv8hi3;
      goto do_pshift;
    case IX86_BUILTIN_PSRLD128:
+    case IX86_BUILTIN_PSRLDI128:
      icode = CODE_FOR_lshrv4si3;
      goto do_pshift;
    case IX86_BUILTIN_PSRLQ128:
+    case IX86_BUILTIN_PSRLQI128:
      icode = CODE_FOR_lshrv2di3;
-      goto do_pshift;
+
    do_pshift:
      arg0 = CALL_EXPR_ARG (exp, 0);
      arg1 = CALL_EXPR_ARG (exp, 1);
@@ -20726,7 +20899,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
       if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
         op0 = copy_to_reg (op0);
 
-      op1 = simplify_gen_subreg (SImode, op1, GET_MODE (op1), 0);
+      if (!CONST_INT_P (op1))
+        op1 = simplify_gen_subreg (SImode, op1, GET_MODE (op1), 0);
+
       if (! (*insn_data[icode].operand[2].predicate) (op1, SImode))
         op1 = copy_to_reg (op1);
@@ -21213,7 +21388,7 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
     case BUILT_IN_SQRTF:
       if (out_mode == SFmode && out_n == 4
           && in_mode == SFmode && in_n == 4)
-        return ix86_builtins[IX86_BUILTIN_SQRTPS];
+        return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
       break;
 
     case BUILT_IN_LRINT:
@@ -21295,7 +21470,7 @@ ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
           || n != 4)
         return NULL_TREE;
       break;
-      
+
     default:
       return NULL_TREE;
     }
@@ -21375,8 +21550,8 @@ ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
     switch (fn)
       {
         /* Vectorized version of sqrt to rsqrt conversion.  */
-      case IX86_BUILTIN_SQRTPS:
-        return ix86_builtins[IX86_BUILTIN_RSQRTPS];
+      case IX86_BUILTIN_SQRTPS_NR:
+        return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
 
       default:
         return NULL_TREE;
@@ -21872,7 +22047,7 @@ ix86_register_move_cost (enum machine_mode mode, enum reg_class class1,
          where integer modes in MMX/SSE registers are not tieable
          because of missing QImode and HImode moves to, from or between
          MMX/SSE registers.  */
-      return MAX (ix86_cost->mmxsse_to_integer, 8);
+      return MAX (8, ix86_cost->mmxsse_to_integer);
 
   if (MAYBE_FLOAT_CLASS_P (class1))
     return ix86_cost->fp_move;
@@ -22160,7 +22335,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int *total)
             nbits = 7;
 
           /* Compute costs correctly for widening multiplication.  */
-          if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op1) == ZERO_EXTEND)
+          if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
              && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
                 == GET_MODE_SIZE (mode))
            {
@@ -22531,6 +22706,7 @@ x86_this_parameter (tree function)
 {
   tree type = TREE_TYPE (function);
   bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
+  int nregs;
 
   if (TARGET_64BIT)
     {
@@ -22543,11 +22719,25 @@ x86_this_parameter (tree function)
       return gen_rtx_REG (DImode, parm_regs[aggr]);
     }
 
-  if (ix86_function_regparm (type, function) > 0 && !stdarg_p (type))
+  nregs = ix86_function_regparm (type, function);
+
+  if (nregs > 0 && !stdarg_p (type))
     {
-      int regno = 0;
+      int regno;
+
       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type)))
-        regno = 2;
+        regno = aggr ? DX_REG : CX_REG;
+      else
+        {
+          regno = AX_REG;
+          if (aggr)
+            {
+              regno = DX_REG;
+              if (nregs == 1)
+                return gen_rtx_MEM (SImode,
+                                    plus_constant (stack_pointer_rtx, 4));
+            }
+        }
 
       return gen_rtx_REG (SImode, regno);
     }
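In the 32-bit branch of x86_this_parameter above, the hard-coded register numbers 0 and 2 give way to an explicit decision about where the `this' pointer of the thunk's target lives. An illustrative restatement of that decision tree; the enum and helper below are not GCC code, only a summary of the logic added in the hunk:

/* Where `this' arrives for an ia32 thunk, per the change above: fastcall
   uses %ecx, or %edx when the function returns an aggregate in memory;
   regparm functions use %eax, moving to %edx for aggregate returns (the
   hidden return pointer takes the first register), or to the stack slot
   at 4(%esp) when only one register is available.  Illustrative only.  */
enum this_loc { THIS_IN_EAX, THIS_IN_ECX, THIS_IN_EDX, THIS_AT_SP_PLUS_4 };

static enum this_loc
this_param_location (int fastcall, int aggr, int nregs)
{
  if (fastcall)
    return aggr ? THIS_IN_EDX : THIS_IN_ECX;
  if (!aggr)
    return THIS_IN_EAX;
  return nregs == 1 ? THIS_AT_SP_PLUS_4 : THIS_IN_EDX;
}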
@@ -22604,7 +22794,7 @@ x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED,
     {
       /* Put the this parameter into %eax.  */
       xops[0] = this_param;
-      xops[1] = this_reg = gen_rtx_REG (Pmode, 0);
+      xops[1] = this_reg = gen_rtx_REG (Pmode, AX_REG);
       output_asm_insn ("mov{l}\t{%0, %1|%1, %0}", xops);
     }
   else
@@ -22638,10 +22828,10 @@ x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED,
         tmp = gen_rtx_REG (DImode, R10_REG);
       else
         {
-          int tmp_regno = 2 /* ECX */;
+          int tmp_regno = CX_REG;
           if (lookup_attribute ("fastcall",
                                 TYPE_ATTRIBUTES (TREE_TYPE (function))))
-            tmp_regno = 0 /* EAX */;
+            tmp_regno = AX_REG;
           tmp = gen_rtx_REG (SImode, tmp_regno);
         }
@@ -22714,7 +22904,7 @@ x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED,
       else
 #endif /* TARGET_MACHO */
         {
-          tmp = gen_rtx_REG (SImode, 2 /* ECX */);
+          tmp = gen_rtx_REG (SImode, CX_REG);
           output_set_got (tmp, NULL_RTX);
 
           xops[1] = tmp;
@@ -22736,7 +22926,7 @@ x86_file_start (void)
   if (X86_FILE_START_FLTUSED)
     fputs ("\t.global\t__fltused\n", asm_out_file);
   if (ix86_asm_dialect == ASM_INTEL)
-    fputs ("\t.intel_syntax\n", asm_out_file);
+    fputs ("\t.intel_syntax noprefix\n", asm_out_file);
 }
 
 int
@@ -23509,7 +23699,9 @@ ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
   for (i = 0; i < n_elts; ++i)
     {
       x = XVECEXP (vals, 0, i);
-      if (!CONSTANT_P (x))
+      if (!(CONST_INT_P (x)
+            || GET_CODE (x) == CONST_DOUBLE
+            || GET_CODE (x) == CONST_FIXED))
         n_var++, one_var = i;
       else if (x != CONST0_RTX (inner_mode))
         all_const_zero = false;
@@ -24086,7 +24278,7 @@ void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
 
   /* a / b = a * rcp(b) * (2.0 - b * rcp(b)) */
 
-  /* x0 = 1./b estimate */
+  /* x0 = rcp(b) estimate */
   emit_insn (gen_rtx_SET (VOIDmode, x0,
                           gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
                                           UNSPEC_RCP)));
@@ -24110,7 +24302,8 @@ void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
 
 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode, bool recip)
 {
-  rtx x0, e0, e1, e2, e3, three, half, zero, mask;
+  rtx x0, e0, e1, e2, e3, mthree, mhalf;
+  REAL_VALUE_TYPE r;
 
   x0 = gen_reg_rtx (mode);
   e0 = gen_reg_rtx (mode);
@@ -24118,42 +24311,41 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
   e2 = gen_reg_rtx (mode);
   e3 = gen_reg_rtx (mode);
 
-  three = CONST_DOUBLE_FROM_REAL_VALUE (dconst3, SFmode);
-  half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, SFmode);
+  real_arithmetic (&r, NEGATE_EXPR, &dconst3, NULL);
+  mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
 
-  mask = gen_reg_rtx (mode);
+  real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
+  mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
 
   if (VECTOR_MODE_P (mode))
     {
-      three = ix86_build_const_vector (SFmode, true, three);
-      half = ix86_build_const_vector (SFmode, true, half);
+      mthree = ix86_build_const_vector (SFmode, true, mthree);
+      mhalf = ix86_build_const_vector (SFmode, true, mhalf);
     }
 
-  three = force_reg (mode, three);
-  half = force_reg (mode, half);
-
-  zero = force_reg (mode, CONST0_RTX(mode));
+  /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
+     rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
 
-  /* sqrt(a) = 0.5 * a * rsqrtss(a) * (3.0 - a * rsqrtss(a) * rsqrtss(a))
-     1.0 / sqrt(a) = 0.5 * rsqrtss(a) * (3.0 - a * rsqrtss(a) * rsqrtss(a)) */
-
-  /* Compare a to zero.  */
-  emit_insn (gen_rtx_SET (VOIDmode, mask,
-                          gen_rtx_NE (mode, a, zero)));
-
-  /* x0 = 1./sqrt(a) estimate */
+  /* x0 = rsqrt(a) estimate */
   emit_insn (gen_rtx_SET (VOIDmode, x0,
                           gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
                                           UNSPEC_RSQRT)));
 
-  /* Filter out infinity.  */
-  if (VECTOR_MODE_P (mode))
-    emit_insn (gen_rtx_SET (VOIDmode, gen_lowpart (V4SFmode, x0),
-                            gen_rtx_AND (mode,
-                                         gen_lowpart (V4SFmode, x0),
-                                         gen_lowpart (V4SFmode, mask))));
-  else
-    emit_insn (gen_rtx_SET (VOIDmode, x0,
-                            gen_rtx_AND (mode, x0, mask)));
+
+  /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0).  */
+  if (!recip)
+    {
+      rtx zero, mask;
+
+      zero = gen_reg_rtx (mode);
+      mask = gen_reg_rtx (mode);
+
+      zero = force_reg (mode, CONST0_RTX(mode));
+      emit_insn (gen_rtx_SET (VOIDmode, mask,
+                              gen_rtx_NE (mode, zero, a)));
+
+      emit_insn (gen_rtx_SET (VOIDmode, x0,
+                              gen_rtx_AND (mode, x0, mask)));
+    }
 
   /* e0 = x0 * a */
   emit_insn (gen_rtx_SET (VOIDmode, e0,
@@ -24161,17 +24353,21 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
   /* e1 = e0 * x0 */
   emit_insn (gen_rtx_SET (VOIDmode, e1,
                           gen_rtx_MULT (mode, e0, x0)));
-  /* e2 = 3. - e1 */
+
+  /* e2 = e1 - 3. */
+  mthree = force_reg (mode, mthree);
   emit_insn (gen_rtx_SET (VOIDmode, e2,
-                          gen_rtx_MINUS (mode, three, e1)));
+                          gen_rtx_PLUS (mode, e1, mthree)));
+
+  mhalf = force_reg (mode, mhalf);
   if (recip)
-    /* e3 = .5 * x0 */
+    /* e3 = -.5 * x0 */
     emit_insn (gen_rtx_SET (VOIDmode, e3,
-                            gen_rtx_MULT (mode, half, x0)));
+                            gen_rtx_MULT (mode, x0, mhalf)));
   else
-    /* e3 = .5 * e0 */
+    /* e3 = -.5 * e0 */
     emit_insn (gen_rtx_SET (VOIDmode, e3,
-                            gen_rtx_MULT (mode, half, e0)));
+                            gen_rtx_MULT (mode, e0, mhalf)));
   /* ret = e2 * e3 */
   emit_insn (gen_rtx_SET (VOIDmode, res,
                           gen_rtx_MULT (mode, e2, e3)));
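The last two hunks above rewrite the software reciprocal and square-root expansions so the Newton-Raphson step uses negated constants — e2 = e1 + (-3.0) and e3 = x0 * (-0.5) (or e0 * (-0.5) for sqrt) — exactly as the new comments state, and the zero mask that filters out infinity is now emitted only for the non-reciprocal case. A scalar C sketch of the same algebra, for illustration only; the seed functions stand in for the hardware rcpss/rsqrtss estimates and are assumptions, not part of the patch:

#include <stdio.h>
#include <math.h>

/* Stand-ins for the ~12-bit rcpss/rsqrtss hardware estimates (assumption).  */
static float rcp_seed (float b)   { return 1.0f / b * (1.0f + 3e-3f); }
static float rsqrt_seed (float a) { return 1.0f / sqrtf (a) * (1.0f - 3e-3f); }

/* a / b = a * rcp(b) * (2.0 - b * rcp(b)), as in ix86_emit_swdivsf.  */
static float nr_div (float a, float b)
{
  float x0 = rcp_seed (b);              /* x0 = rcp(b) estimate */
  return a * x0 * (2.0f - b * x0);      /* one Newton-Raphson step */
}

/* Mirrors ix86_emit_swsqrtsf with the negated constants mthree/mhalf.  */
static float nr_sqrt (float a, int recip)
{
  float x0 = rsqrt_seed (a);            /* x0 = rsqrt(a) estimate */
  float e0 = x0 * a;                    /* e0 = x0 * a  */
  float e1 = e0 * x0;                   /* e1 = e0 * x0 */
  float e2 = e1 + -3.0f;                /* e2 = e1 - 3. */
  float e3 = (recip ? x0 : e0) * -0.5f; /* e3 = -.5 * x0 (or e0) */
  return e2 * e3;                       /* res = e2 * e3 */
}

int main (void)
{
  printf ("div:   %.7f vs %.7f\n", nr_div (355.0f, 113.0f), 355.0f / 113.0f);
  printf ("sqrt:  %.7f vs %.7f\n", nr_sqrt (2.0f, 0), sqrtf (2.0f));
  printf ("rsqrt: %.7f vs %.7f\n", nr_sqrt (2.0f, 1), 1.0f / sqrtf (2.0f));
  return 0;
}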
@@ -24839,9 +25035,10 @@ ix86_expand_round (rtx operand0, rtx operand1)
 /* Validate whether a SSE5 instruction is valid or not.
    OPERANDS is the array of operands.
    NUM is the number of operands.
-   USES_OC0 is true if the instruction uses OC0 and provides 4 varients.
+   USES_OC0 is true if the instruction uses OC0 and provides 4 variants.
    NUM_MEMORY is the maximum number of memory operands to accept.  */
-bool ix86_sse5_valid_op_p (rtx operands[], rtx insn, int num, bool uses_oc0, int num_memory)
+
+bool
+ix86_sse5_valid_op_p (rtx operands[], rtx insn, int num, bool uses_oc0, int num_memory)
 {
   int mem_mask;
   int mem_count;
@@ -24920,7 +25117,7 @@ bool ix86_sse5_valid_op_p (rtx operands[], rtx insn, int num, bool uses_oc0, int
   else if (num == 4 && num_memory == 2)
     {
       /* If there are two memory operations, we can load one of the memory ops
-         into the destination register.  This is for optimizating the
+         into the destination register.  This is for optimizing the
          multiply/add ops, which the combiner has optimized both the multiply
         and the add insns to have a memory operation.  We have to be careful
         that the destination doesn't overlap with the inputs.  */
@@ -24944,7 +25141,7 @@ bool ix86_sse5_valid_op_p (rtx operands[], rtx insn, int num, bool uses_oc0, int
         /* format, example pmacsdd:
            xmm1, xmm2, xmm3/mem, xmm1
-           
+
            For the integer multiply/add instructions be more restrictive and
            require operands[2] and operands[3] to be the memory operands.  */
       else
@@ -25044,7 +25241,7 @@ static const struct attribute_spec ix86_attribute_table[] =
 };
 
 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
-static int 
+static int
 x86_builtin_vectorization_cost (bool runtime_test)
 {
   /* If the branch of the runtime test is taken - i.e. - the vectorized
@@ -25192,6 +25389,9 @@ x86_builtin_vectorization_cost (bool runtime_test)
 
 #undef TARGET_BUILD_BUILTIN_VA_LIST
 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
 
+#undef TARGET_EXPAND_BUILTIN_VA_START
+#define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
+
 #undef TARGET_MD_ASM_CLOBBERS
 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers