X-Git-Url: http://git.sourceforge.jp/view?a=blobdiff_plain;ds=sidebyside;f=gcc%2Fconfig%2Fi386%2Fi386.c;h=1e9776acabb703d5ca3a3ea07b59b90c83a3552f;hb=219b28500b61b301cef02a5886df802828420bac;hp=b378aaec8dddb39dc836896040f5a995ad09dba0;hpb=8ba8ec53f9dc56ef16f71b2a6315282b45fa6c94;p=pf3gnuchains%2Fgcc-fork.git diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index b378aaec8dd..1e9776acabb 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -1,6 +1,7 @@ /* Subroutines used for code generation on IA-32. Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, - 2002, 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc. + 2002, 2003, 2004, 2005, 2006, 2007, 2008 + Free Software Foundation, Inc. This file is part of GCC. @@ -53,6 +54,7 @@ along with GCC; see the file COPYING3. If not see #include "params.h" static int x86_builtin_vectorization_cost (bool); +static rtx legitimize_dllimport_symbol (rtx, bool); #ifndef CHECK_STACK_LIMIT #define CHECK_STACK_LIMIT (-1) @@ -700,7 +702,7 @@ struct processor_costs k8_cost = { to limit number of prefetches at all, as their execution also takes some time). */ 100, /* number of parallel prefetches */ - 5, /* Branch cost */ + 3, /* Branch cost */ COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ COSTS_N_INSNS (4), /* cost of FMUL instruction. */ COSTS_N_INSNS (19), /* cost of FDIV instruction. */ @@ -724,8 +726,8 @@ struct processor_costs k8_cost = { 2, /* vec_align_load_cost. */ 3, /* vec_unalign_load_cost. */ 3, /* vec_store_cost. */ - 6, /* cond_taken_branch_cost. */ - 1, /* cond_not_taken_branch_cost. */ + 3, /* cond_taken_branch_cost. */ + 2, /* cond_not_taken_branch_cost. */ }; struct processor_costs amdfam10_cost = { @@ -786,7 +788,7 @@ struct processor_costs amdfam10_cost = { to limit number of prefetches at all, as their execution also takes some time). */ 100, /* number of parallel prefetches */ - 5, /* Branch cost */ + 2, /* Branch cost */ COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ COSTS_N_INSNS (4), /* cost of FMUL instruction. */ COSTS_N_INSNS (19), /* cost of FDIV instruction. */ @@ -811,7 +813,7 @@ struct processor_costs amdfam10_cost = { 2, /* vec_align_load_cost. */ 2, /* vec_unalign_load_cost. */ 2, /* vec_store_cost. */ - 6, /* cond_taken_branch_cost. */ + 2, /* cond_taken_branch_cost. */ 1, /* cond_not_taken_branch_cost. */ }; @@ -1767,38 +1769,69 @@ int ix86_isa_flags = TARGET_64BIT_DEFAULT | TARGET_SUBTARGET_ISA_DEFAULT; was set or cleared on the command line. */ static int ix86_isa_flags_explicit; -/* Define a set of ISAs which aren't available for a given ISA. MMX - and SSE ISAs are handled separately. */ +/* Define a set of ISAs which are available when a given ISA is + enabled. MMX and SSE ISAs are handled separately. */ + +#define OPTION_MASK_ISA_MMX_SET OPTION_MASK_ISA_MMX +#define OPTION_MASK_ISA_3DNOW_SET \ + (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_MMX_SET) + +#define OPTION_MASK_ISA_SSE_SET OPTION_MASK_ISA_SSE +#define OPTION_MASK_ISA_SSE2_SET \ + (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE_SET) +#define OPTION_MASK_ISA_SSE3_SET \ + (OPTION_MASK_ISA_SSE3 | OPTION_MASK_ISA_SSE2_SET) +#define OPTION_MASK_ISA_SSSE3_SET \ + (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_SSE3_SET) +#define OPTION_MASK_ISA_SSE4_1_SET \ + (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSSE3_SET) +#define OPTION_MASK_ISA_SSE4_2_SET \ + (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_SSE4_1_SET) + +/* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same + as -msse4.2. 
*/
+#define OPTION_MASK_ISA_SSE4_SET OPTION_MASK_ISA_SSE4_2_SET
+
+#define OPTION_MASK_ISA_SSE4A_SET \
+ (OPTION_MASK_ISA_SSE4A | OPTION_MASK_ISA_SSE3_SET)
+#define OPTION_MASK_ISA_SSE5_SET \
+ (OPTION_MASK_ISA_SSE5 | OPTION_MASK_ISA_SSE4A_SET)
+
+/* Define a set of ISAs which aren't available when a given ISA is
+ disabled. MMX and SSE ISAs are handled separately. */
 
 #define OPTION_MASK_ISA_MMX_UNSET \
- (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_3DNOW_UNSET)
-#define OPTION_MASK_ISA_3DNOW_UNSET OPTION_MASK_ISA_3DNOW_A
+ (OPTION_MASK_ISA_MMX | OPTION_MASK_ISA_3DNOW_UNSET)
+#define OPTION_MASK_ISA_3DNOW_UNSET \
+ (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_3DNOW_A_UNSET)
+#define OPTION_MASK_ISA_3DNOW_A_UNSET OPTION_MASK_ISA_3DNOW_A
 
 #define OPTION_MASK_ISA_SSE_UNSET \
- (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE2_UNSET)
+ (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_SSE2_UNSET)
 #define OPTION_MASK_ISA_SSE2_UNSET \
- (OPTION_MASK_ISA_SSE3 | OPTION_MASK_ISA_SSE3_UNSET)
+ (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE3_UNSET)
 #define OPTION_MASK_ISA_SSE3_UNSET \
- (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_SSSE3_UNSET)
+ (OPTION_MASK_ISA_SSE3 \
+ | OPTION_MASK_ISA_SSSE3_UNSET \
+ | OPTION_MASK_ISA_SSE4A_UNSET)
 #define OPTION_MASK_ISA_SSSE3_UNSET \
- (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSE4_1_UNSET)
+ (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_SSE4_1_UNSET)
 #define OPTION_MASK_ISA_SSE4_1_UNSET \
- (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_SSE4_2_UNSET)
-#define OPTION_MASK_ISA_SSE4_2_UNSET OPTION_MASK_ISA_SSE4A
+ (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSE4_2_UNSET)
+#define OPTION_MASK_ISA_SSE4_2_UNSET OPTION_MASK_ISA_SSE4_2
 
-/* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
- as -msse4.1 -msse4.2. -mno-sse4 should the same as -mno-sse4.1. */
-#define OPTION_MASK_ISA_SSE4 \
- (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSE4_2)
+/* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should be the same
+ as -mno-sse4.1. */
 #define OPTION_MASK_ISA_SSE4_UNSET OPTION_MASK_ISA_SSE4_1_UNSET
 
-#define OPTION_MASK_ISA_SSE4A_UNSET OPTION_MASK_ISA_SSE4
+#define OPTION_MASK_ISA_SSE4A_UNSET \
+ (OPTION_MASK_ISA_SSE4A | OPTION_MASK_ISA_SSE5_UNSET)
 
-#define OPTION_MASK_ISA_SSE5_UNSET \
- (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_3DNOW_UNSET)
+#define OPTION_MASK_ISA_SSE5_UNSET OPTION_MASK_ISA_SSE5
 
 /* Vectorization library interface and handlers. */
 tree (*ix86_veclib_handler)(enum built_in_function, tree, tree) = NULL;
+static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
 
 /* Implement TARGET_HANDLE_OPTION. 
*/ @@ -1809,8 +1842,12 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) switch (code) { case OPT_mmmx: - ix86_isa_flags_explicit |= OPTION_MASK_ISA_MMX; - if (!value) + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_MMX_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_MMX_SET; + } + else { ix86_isa_flags &= ~OPTION_MASK_ISA_MMX_UNSET; ix86_isa_flags_explicit |= OPTION_MASK_ISA_MMX_UNSET; @@ -1818,8 +1855,12 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) return true; case OPT_m3dnow: - ix86_isa_flags_explicit |= OPTION_MASK_ISA_3DNOW; - if (!value) + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_3DNOW_SET; + } + else { ix86_isa_flags &= ~OPTION_MASK_ISA_3DNOW_UNSET; ix86_isa_flags_explicit |= OPTION_MASK_ISA_3DNOW_UNSET; @@ -1830,8 +1871,12 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) return false; case OPT_msse: - ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE; - if (!value) + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_SSE_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE_SET; + } + else { ix86_isa_flags &= ~OPTION_MASK_ISA_SSE_UNSET; ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE_UNSET; @@ -1839,8 +1884,12 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) return true; case OPT_msse2: - ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2; - if (!value) + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_SSE2_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2_SET; + } + else { ix86_isa_flags &= ~OPTION_MASK_ISA_SSE2_UNSET; ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2_UNSET; @@ -1848,8 +1897,12 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) return true; case OPT_msse3: - ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE3; - if (!value) + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_SSE3_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE3_SET; + } + else { ix86_isa_flags &= ~OPTION_MASK_ISA_SSE3_UNSET; ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE3_UNSET; @@ -1857,8 +1910,12 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) return true; case OPT_mssse3: - ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSSE3; - if (!value) + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_SSSE3_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSSE3_SET; + } + else { ix86_isa_flags &= ~OPTION_MASK_ISA_SSSE3_UNSET; ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSSE3_UNSET; @@ -1866,8 +1923,12 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) return true; case OPT_msse4_1: - ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_1; - if (!value) + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_1_SET; + } + else { ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_1_UNSET; ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_1_UNSET; @@ -1875,8 +1936,12 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) return true; case OPT_msse4_2: - ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_2; - if (!value) + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_2_SET; + } + else { ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_2_UNSET; ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_2_UNSET; @@ -1884,8 +1949,8 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) 
return true; case OPT_msse4: - ix86_isa_flags |= OPTION_MASK_ISA_SSE4; - ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4; + ix86_isa_flags |= OPTION_MASK_ISA_SSE4_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_SET; return true; case OPT_mno_sse4: @@ -1894,8 +1959,12 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) return true; case OPT_msse4a: - ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4A; - if (!value) + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_SSE4A_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4A_SET; + } + else { ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4A_UNSET; ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4A_UNSET; @@ -1903,8 +1972,12 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) return true; case OPT_msse5: - ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE5; - if (!value) + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_SSE5_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE5_SET; + } + else { ix86_isa_flags &= ~OPTION_MASK_ISA_SSE5_UNSET; ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE5_UNSET; @@ -2006,7 +2079,9 @@ override_options (void) PTA_NO_SAHF = 1 << 13, PTA_SSE4_1 = 1 << 14, PTA_SSE4_2 = 1 << 15, - PTA_SSE5 = 1 << 16 + PTA_SSE5 = 1 << 16, + PTA_AES = 1 << 17, + PTA_PCLMUL = 1 << 18 }; static struct pta @@ -2313,6 +2388,10 @@ override_options (void) x86_prefetch_sse = true; if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))) x86_sahf = true; + if (processor_alias_table[i].flags & PTA_AES) + x86_aes = true; + if (processor_alias_table[i].flags & PTA_PCLMUL) + x86_pclmul = true; break; } @@ -2356,6 +2435,14 @@ override_options (void) if (i == pta_size) error ("bad value (%s) for -mtune= switch", ix86_tune_string); + /* Enable SSE2 if AES or PCLMUL is enabled. */ + if ((x86_aes || x86_pclmul) + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2)) + { + ix86_isa_flags |= OPTION_MASK_ISA_SSE2_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2_SET; + } + ix86_tune_mask = 1u << ix86_tune; for (i = 0; i < X86_TUNE_LAST; ++i) ix86_tune_features[i] &= ix86_tune_mask; @@ -2529,34 +2616,6 @@ override_options (void) if (!TARGET_80387) target_flags |= MASK_NO_FANCY_MATH_387; - /* Turn on SSE4A bultins for -msse5. */ - if (TARGET_SSE5) - ix86_isa_flags |= OPTION_MASK_ISA_SSE4A; - - /* Turn on SSE4.1 builtins for -msse4.2. */ - if (TARGET_SSE4_2) - ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1; - - /* Turn on SSSE3 builtins for -msse4.1. */ - if (TARGET_SSE4_1) - ix86_isa_flags |= OPTION_MASK_ISA_SSSE3; - - /* Turn on SSE3 builtins for -mssse3. */ - if (TARGET_SSSE3) - ix86_isa_flags |= OPTION_MASK_ISA_SSE3; - - /* Turn on SSE3 builtins for -msse4a. */ - if (TARGET_SSE4A) - ix86_isa_flags |= OPTION_MASK_ISA_SSE3; - - /* Turn on SSE2 builtins for -msse3. */ - if (TARGET_SSE3) - ix86_isa_flags |= OPTION_MASK_ISA_SSE2; - - /* Turn on SSE builtins for -msse2. */ - if (TARGET_SSE2) - ix86_isa_flags |= OPTION_MASK_ISA_SSE; - /* Turn on MMX builtins for -msse. */ if (TARGET_SSE) { @@ -2564,10 +2623,6 @@ override_options (void) x86_prefetch_sse = true; } - /* Turn on MMX builtins for 3Dnow. */ - if (TARGET_3DNOW) - ix86_isa_flags |= OPTION_MASK_ISA_MMX; - /* Turn on popcnt instruction for -msse4.2 or -mabm. */ if (TARGET_SSE4_2 || TARGET_ABM) x86_popcnt = true; @@ -2634,7 +2689,9 @@ override_options (void) /* Use external vectorized library in vectorizing intrinsics. 
*/ if (ix86_veclibabi_string) { - if (strcmp (ix86_veclibabi_string, "acml") == 0) + if (strcmp (ix86_veclibabi_string, "svml") == 0) + ix86_veclib_handler = ix86_veclibabi_svml; + else if (strcmp (ix86_veclibabi_string, "acml") == 0) ix86_veclib_handler = ix86_veclibabi_acml; else error ("unknown vectorization library ABI type (%s) for " @@ -2661,6 +2718,18 @@ override_options (void) target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; } + /* If stack probes are required, the space used for large function + arguments on the stack must also be probed, so enable + -maccumulate-outgoing-args so this happens in the prologue. */ + if (TARGET_STACK_PROBE + && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS)) + { + if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS) + warning (0, "stack probing requires -maccumulate-outgoing-args " + "for correctness"); + target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; + } + /* For sane SSE instruction set generation we need fcomi instruction. It is safe to enable all CMOVE instructions. */ if (TARGET_SSE) @@ -2689,6 +2758,11 @@ override_options (void) set_param_value ("l1-cache-size", ix86_cost->l1_cache_size); if (!PARAM_SET_P (PARAM_L2_CACHE_SIZE)) set_param_value ("l2-cache-size", ix86_cost->l2_cache_size); + + /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0) + can be optimized to ap = __builtin_next_arg (0). */ + if (!TARGET_64BIT || TARGET_64BIT_MS_ABI) + targetm.expand_builtin_va_start = NULL; } /* Return true if this goes in large data/bss. */ @@ -2928,6 +3002,7 @@ optimization_options (int level, int size ATTRIBUTE_UNUSED) flag_omit_frame_pointer = 2; flag_pcc_struct_return = 2; flag_asynchronous_unwind_tables = 2; + flag_vect_cost_model = 1; #ifdef SUBTARGET_OPTIMIZATION_OPTIONS SUBTARGET_OPTIMIZATION_OPTIONS; #endif @@ -3176,12 +3251,33 @@ ix86_function_regparm (const_tree type, const_tree decl) tree attr; int regparm = ix86_regparm; + static bool error_issued; + if (TARGET_64BIT) return regparm; attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type)); if (attr) - return TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))); + { + regparm + = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))); + + if (decl && TREE_CODE (decl) == FUNCTION_DECL) + { + /* We can't use regparm(3) for nested functions because + these pass static chain pointer in %ecx register. */ + if (!error_issued && regparm == 3 + && decl_function_context (decl) + && !DECL_NO_STATIC_CHAIN (decl)) + { + error ("nested functions are limited to 2 register parameters"); + error_issued = true; + return 0; + } + } + + return regparm; + } if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type))) return 2; @@ -3198,9 +3294,9 @@ ix86_function_regparm (const_tree type, const_tree decl) struct function *f; /* Make sure no regparm register is taken by a - global register variable. */ - for (local_regparm = 0; local_regparm < 3; local_regparm++) - if (global_regs[local_regparm]) + fixed register variable. */ + for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++) + if (fixed_regs[local_regparm]) break; /* We can't use regparm(3) for nested functions as these use @@ -3222,12 +3318,14 @@ ix86_function_regparm (const_tree type, const_tree decl) TYPE_ATTRIBUTES (TREE_TYPE (decl))))) local_regparm = 2; - /* Each global register variable increases register preassure, - so the more global reg vars there are, the smaller regparm - optimization use, unless requested by the user explicitly. 
*/
- for (regno = 0; regno < 6; regno++)
- if (global_regs[regno])
+ /* Each fixed register usage increases register pressure,
+ so fewer registers should be used for argument passing.
+ This functionality can be overridden by an explicit
+ regparm value. */
+ for (regno = 0; regno <= DI_REG; regno++)
+ if (fixed_regs[regno])
 globals++;
+
 local_regparm
 = globals < local_regparm ? local_regparm - globals : 0;
 
@@ -3245,7 +3343,7 @@ ix86_function_regparm (const_tree type, const_tree decl)
 indirectly or considering a libcall. Otherwise return 0. */
 
 static int
-ix86_function_sseregparm (const_tree type, const_tree decl)
+ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
 {
 gcc_assert (!TARGET_64BIT);
 
@@ -3256,12 +3354,15 @@ ix86_function_sseregparm (const_tree type, const_tree decl)
 {
 if (!TARGET_SSE)
 {
- if (decl)
- error ("Calling %qD with attribute sseregparm without "
- "SSE/SSE2 enabled", decl);
- else
- error ("Calling %qT with attribute sseregparm without "
- "SSE/SSE2 enabled", type);
+ if (warn)
+ {
+ if (decl)
+ error ("Calling %qD with attribute sseregparm without "
+ "SSE/SSE2 enabled", decl);
+ else
+ error ("Calling %qT with attribute sseregparm without "
+ "SSE/SSE2 enabled", type);
+ }
 return 0;
 }
 
@@ -3423,6 +3524,7 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
 rtx libname, /* SYMBOL_REF of library name or 0 */
 tree fndecl)
 {
+ struct cgraph_local_info *i = fndecl ? cgraph_local_info (fndecl) : NULL;
 memset (cum, 0, sizeof (*cum));
 
 /* Set up the number of registers to use for passing arguments. */
@@ -3433,6 +3535,15 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
 cum->mmx_nregs = MMX_REGPARM_MAX;
 cum->warn_sse = true;
 cum->warn_mmx = true;
+
+ /* Because the type might mismatch between caller and callee, we need
+ to use the actual type of the function for local calls.
+ FIXME: cgraph_analyze can be told to actually record whether the
+ function uses va_start, so for local functions maybe_vaarg can be
+ made aggressive, helping K&R code.
+ FIXME: once the type system is fixed, we won't need this code anymore. */
+ if (i && i->local)
+ fntype = TREE_TYPE (fndecl);
 cum->maybe_vaarg = (fntype
 ? (!prototype_p (fntype) || stdarg_p (fntype))
 : !libname);
@@ -3441,7 +3552,7 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
 {
 /* If there are variable arguments, then we won't pass anything
 in registers in 32-bit mode. */
- if (cum->maybe_vaarg)
+ if (stdarg_p (fntype))
 {
 cum->nregs = 0;
 cum->sse_nregs = 0;
@@ -3466,7 +3577,7 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
 /* Set up the number of SSE registers used for passing SFmode
 and DFmode arguments. Warn for mismatching ABI. 
*/
- cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl);
+ cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
 }
 }
 
@@ -3766,7 +3877,7 @@ classify_argument (enum machine_mode mode, const_tree type,
 }
 
 /* for V1xx modes, just use the base mode */
- if (VECTOR_MODE_P (mode)
+ if (VECTOR_MODE_P (mode) && mode != V1DImode
 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
 mode = GET_MODE_INNER (mode);
 
@@ -3838,6 +3949,7 @@ classify_argument (enum machine_mode mode, const_tree type,
 classes[0] = X86_64_SSE_CLASS;
 classes[1] = X86_64_SSEUP_CLASS;
 return 2;
+ case V1DImode:
 case V2SFmode:
 case V2SImode:
 case V4HImode:
@@ -4139,6 +4251,7 @@ function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
 case V4HImode:
 case V2SImode:
 case V2SFmode:
+ case V1DImode:
 if (!type || !AGGREGATE_TYPE_P (type))
 {
 cum->mmx_words += words;
@@ -4253,10 +4366,13 @@ function_arg_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
 int regno = cum->regno;
 
 /* Fastcall allocates the first two DWORD (SImode) or
- smaller arguments to ECX and EDX. */
+ smaller arguments to ECX and EDX if it isn't an
+ aggregate type. */
 if (cum->fastcall)
 {
- if (mode == BLKmode || mode == DImode)
+ if (mode == BLKmode
+ || mode == DImode
+ || (type && AGGREGATE_TYPE_P (type)))
 break;
 
 /* ECX not EAX is the first allocated register. */
@@ -4299,6 +4415,7 @@ function_arg_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
 case V4HImode:
 case V2SImode:
 case V2SFmode:
+ case V1DImode:
 if (!type || !AGGREGATE_TYPE_P (type))
 {
 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
@@ -4338,7 +4455,8 @@ function_arg_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
 
 static rtx
 function_arg_ms_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
- enum machine_mode orig_mode, int named)
+ enum machine_mode orig_mode, int named,
+ HOST_WIDE_INT bytes)
 {
 unsigned int regno;
 
@@ -4370,6 +4488,14 @@ function_arg_ms_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
 }
 }
+ /* Handle aggregate types passed in registers. */
+ if (orig_mode == BLKmode)
+ {
+ if (bytes > 0 && bytes <= 8)
+ mode = (bytes > 4 ? DImode : SImode);
+ if (mode == BLKmode)
+ mode = DImode;
+ }
 
 return gen_reg_or_parallel (mode, orig_mode, regno);
 }
@@ -4393,7 +4519,7 @@ function_arg (CUMULATIVE_ARGS *cum, enum machine_mode omode,
 mode = type_natural_mode (type);
 
 if (TARGET_64BIT_MS_ABI)
- return function_arg_ms_64 (cum, mode, omode, named);
+ return function_arg_ms_64 (cum, mode, omode, named, bytes);
 else if (TARGET_64BIT)
 return function_arg_64 (cum, mode, omode, type);
 else
@@ -4411,8 +4537,10 @@ ix86_pass_by_reference (CUMULATIVE_ARGS *cum ATTRIBUTE_UNUSED,
 enum machine_mode mode ATTRIBUTE_UNUSED,
 const_tree type, bool named ATTRIBUTE_UNUSED)
 {
+ /* See Windows x64 Software Convention. */
 if (TARGET_64BIT_MS_ABI)
 {
+ int msize = (int) GET_MODE_SIZE (mode);
 if (type)
 {
 /* Arrays are passed by reference. */
@@ -4423,16 +4551,17 @@ ix86_pass_by_reference (CUMULATIVE_ARGS *cum ATTRIBUTE_UNUSED,
 {
 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
 are passed by reference. */
- int el2 = exact_log2 (int_size_in_bytes (type));
- return !(el2 >= 0 && el2 <= 3);
+ msize = int_size_in_bytes (type);
 }
 }
 
 /* __m128 is passed by reference. */
- /* ??? How to handle complex? For now treat them as structs,
- and pass them by reference if they're too large. 
*/ - if (GET_MODE_SIZE (mode) > 8) - return true; + switch (msize) { + case 1: case 2: case 4: case 8: + break; + default: + return true; + } } else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1) return 1; @@ -4499,7 +4628,8 @@ ix86_function_arg_boundary (enum machine_mode mode, tree type) align = GET_MODE_ALIGNMENT (mode); if (align < PARM_BOUNDARY) align = PARM_BOUNDARY; - if (!TARGET_64BIT) + /* Decimal floating point is aligned to its natural boundary. */ + if (!TARGET_64BIT && !VALID_DFP_MODE_P (mode)) { /* i386 ABI defines all arguments to be 4 byte aligned. We have to make an exception for SSE modes since these require 128bit @@ -4521,8 +4651,8 @@ ix86_function_arg_boundary (enum machine_mode mode, tree type) align = PARM_BOUNDARY; } } - if (align > 128) - align = 128; + if (align > BIGGEST_ALIGNMENT) + align = BIGGEST_ALIGNMENT; return align; } @@ -4588,7 +4718,7 @@ function_value_32 (enum machine_mode orig_mode, enum machine_mode mode, SSE math is enabled or for functions with sseregparm attribute. */ if ((fn || fntype) && (mode == SFmode || mode == DFmode)) { - int sse_level = ix86_function_sseregparm (fntype, fn); + int sse_level = ix86_function_sseregparm (fntype, fn, false); if ((sse_level >= 1 && mode == SFmode) || (sse_level == 2 && mode == DFmode)) regno = FIRST_SSE_REG; @@ -4646,12 +4776,22 @@ function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode) if (TARGET_SSE) { - if (mode == SFmode || mode == DFmode) - regno = FIRST_SSE_REG; - else if (VECTOR_MODE_P (mode) || GET_MODE_SIZE (mode) == 16) - regno = FIRST_SSE_REG; + switch (GET_MODE_SIZE (mode)) + { + case 16: + if((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode)) + && !COMPLEX_MODE_P (mode)) + regno = FIRST_SSE_REG; + break; + case 8: + case 4: + if (mode == SFmode || mode == DFmode) + regno = FIRST_SSE_REG; + break; + default: + break; + } } - return gen_rtx_REG (orig_mode, regno); } @@ -4745,13 +4885,13 @@ return_in_memory_ms_64 (const_tree type, enum machine_mode mode) { HOST_WIDE_INT size = int_size_in_bytes (type); - /* __m128 and friends are returned in xmm0. */ - if (!COMPLEX_MODE_P (mode) && size == 16 && VECTOR_MODE_P (mode)) + /* __m128 is returned in xmm0. */ + if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode)) + && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16)) return 0; - /* Otherwise, the size must be exactly in [1248]. But not for complex. */ - return (size != 1 && size != 2 && size != 4 && size != 8) - || COMPLEX_MODE_P (mode); + /* Otherwise, the size must be exactly in [1248]. */ + return (size != 1 && size != 2 && size != 4 && size != 8); } int @@ -4919,8 +5059,8 @@ setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum) We also may end up assuming that only 64bit values are stored in SSE register let some floating point program work. */ - if (ix86_preferred_stack_boundary >= 128) - cfun->stack_alignment_needed = 128; + if (ix86_preferred_stack_boundary >= BIGGEST_ALIGNMENT) + crtl->stack_alignment_needed = BIGGEST_ALIGNMENT; save_area = frame_pointer_rtx; set = get_varargs_alias_set (); @@ -5037,7 +5177,7 @@ ix86_setup_incoming_varargs (CUMULATIVE_ARGS *cum, enum machine_mode mode, /* Implement va_start. */ -void +static void ix86_va_start (tree valist, rtx nextarg) { HOST_WIDE_INT words, n_gpr, n_fpr; @@ -5064,9 +5204,9 @@ ix86_va_start (tree valist, rtx nextarg) sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE); /* Count number of gp and fp argument registers used. 
*/ - words = current_function_args_info.words; - n_gpr = current_function_args_info.regno; - n_fpr = current_function_args_info.sse_regno; + words = crtl->args.info.words; + n_gpr = crtl->args.info.regno; + n_fpr = crtl->args.info.sse_regno; if (cfun->va_list_gpr_size) { @@ -5610,8 +5750,8 @@ ix86_can_use_return_insn_p (void) /* Don't allow more than 32 pop, since that's all we can do with one instruction. */ - if (current_function_pops_args - && current_function_args_size >= 32768) + if (crtl->args.pops_args + && crtl->args.size >= 32768) return 0; ix86_compute_frame_layout (&frame); @@ -5643,7 +5783,7 @@ ix86_frame_pointer_required (void) || ix86_current_function_calls_tls_descriptor)) return 1; - if (current_function_profile) + if (crtl->profile) return 1; return 0; @@ -5735,11 +5875,20 @@ ix86_file_end (void) switch_to_section (text_section); ASM_OUTPUT_LABEL (asm_out_file, name); } - - xops[0] = gen_rtx_REG (SImode, regno); - xops[1] = gen_rtx_MEM (SImode, stack_pointer_rtx); - output_asm_insn ("mov{l}\t{%1, %0|%0, %1}", xops); - output_asm_insn ("ret", xops); + if (TARGET_64BIT_MS_ABI) + { + xops[0] = gen_rtx_REG (Pmode, regno); + xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx); + output_asm_insn ("mov{q}\t{%1, %0|%0, %1}", xops); + output_asm_insn ("ret", xops); + } + else + { + xops[0] = gen_rtx_REG (SImode, regno); + xops[1] = gen_rtx_MEM (SImode, stack_pointer_rtx); + output_asm_insn ("mov{l}\t{%1, %0|%0, %1}", xops); + output_asm_insn ("ret", xops); + } } if (NEED_INDICATE_EXEC_STACK) @@ -5844,7 +5993,7 @@ gen_push (rtx arg) static unsigned int ix86_select_alt_pic_regnum (void) { - if (current_function_is_leaf && !current_function_profile + if (current_function_is_leaf && !crtl->profile && !ix86_current_function_calls_tls_descriptor) { int i; @@ -5863,16 +6012,16 @@ ix86_save_reg (unsigned int regno, int maybe_eh_return) if (pic_offset_table_rtx && regno == REAL_PIC_OFFSET_TABLE_REGNUM && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM) - || current_function_profile - || current_function_calls_eh_return - || current_function_uses_const_pool)) + || crtl->profile + || crtl->calls_eh_return + || crtl->uses_const_pool)) { if (ix86_select_alt_pic_regnum () != INVALID_REGNUM) return 0; return 1; } - if (current_function_calls_eh_return && maybe_eh_return) + if (crtl->calls_eh_return && maybe_eh_return) { unsigned i; for (i = 0; ; i++) @@ -5949,8 +6098,8 @@ ix86_compute_frame_layout (struct ix86_frame *frame) frame->nregs = ix86_nsaved_regs (); total_size = size; - stack_alignment_needed = cfun->stack_alignment_needed / BITS_PER_UNIT; - preferred_alignment = cfun->preferred_stack_boundary / BITS_PER_UNIT; + stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT; + preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT; /* During reload iteration the amount of registers saved can change. Recompute the value as needed. Do not recompute when amount of registers @@ -6033,21 +6182,21 @@ ix86_compute_frame_layout (struct ix86_frame *frame) /* Add outgoing arguments area. Can be skipped if we eliminated all the function calls as dead code. Skipping is however impossible when function calls alloca. Alloca - expander assumes that last current_function_outgoing_args_size + expander assumes that last crtl->outgoing_args_size of stack frame are unused. 
*/
 if (ACCUMULATE_OUTGOING_ARGS
- && (!current_function_is_leaf || current_function_calls_alloca
+ && (!current_function_is_leaf || cfun->calls_alloca
 || ix86_current_function_calls_tls_descriptor))
 {
- offset += current_function_outgoing_args_size;
- frame->outgoing_arguments_size = current_function_outgoing_args_size;
+ offset += crtl->outgoing_args_size;
+ frame->outgoing_arguments_size = crtl->outgoing_args_size;
 }
 else
 frame->outgoing_arguments_size = 0;
 
 /* Align stack boundary. Only needed if we're calling another function
 or using alloca. */
- if (!current_function_is_leaf || current_function_calls_alloca
+ if (!current_function_is_leaf || cfun->calls_alloca
 || ix86_current_function_calls_tls_descriptor)
 frame->padding2 = ((offset + preferred_alignment - 1)
 & -preferred_alignment) - offset;
@@ -6097,7 +6246,7 @@ ix86_compute_frame_layout (struct ix86_frame *frame)
 (long)frame->hard_frame_pointer_offset);
 fprintf (stderr, "stack_pointer_offset: %ld\n", (long)frame->stack_pointer_offset);
 fprintf (stderr, "current_function_is_leaf: %ld\n", (long)current_function_is_leaf);
- fprintf (stderr, "current_function_calls_alloca: %ld\n", (long)current_function_calls_alloca);
+ fprintf (stderr, "cfun->calls_alloca: %ld\n", (long)cfun->calls_alloca);
 fprintf (stderr, "x86_current_function_calls_tls_descriptor: %ld\n", (long)ix86_current_function_calls_tls_descriptor);
 #endif
 }
@@ -6307,8 +6456,12 @@ ix86_expand_prologue (void)
 allocate += frame.nregs * UNITS_PER_WORD;
 
 /* When using red zone we may start register saving before allocating
- the stack frame saving one cycle of the prologue. */
- if (TARGET_RED_ZONE && frame.save_regs_using_mov)
+ the stack frame, saving one cycle of the prologue. However,
+ avoid doing this if we have to probe the stack, since at least
+ on x86_64 the stack probe can turn into a call that clobbers
+ a red zone location. */
+ if (TARGET_RED_ZONE && frame.save_regs_using_mov
+ && (! TARGET_STACK_PROBE || allocate < CHECK_STACK_LIMIT))
 ix86_emit_save_regs_using_mov (frame_pointer_needed
 ? hard_frame_pointer_rtx
 : stack_pointer_rtx,
 -frame.nregs * UNITS_PER_WORD);
@@ -6364,7 +6517,9 @@ ix86_expand_prologue (void)
 }
 }
 
- if (frame.save_regs_using_mov && !TARGET_RED_ZONE)
+ if (frame.save_regs_using_mov
+ && !(TARGET_RED_ZONE
+ && (! TARGET_STACK_PROBE || allocate < CHECK_STACK_LIMIT)))
 {
 if (!frame_pointer_needed || !frame.to_allocate)
 ix86_emit_save_regs_using_mov (stack_pointer_rtx, frame.to_allocate);
@@ -6376,7 +6531,7 @@ ix86_expand_prologue (void)
 pic_reg_used = false;
 if (pic_offset_table_rtx
 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
- || current_function_profile))
+ || crtl->profile))
 {
 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
 
@@ -6411,7 +6566,7 @@ ix86_expand_prologue (void)
 /* Prevent function calls from being scheduled before the call
 to mcount. In the pic_reg_used case, make sure that the got
 load isn't deleted. */
- if (current_function_profile)
+ if (crtl->profile)
 {
 if (pic_reg_used)
 emit_insn (gen_prologue_use (pic_offset_table_rtx));
@@ -6466,7 +6621,7 @@ ix86_expand_epilogue (int style)
 eh_return: the eax and edx registers are marked as saved, but not
 restored along this path. 
*/ offset = frame.nregs; - if (current_function_calls_eh_return && style != 2) + if (crtl->calls_eh_return && style != 2) offset -= 2; offset *= -UNITS_PER_WORD; @@ -6488,7 +6643,7 @@ ix86_expand_epilogue (int style) || (frame_pointer_needed && TARGET_USE_LEAVE && cfun->machine->use_fast_prologue_epilogue && frame.nregs == 1) - || current_function_calls_eh_return) + || crtl->calls_eh_return) { /* Restore registers. We can use ebp or esp to address the memory locations. If both are available, default to ebp, since offsets @@ -6595,15 +6750,15 @@ ix86_expand_epilogue (int style) if (style == 0) return; - if (current_function_pops_args && current_function_args_size) + if (crtl->args.pops_args && crtl->args.size) { - rtx popc = GEN_INT (current_function_pops_args); + rtx popc = GEN_INT (crtl->args.pops_args); /* i386 can only pop 64K bytes. If asked to pop more, pop return address, do explicit add, and jump indirectly to the caller. */ - if (current_function_pops_args >= 65536) + if (crtl->args.pops_args >= 65536) { rtx ecx = gen_rtx_REG (SImode, CX_REG); @@ -7506,10 +7661,18 @@ legitimize_pic_address (rtx orig, rtx reg) see gotoff_operand. */ || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF)) { - /* Given that we've already handled dllimport variables separately - in legitimize_address, and all other variables should satisfy - legitimate_pic_address_disp_p, we should never arrive here. */ - gcc_assert (!TARGET_64BIT_MS_ABI); + if (TARGET_DLLIMPORT_DECL_ATTRIBUTES) + { + if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr)) + return legitimize_dllimport_symbol (addr, true); + if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS + && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF + && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0))) + { + rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true); + return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1)); + } + } if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC) { @@ -9125,7 +9288,7 @@ print_operand (FILE *file, rtx x, int code) if (ASSEMBLER_DIALECT == ASM_ATT) putc ('$', file); - fprintf (file, "0x%08lx", l); + fprintf (file, "0x%08lx", (long unsigned int) l); } /* These float cases don't actually occur as immediate operands. */ @@ -10819,6 +10982,14 @@ ix86_expand_convert_uns_didf_sse (rtx target, rtx input) ix86_expand_vector_extract (false, target, fp_xmm, 0); } +/* Not used, but eases macroization of patterns. */ +void +ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED, + rtx input ATTRIBUTE_UNUSED) +{ + gcc_unreachable (); +} + /* Convert an unsigned SImode value into a DFmode. Only currently used for SSE, but applicable anywhere. */ @@ -11002,7 +11173,6 @@ ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode, rtx operands[]) { rtx mask, set, use, clob, dst, src; - bool matching_memory; bool use_sse = false; bool vector_mode = VECTOR_MODE_P (mode); enum machine_mode elt_mode = mode; @@ -11027,19 +11197,6 @@ ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode, dst = operands[0]; src = operands[1]; - /* If the destination is memory, and we don't have matching source - operands or we're using the x87, do things in registers. */ - matching_memory = false; - if (MEM_P (dst)) - { - if (use_sse && rtx_equal_p (dst, src)) - matching_memory = true; - else - dst = gen_reg_rtx (mode); - } - if (MEM_P (src) && !matching_memory) - src = force_reg (mode, src); - if (vector_mode) { set = gen_rtx_fmt_ee (code == NEG ? 
XOR : AND, mode, src, mask); @@ -11060,9 +11217,6 @@ ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode, else emit_insn (set); } - - if (dst != operands[0]) - emit_move_insn (operands[0], dst); } /* Expand a copysign operation. Special case operand 0 being a constant. */ @@ -12082,16 +12236,30 @@ ix86_expand_branch (enum rtx_code code, rtx label) /* Otherwise, if we are doing less-than or greater-or-equal-than, op1 is a constant and the low word is zero, then we can just - examine the high word. */ + examine the high word. Similarly for low word -1 and + less-or-equal-than or greater-than. */ - if (CONST_INT_P (hi[1]) && lo[1] == const0_rtx) + if (CONST_INT_P (hi[1])) switch (code) { case LT: case LTU: case GE: case GEU: - ix86_compare_op0 = hi[0]; - ix86_compare_op1 = hi[1]; - ix86_expand_branch (code, label); - return; + if (lo[1] == const0_rtx) + { + ix86_compare_op0 = hi[0]; + ix86_compare_op1 = hi[1]; + ix86_expand_branch (code, label); + return; + } + break; + case LE: case LEU: case GT: case GTU: + if (lo[1] == constm1_rtx) + { + ix86_compare_op0 = hi[0]; + ix86_compare_op1 = hi[1]; + ix86_expand_branch (code, label); + return; + } + break; default: break; } @@ -15099,8 +15267,9 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset, additionally, memset wants eax and memcpy wants esi. Don't consider such algorithms if the user has appropriated those registers for their own purposes. */ - bool rep_prefix_usable = !(global_regs[CX_REG] || global_regs[DI_REG] - || (memset ? global_regs[AX_REG] : global_regs[SI_REG])); + bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG] + || (memset + ? fixed_regs[AX_REG] : fixed_regs[SI_REG])); #define ALG_USABLE_P(alg) (rep_prefix_usable \ || (alg != rep_prefix_1_byte \ @@ -15270,7 +15439,7 @@ smallest_pow2_greater_than (int val) } /* Expand string move (memcpy) operation. Use i386 string operations when - profitable. expand_clrmem contains similar code. The code depends upon + profitable. expand_setmem contains similar code. The code depends upon architecture, block size and alignment, but always has the same overall structure: @@ -15321,6 +15490,10 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, if (CONST_INT_P (expected_size_exp) && count == 0) expected_size = INTVAL (expected_size_exp); + /* Make sure we don't need to care about overflow later on. */ + if (count > ((unsigned HOST_WIDE_INT) 1 << 30)) + return 0; + /* Step 0: Decide on preferred algorithm, desired alignment and size of chunks to be copied by main loop. */ @@ -15366,12 +15539,7 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, /* Alignment code needs count to be in register. */ if (CONST_INT_P (count_exp) && desired_align > align) - { - enum machine_mode mode = SImode; - if (TARGET_64BIT && (count & ~0xffffffff)) - mode = DImode; - count_exp = force_reg (mode, count_exp); - } + count_exp = force_reg (counter_mode (count_exp), count_exp); gcc_assert (desired_align >= 1 && align >= 1); /* Ensure that alignment prologue won't copy past end of block. */ @@ -15382,29 +15550,48 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, Make sure it is power of 2. 
*/ epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed); - label = gen_label_rtx (); - emit_cmp_and_jump_insns (count_exp, - GEN_INT (epilogue_size_needed), - LTU, 0, counter_mode (count_exp), 1, label); - if (GET_CODE (count_exp) == CONST_INT) - ; - else if (expected_size == -1 || expected_size < epilogue_size_needed) - predict_jump (REG_BR_PROB_BASE * 60 / 100); + if (CONST_INT_P (count_exp)) + { + if (UINTVAL (count_exp) < (unsigned HOST_WIDE_INT)epilogue_size_needed) + goto epilogue; + } else - predict_jump (REG_BR_PROB_BASE * 20 / 100); + { + label = gen_label_rtx (); + emit_cmp_and_jump_insns (count_exp, + GEN_INT (epilogue_size_needed), + LTU, 0, counter_mode (count_exp), 1, label); + if (expected_size == -1 || expected_size < epilogue_size_needed) + predict_jump (REG_BR_PROB_BASE * 60 / 100); + else + predict_jump (REG_BR_PROB_BASE * 20 / 100); + } } + /* Emit code to decide on runtime whether library call or inline should be used. */ if (dynamic_check != -1) { - rtx hot_label = gen_label_rtx (); - jump_around_label = gen_label_rtx (); - emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1), - LEU, 0, GET_MODE (count_exp), 1, hot_label); - predict_jump (REG_BR_PROB_BASE * 90 / 100); - emit_block_move_via_libcall (dst, src, count_exp, false); - emit_jump (jump_around_label); - emit_label (hot_label); + if (CONST_INT_P (count_exp)) + { + if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check) + { + emit_block_move_via_libcall (dst, src, count_exp, false); + count_exp = const0_rtx; + goto epilogue; + } + } + else + { + rtx hot_label = gen_label_rtx (); + jump_around_label = gen_label_rtx (); + emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1), + LEU, 0, GET_MODE (count_exp), 1, hot_label); + predict_jump (REG_BR_PROB_BASE * 90 / 100); + emit_block_move_via_libcall (dst, src, count_exp, false); + emit_jump (jump_around_label); + emit_label (hot_label); + } } /* Step 2: Alignment prologue. */ @@ -15477,7 +15664,7 @@ ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp, } /* Step 4: Epilogue to copy the remaining bytes. */ - + epilogue: if (label) { /* When the main loop is done, COUNT_EXP might hold original count, @@ -15631,6 +15818,10 @@ ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp, if (CONST_INT_P (expected_size_exp) && count == 0) expected_size = INTVAL (expected_size_exp); + /* Make sure we don't need to care about overflow later on. */ + if (count > ((unsigned HOST_WIDE_INT) 1 << 30)) + return 0; + /* Step 0: Decide on preferred algorithm, desired alignment and size of chunks to be copied by main loop. */ @@ -16061,7 +16252,7 @@ ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align) rtx unspec; /* Can't use this if the user has appropriated eax, ecx, or edi. 
*/ - if (global_regs[AX_REG] || global_regs[CX_REG] || global_regs[DI_REG]) + if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG]) return false; scratch2 = gen_reg_rtx (Pmode); @@ -16685,7 +16876,8 @@ ia32_multipass_dfa_lookahead (void) int ix86_constant_alignment (tree exp, int align) { - if (TREE_CODE (exp) == REAL_CST) + if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST + || TREE_CODE (exp) == INTEGER_CST) { if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64) return 64; @@ -17047,9 +17239,11 @@ enum ix86_builtins IX86_BUILTIN_RCPPS, IX86_BUILTIN_RCPSS, IX86_BUILTIN_RSQRTPS, + IX86_BUILTIN_RSQRTPS_NR, IX86_BUILTIN_RSQRTSS, IX86_BUILTIN_RSQRTF, IX86_BUILTIN_SQRTPS, + IX86_BUILTIN_SQRTPS_NR, IX86_BUILTIN_SQRTSS, IX86_BUILTIN_UNPCKHPS, @@ -17440,6 +17634,17 @@ enum ix86_builtins IX86_BUILTIN_PCMPGTQ, + /* AES instructions */ + IX86_BUILTIN_AESENC128, + IX86_BUILTIN_AESENCLAST128, + IX86_BUILTIN_AESDEC128, + IX86_BUILTIN_AESDECLAST128, + IX86_BUILTIN_AESIMC128, + IX86_BUILTIN_AESKEYGENASSIST128, + + /* PCLMUL instruction */ + IX86_BUILTIN_PCLMULQDQ128, + /* TFmode support builtins. */ IX86_BUILTIN_INFQ, IX86_BUILTIN_FABSQ, @@ -17779,22 +17984,56 @@ static const struct builtin_description bdesc_crc32[] = { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_crc32di, 0, IX86_BUILTIN_CRC32DI, UNKNOWN, 0 }, }; -/* SSE builtins with 3 arguments and the last argument must be an immediate or xmm0. */ -static const struct builtin_description bdesc_sse_3arg[] = +/* SSE */ +enum sse_builtin_type +{ + SSE_CTYPE_UNKNOWN, + V4SF_FTYPE_V4SF_INT, + V2DI_FTYPE_V2DI_INT, + V2DF_FTYPE_V2DF_INT, + V16QI_FTYPE_V16QI_V16QI_V16QI, + V4SF_FTYPE_V4SF_V4SF_V4SF, + V2DF_FTYPE_V2DF_V2DF_V2DF, + V16QI_FTYPE_V16QI_V16QI_INT, + V8HI_FTYPE_V8HI_V8HI_INT, + V4SI_FTYPE_V4SI_V4SI_INT, + V4SF_FTYPE_V4SF_V4SF_INT, + V2DI_FTYPE_V2DI_V2DI_INT, + V2DF_FTYPE_V2DF_V2DF_INT +}; + +/* SSE builtins with variable number of arguments. 
*/ +static const struct builtin_description bdesc_sse_args[] = { + /* SSE */ + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + + /* SSE2 */ + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT }, + /* SSE4.1 */ - { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, 0 }, - { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, 0, IX86_BUILTIN_ROUNDSD, UNKNOWN, 0 }, - { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, 0, IX86_BUILTIN_ROUNDSS, UNKNOWN, 0 }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI }, + { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT }, + + /* SSE4.1 and SSE5 */ + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, 
"__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + + /* AES */ + { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT }, + + /* PCLMUL */ + { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT }, }; static const struct builtin_description bdesc_2arg[] = @@ -17803,7 +18042,7 @@ static const struct builtin_description bdesc_2arg[] = { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE, CODE_FOR_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, 0 }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, 0 }, @@ -17852,11 +18091,11 @@ static const struct builtin_description bdesc_2arg[] = { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, 0 }, { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, 0 }, { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_adddi3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, 0 }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, 0 }, { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, 0 }, { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, 0 }, { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subdi3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, 0 }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, 0 }, { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, 0 }, { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, 0 }, @@ -17907,25 +18146,6 @@ static const struct builtin_description bdesc_2arg[] = { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, 0, IX86_BUILTIN_CVTSI2SS, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, 0, IX86_BUILTIN_CVTSI642SS, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, 0, IX86_BUILTIN_PSLLW, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, 0, IX86_BUILTIN_PSLLWI, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, 0, IX86_BUILTIN_PSLLD, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, 0, IX86_BUILTIN_PSLLDI, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashldi3, 0, IX86_BUILTIN_PSLLQ, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, 
CODE_FOR_mmx_ashldi3, 0, IX86_BUILTIN_PSLLQI, UNKNOWN, 0 }, - - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, 0, IX86_BUILTIN_PSRLW, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, 0, IX86_BUILTIN_PSRLWI, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, 0, IX86_BUILTIN_PSRLD, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, 0, IX86_BUILTIN_PSRLDI, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrdi3, 0, IX86_BUILTIN_PSRLQ, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrdi3, 0, IX86_BUILTIN_PSRLQI, UNKNOWN, 0 }, - - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, 0, IX86_BUILTIN_PSRAW, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, 0, IX86_BUILTIN_PSRAWI, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, 0, IX86_BUILTIN_PSRAD, UNKNOWN, 0 }, - { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, 0, IX86_BUILTIN_PSRADI, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, 0, IX86_BUILTIN_PSADBW, UNKNOWN, 0 }, { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, 0, IX86_BUILTIN_PMADDWD, UNKNOWN, 0 }, @@ -18034,20 +18254,9 @@ static const struct builtin_description bdesc_2arg[] = { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, 0, IX86_BUILTIN_PSADBW128, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulsidi3, 0, IX86_BUILTIN_PMULUDQ, UNKNOWN, 0 }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, 0, IX86_BUILTIN_PMULUDQ, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, 0, IX86_BUILTIN_PMULUDQ128, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, 0, IX86_BUILTIN_PSLLWI128, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, 0, IX86_BUILTIN_PSLLDI128, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, 0, IX86_BUILTIN_PSLLQI128, UNKNOWN, 0 }, - - { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, 0, IX86_BUILTIN_PSRLWI128, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, 0, IX86_BUILTIN_PSRLDI128, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, 0, IX86_BUILTIN_PSRLQI128, UNKNOWN, 0 }, - - { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, 0, IX86_BUILTIN_PSRAWI128, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, 0, IX86_BUILTIN_PSRADI128, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, 0, IX86_BUILTIN_PMADDWD128, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, 0, IX86_BUILTIN_CVTSI2SD, UNKNOWN, 0 }, @@ -18105,15 +18314,24 @@ static const struct builtin_description bdesc_2arg[] = /* SSE4.2 */ { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, 0 }, + + /* AES */ + { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, 0 }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, 0 }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, 0 }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, 0 }, }; static const struct builtin_description bdesc_1arg[] = { + /* SSE */ { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, 0, IX86_BUILTIN_PMOVMSKB, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, 0, IX86_BUILTIN_MOVMSKPS, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, 0, IX86_BUILTIN_SQRTPS, UNKNOWN, 0 }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, 0, 
IX86_BUILTIN_SQRTPS, UNKNOWN, 0 }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, 0, IX86_BUILTIN_SQRTPS_NR, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, 0, IX86_BUILTIN_RSQRTPS, UNKNOWN, 0 }, + { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, 0, IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, 0, IX86_BUILTIN_RCPPS, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, 0, IX86_BUILTIN_CVTPS2PI, UNKNOWN, 0 }, @@ -18123,6 +18341,7 @@ static const struct builtin_description bdesc_1arg[] = { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, 0, IX86_BUILTIN_CVTTSS2SI, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, 0, IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, 0 }, + /* SSE2 */ { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, 0, IX86_BUILTIN_PMOVMSKB128, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, 0, IX86_BUILTIN_MOVMSKPD, UNKNOWN, 0 }, @@ -18175,9 +18394,8 @@ static const struct builtin_description bdesc_1arg[] = { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, 0, IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, 0 }, - /* Fake 1 arg builtins with a constant smaller than 8 bits as the 2nd arg. */ - { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_roundpd, 0, IX86_BUILTIN_ROUNDPD, UNKNOWN, 0 }, - { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_roundps, 0, IX86_BUILTIN_ROUNDPS, UNKNOWN, 0 }, + /* AES */ + { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, 0 }, }; /* SSE5 */ @@ -18480,6 +18698,8 @@ ix86_init_mmx_sse_builtins (void) tree V16QI_type_node = build_vector_type_for_mode (char_type_node, V16QImode); tree V2SI_type_node = build_vector_type_for_mode (intSI_type_node, V2SImode); + tree V1DI_type_node + = build_vector_type_for_mode (long_long_integer_type_node, V1DImode); tree V2SF_type_node = build_vector_type_for_mode (float_type_node, V2SFmode); tree V2DI_type_node = build_vector_type_for_mode (long_long_integer_type_node, V2DImode); @@ -18544,14 +18764,13 @@ ix86_init_mmx_sse_builtins (void) tree v4hi_ftype_v4hi_int = build_function_type_list (V4HI_type_node, V4HI_type_node, integer_type_node, NULL_TREE); - tree v4hi_ftype_v4hi_di - = build_function_type_list (V4HI_type_node, - V4HI_type_node, long_long_unsigned_type_node, - NULL_TREE); - tree v2si_ftype_v2si_di + tree v2si_ftype_v2si_int = build_function_type_list (V2SI_type_node, - V2SI_type_node, long_long_unsigned_type_node, - NULL_TREE); + V2SI_type_node, integer_type_node, NULL_TREE); + tree v1di_ftype_v1di_int + = build_function_type_list (V1DI_type_node, + V1DI_type_node, integer_type_node, NULL_TREE); + tree void_ftype_void = build_function_type (void_type_node, void_list_node); tree void_ftype_unsigned @@ -18618,10 +18837,9 @@ ix86_init_mmx_sse_builtins (void) tree v2si_ftype_v2si_v2si = build_function_type_list (V2SI_type_node, V2SI_type_node, V2SI_type_node, NULL_TREE); - tree di_ftype_di_di - = build_function_type_list (long_long_unsigned_type_node, - long_long_unsigned_type_node, - long_long_unsigned_type_node, NULL_TREE); + tree v1di_ftype_v1di_v1di + = build_function_type_list (V1DI_type_node, + V1DI_type_node, V1DI_type_node, NULL_TREE); tree di_ftype_di_di_int = build_function_type_list (long_long_unsigned_type_node, @@ -18745,11 +18963,11 @@ ix86_init_mmx_sse_builtins (void) tree v4si_ftype_v8hi_v8hi = build_function_type_list (V4SI_type_node, V8HI_type_node, V8HI_type_node, NULL_TREE); 
- tree di_ftype_v8qi_v8qi - = build_function_type_list (long_long_unsigned_type_node, + tree v1di_ftype_v8qi_v8qi + = build_function_type_list (V1DI_type_node, V8QI_type_node, V8QI_type_node, NULL_TREE); - tree di_ftype_v2si_v2si - = build_function_type_list (long_long_unsigned_type_node, + tree v1di_ftype_v2si_v2si + = build_function_type_list (V1DI_type_node, V2SI_type_node, V2SI_type_node, NULL_TREE); tree v2di_ftype_v16qi_v16qi = build_function_type_list (V2DI_type_node, @@ -18977,61 +19195,58 @@ ix86_init_mmx_sse_builtins (void) def_builtin_const (OPTION_MASK_ISA_64BIT, "__builtin_copysignq", ftype, IX86_BUILTIN_COPYSIGNQ); } - /* Add all SSE builtins that are more or less simple operations on - three operands. */ - for (i = 0, d = bdesc_sse_3arg; - i < ARRAY_SIZE (bdesc_sse_3arg); + /* Add all SSE builtins with a variable number of operands. */ + for (i = 0, d = bdesc_sse_args; + i < ARRAY_SIZE (bdesc_sse_args); i++, d++) { - /* Use one of the operands; the target can have a different mode for - mask-generating compares. */ - enum machine_mode mode; tree type; if (d->name == 0) continue; - mode = insn_data[d->icode].operand[1].mode; - switch (mode) + switch ((enum sse_builtin_type) d->flag) { - case V16QImode: + case V4SF_FTYPE_V4SF_INT: + type = v4sf_ftype_v4sf_int; + break; + case V2DI_FTYPE_V2DI_INT: + type = v2di_ftype_v2di_int; + break; + case V2DF_FTYPE_V2DF_INT: + type = v2df_ftype_v2df_int; + break; + case V16QI_FTYPE_V16QI_V16QI_V16QI: + type = v16qi_ftype_v16qi_v16qi_v16qi; + break; + case V4SF_FTYPE_V4SF_V4SF_V4SF: + type = v4sf_ftype_v4sf_v4sf_v4sf; + break; + case V2DF_FTYPE_V2DF_V2DF_V2DF: + type = v2df_ftype_v2df_v2df_v2df; + break; + case V16QI_FTYPE_V16QI_V16QI_INT: type = v16qi_ftype_v16qi_v16qi_int; break; - case V8HImode: + case V8HI_FTYPE_V8HI_V8HI_INT: type = v8hi_ftype_v8hi_v8hi_int; break; - case V4SImode: + case V4SI_FTYPE_V4SI_V4SI_INT: type = v4si_ftype_v4si_v4si_int; break; - case V2DImode: + case V4SF_FTYPE_V4SF_V4SF_INT: + type = v4sf_ftype_v4sf_v4sf_int; + break; + case V2DI_FTYPE_V2DI_V2DI_INT: type = v2di_ftype_v2di_v2di_int; break; - case V2DFmode: + case V2DF_FTYPE_V2DF_V2DF_INT: type = v2df_ftype_v2df_v2df_int; break; - case V4SFmode: - type = v4sf_ftype_v4sf_v4sf_int; - break; default: gcc_unreachable (); } - /* Override for variable blends. */ - switch (d->icode) - { - case CODE_FOR_sse4_1_blendvpd: - type = v2df_ftype_v2df_v2df_v2df; - break; - case CODE_FOR_sse4_1_blendvps: - type = v4sf_ftype_v4sf_v4sf_v4sf; - break; - case CODE_FOR_sse4_1_pblendvb: - type = v16qi_ftype_v16qi_v16qi_v16qi; - break; - default: - break; - } - def_builtin_const (d->mask, d->name, type, d->code); } @@ -19077,8 +19292,8 @@ ix86_init_mmx_sse_builtins (void) case V2SImode: type = v2si_ftype_v2si_v2si; break; - case DImode: - type = di_ftype_di_di; + case V1DImode: + type = v1di_ftype_v1di_v1di; break; default: @@ -19170,16 +19385,25 @@ ix86_init_mmx_sse_builtins (void) /* Add the remaining MMX insns with somewhat more complicated types. 
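   The shift builtins defined below now come in two flavors, matching
   the two hardware forms of each MMX shift: a *i variant taking the
   count as a plain int (v4hi_ftype_v4hi_int and friends) and a
   register variant taking the count in an MMX register, typed like the
   data operand (v4hi_ftype_v4hi_v4hi through v1di_ftype_v1di_v1di).
   A hypothetical pair of calls, assuming mmintrin-style typedefs:

     __v4hi t1 = __builtin_ia32_psllwi (x, 3);
     __v4hi t2 = __builtin_ia32_psllw (x, count);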
*/ def_builtin (OPTION_MASK_ISA_MMX, "__builtin_ia32_emms", void_ftype_void, IX86_BUILTIN_EMMS); - def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSLLW); - def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_pslld", v2si_ftype_v2si_di, IX86_BUILTIN_PSLLD); - def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllq", di_ftype_di_di, IX86_BUILTIN_PSLLQ); - - def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSRLW); - def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrld", v2si_ftype_v2si_di, IX86_BUILTIN_PSRLD); - def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlq", di_ftype_di_di, IX86_BUILTIN_PSRLQ); - def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psraw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSRAW); - def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrad", v2si_ftype_v2si_di, IX86_BUILTIN_PSRAD); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllwi", v4hi_ftype_v4hi_int, IX86_BUILTIN_PSLLWI); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_pslldi", v2si_ftype_v2si_int, IX86_BUILTIN_PSLLDI); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllqi", v1di_ftype_v1di_int, IX86_BUILTIN_PSLLQI); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllw", v4hi_ftype_v4hi_v4hi, IX86_BUILTIN_PSLLW); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_pslld", v2si_ftype_v2si_v2si, IX86_BUILTIN_PSLLD); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllq", v1di_ftype_v1di_v1di, IX86_BUILTIN_PSLLQ); + + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlwi", v4hi_ftype_v4hi_int, IX86_BUILTIN_PSRLWI); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrldi", v2si_ftype_v2si_int, IX86_BUILTIN_PSRLDI); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlqi", v1di_ftype_v1di_int, IX86_BUILTIN_PSRLQI); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlw", v4hi_ftype_v4hi_v4hi, IX86_BUILTIN_PSRLW); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrld", v2si_ftype_v2si_v2si, IX86_BUILTIN_PSRLD); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlq", v1di_ftype_v1di_v1di, IX86_BUILTIN_PSRLQ); + + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrawi", v4hi_ftype_v4hi_int, IX86_BUILTIN_PSRAWI); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psradi", v2si_ftype_v2si_int, IX86_BUILTIN_PSRADI); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psraw", v4hi_ftype_v4hi_v4hi, IX86_BUILTIN_PSRAW); + def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrad", v2si_ftype_v2si_v2si, IX86_BUILTIN_PSRAD); def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, "__builtin_ia32_pshufw", v4hi_ftype_v4hi_int, IX86_BUILTIN_PSHUFW); def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_pmaddwd", v2si_ftype_v4hi_v4hi, IX86_BUILTIN_PMADDWD); @@ -19228,21 +19452,21 @@ ix86_init_mmx_sse_builtins (void) def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, "__builtin_ia32_sfence", void_ftype_void, IX86_BUILTIN_SFENCE); - def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, "__builtin_ia32_psadbw", di_ftype_v8qi_v8qi, IX86_BUILTIN_PSADBW); + def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, "__builtin_ia32_psadbw", v1di_ftype_v8qi_v8qi, IX86_BUILTIN_PSADBW); def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rcpps", v4sf_ftype_v4sf, IX86_BUILTIN_RCPPS); def_builtin_const (OPTION_MASK_ISA_SSE, 
"__builtin_ia32_rcpss", v4sf_ftype_v4sf, IX86_BUILTIN_RCPSS); def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTPS); + def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtps_nr", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTPS_NR); def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTSS); ftype = build_function_type_list (float_type_node, float_type_node, NULL_TREE); def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtf", ftype, IX86_BUILTIN_RSQRTF); def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_sqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTPS); + def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_sqrtps_nr", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTPS_NR); def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_sqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTSS); - def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_shufps", v4sf_ftype_v4sf_v4sf_int, IX86_BUILTIN_SHUFPS); - /* Original 3DNow! */ def_builtin (OPTION_MASK_ISA_3DNOW, "__builtin_ia32_femms", void_ftype_void, IX86_BUILTIN_FEMMS); def_builtin_const (OPTION_MASK_ISA_3DNOW, "__builtin_ia32_pavgusb", v8qi_ftype_v8qi_v8qi, IX86_BUILTIN_PAVGUSB); @@ -19296,8 +19520,6 @@ ix86_init_mmx_sse_builtins (void) def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_sqrtpd", v2df_ftype_v2df, IX86_BUILTIN_SQRTPD); def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_sqrtsd", v2df_ftype_v2df, IX86_BUILTIN_SQRTSD); - def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_shufpd", v2df_ftype_v2df_v2df_int, IX86_BUILTIN_SHUFPD); - def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_cvtdq2pd", v2df_ftype_v4si, IX86_BUILTIN_CVTDQ2PD); def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_cvtdq2ps", v4sf_ftype_v4si, IX86_BUILTIN_CVTDQ2PS); @@ -19330,7 +19552,7 @@ ix86_init_mmx_sse_builtins (void) def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_loaddqu", v16qi_ftype_pcchar, IX86_BUILTIN_LOADDQU); def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_storedqu", void_ftype_pchar_v16qi, IX86_BUILTIN_STOREDQU); - def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_pmuludq", di_ftype_v2si_v2si, IX86_BUILTIN_PMULUDQ); + def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_pmuludq", v1di_ftype_v2si_v2si, IX86_BUILTIN_PMULUDQ); def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_pmuludq128", v2di_ftype_v4si_v4si, IX86_BUILTIN_PMULUDQ128); def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_pslldqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSLLDQI128); @@ -19381,12 +19603,6 @@ ix86_init_mmx_sse_builtins (void) def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_pmovzxdq128", v2di_ftype_v4si, IX86_BUILTIN_PMOVZXDQ128); def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_pmuldq128", v2di_ftype_v4si_v4si, IX86_BUILTIN_PMULDQ128); - /* SSE4.1 and SSE5 */ - def_builtin_const (OPTION_MASK_ISA_ROUND, "__builtin_ia32_roundpd", v2df_ftype_v2df_int, IX86_BUILTIN_ROUNDPD); - def_builtin_const (OPTION_MASK_ISA_ROUND, "__builtin_ia32_roundps", v4sf_ftype_v4sf_int, IX86_BUILTIN_ROUNDPS); - def_builtin_const (OPTION_MASK_ISA_ROUND, "__builtin_ia32_roundsd", v2df_ftype_v2df_v2df_int, IX86_BUILTIN_ROUNDSD); - def_builtin_const (OPTION_MASK_ISA_ROUND, "__builtin_ia32_roundss", v4sf_ftype_v4sf_v4sf_int, IX86_BUILTIN_ROUNDSS); - /* SSE4.2. 
*/ ftype = build_function_type_list (unsigned_type_node, unsigned_type_node, @@ -19409,6 +19625,25 @@ ix86_init_mmx_sse_builtins (void) NULL_TREE); def_builtin_const (OPTION_MASK_ISA_SSE4_2, "__builtin_ia32_crc32di", ftype, IX86_BUILTIN_CRC32DI); + /* AES */ + if (TARGET_AES) + { + /* Define AES built-in functions only if AES is enabled. */ + def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_aesenc128", v2di_ftype_v2di_v2di, IX86_BUILTIN_AESENC128); + def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_aesenclast128", v2di_ftype_v2di_v2di, IX86_BUILTIN_AESENCLAST128); + def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_aesdec128", v2di_ftype_v2di_v2di, IX86_BUILTIN_AESDEC128); + def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_aesdeclast128", v2di_ftype_v2di_v2di, IX86_BUILTIN_AESDECLAST128); + def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_aesimc128", v2di_ftype_v2di, IX86_BUILTIN_AESIMC128); + def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_aeskeygenassist128", v2di_ftype_v2di_int, IX86_BUILTIN_AESKEYGENASSIST128); + } + + /* PCLMUL */ + if (TARGET_PCLMUL) + { + /* Define PCLMUL built-in function only if PCLMUL is enabled. */ + def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_pclmulqdq128", v2di_ftype_v2di_v2di_int, IX86_BUILTIN_PCLMULQDQ128); + } + /* AMDFAM10 SSE4A New built-ins */ def_builtin (OPTION_MASK_ISA_SSE4A, "__builtin_ia32_movntsd", void_ftype_pdouble_v2df, IX86_BUILTIN_MOVNTSD); def_builtin (OPTION_MASK_ISA_SSE4A, "__builtin_ia32_movntss", void_ftype_pfloat_v4sf, IX86_BUILTIN_MOVNTSS); @@ -19571,6 +19806,10 @@ ix86_init_builtins (void) { if (TARGET_MMX) ix86_init_mmx_sse_builtins (); + +#ifdef SUBTARGET_INIT_BUILTINS + SUBTARGET_INIT_BUILTINS; +#endif } /* Errors in the source file can cause expand_expr to return const0_rtx @@ -19585,66 +19824,128 @@ safe_vector_operand (rtx x, enum machine_mode mode) } /* Subroutine of ix86_expand_builtin to take care of SSE insns with - 4 operands. The third argument must be a constant smaller than 8 - bits or xmm0. */ + a variable number of operands. 
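+   Both the operand count and whether the trailing operand must be an
+   immediate are recovered here from the sse_builtin_type tag stored in
+   each bdesc_sse_args entry's flag field, for example:
+
+     V4SF_FTYPE_V4SF_INT        2 operands, trailing immediate
+     V2DF_FTYPE_V2DF_V2DF_V2DF  3 register/memory operands
+     V2DF_FTYPE_V2DF_V2DF_INT   3 operands, trailing immediate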
*/ static rtx -ix86_expand_sse_4_operands_builtin (enum insn_code icode, tree exp, - rtx target) +ix86_expand_sse_operands_builtin (enum insn_code icode, tree exp, + enum sse_builtin_type type, + rtx target) { rtx pat; - tree arg0 = CALL_EXPR_ARG (exp, 0); - tree arg1 = CALL_EXPR_ARG (exp, 1); - tree arg2 = CALL_EXPR_ARG (exp, 2); - rtx op0 = expand_normal (arg0); - rtx op1 = expand_normal (arg1); - rtx op2 = expand_normal (arg2); - enum machine_mode tmode = insn_data[icode].operand[0].mode; - enum machine_mode mode1 = insn_data[icode].operand[1].mode; - enum machine_mode mode2 = insn_data[icode].operand[2].mode; - enum machine_mode mode3 = insn_data[icode].operand[3].mode; + unsigned int i, nargs; + int num_memory = 0; + struct + { + rtx op; + enum machine_mode mode; + } args[3]; + bool last_arg_constant = false; + const struct insn_data *insn_p = &insn_data[icode]; + enum machine_mode tmode = insn_p->operand[0].mode; - if (VECTOR_MODE_P (mode1)) - op0 = safe_vector_operand (op0, mode1); - if (VECTOR_MODE_P (mode2)) - op1 = safe_vector_operand (op1, mode2); - if (VECTOR_MODE_P (mode3)) - op2 = safe_vector_operand (op2, mode3); + switch (type) + { + case V4SF_FTYPE_V4SF_INT: + case V2DI_FTYPE_V2DI_INT: + case V2DF_FTYPE_V2DF_INT: + nargs = 2; + last_arg_constant = true; + break; + case V16QI_FTYPE_V16QI_V16QI_V16QI: + case V4SF_FTYPE_V4SF_V4SF_V4SF: + case V2DF_FTYPE_V2DF_V2DF_V2DF: + nargs = 3; + break; + case V16QI_FTYPE_V16QI_V16QI_INT: + case V8HI_FTYPE_V8HI_V8HI_INT: + case V4SI_FTYPE_V4SI_V4SI_INT: + case V4SF_FTYPE_V4SF_V4SF_INT: + case V2DI_FTYPE_V2DI_V2DI_INT: + case V2DF_FTYPE_V2DF_V2DF_INT: + nargs = 3; + last_arg_constant = true; + break; + default: + gcc_unreachable (); + } + + gcc_assert (nargs <= ARRAY_SIZE (args)); if (optimize || target == 0 || GET_MODE (target) != tmode - || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) + || ! (*insn_p->operand[0].predicate) (target, tmode)) target = gen_reg_rtx (tmode); - if (! (*insn_data[icode].operand[1].predicate) (op0, mode1)) - op0 = copy_to_mode_reg (mode1, op0); - if ((optimize && !register_operand (op1, mode2)) - || !(*insn_data[icode].operand[2].predicate) (op1, mode2)) - op1 = copy_to_mode_reg (mode2, op1); + for (i = 0; i < nargs; i++) + { + tree arg = CALL_EXPR_ARG (exp, i); + rtx op = expand_normal (arg); + enum machine_mode mode = insn_p->operand[i + 1].mode; + bool match = (*insn_p->operand[i + 1].predicate) (op, mode); - if (! (*insn_data[icode].operand[3].predicate) (op2, mode3)) - switch (icode) - { - case CODE_FOR_sse4_1_blendvpd: - case CODE_FOR_sse4_1_blendvps: - case CODE_FOR_sse4_1_pblendvb: - op2 = copy_to_mode_reg (mode3, op2); - break; + if (last_arg_constant && (i + 1) == nargs) + { + if (!match) + switch (icode) + { + case CODE_FOR_sse4_1_roundpd: + case CODE_FOR_sse4_1_roundps: + case CODE_FOR_sse4_1_roundsd: + case CODE_FOR_sse4_1_roundss: + case CODE_FOR_sse4_1_blendps: + error ("the last argument must be a 4-bit immediate"); + return const0_rtx; + + case CODE_FOR_sse4_1_blendpd: + error ("the last argument must be a 2-bit immediate"); + return const0_rtx; + + default: + error ("the last argument must be an 8-bit immediate"); + return const0_rtx; + } + } + else + { + if (VECTOR_MODE_P (mode)) + op = safe_vector_operand (op, mode); - case CODE_FOR_sse4_1_roundsd: - case CODE_FOR_sse4_1_roundss: - error ("the third argument must be a 4-bit immediate"); - return const0_rtx; + /* If we aren't optimizing, only allow one memory operand to + be generated. 
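+   (SSE instructions can encode at most one memory operand, so every
+   memory operand after the first one seen is forced into a register
+   below instead of being left for later passes to clean up).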
*/ + if (memory_operand (op, mode)) + num_memory++; - default: - error ("the third argument must be an 8-bit immediate"); - return const0_rtx; - } + gcc_assert (GET_MODE (op) == mode + || GET_MODE (op) == VOIDmode); + + if (optimize || !match || num_memory > 1) + op = copy_to_mode_reg (mode, op); + } + + args[i].op = op; + args[i].mode = mode; + } + + switch (nargs) + { + case 1: + pat = GEN_FCN (icode) (target, args[0].op); + break; + case 2: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op); + break; + case 3: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, + args[2].op); + break; + default: + gcc_unreachable (); + } - pat = GEN_FCN (icode) (target, op0, op1, op2); if (! pat) return 0; + emit_insn (pat); return target; } @@ -19684,6 +19985,44 @@ ix86_expand_crc32 (enum insn_code icode, tree exp, rtx target) return target; } +/* Subroutine of ix86_expand_builtin to take care of binop insns + with an immediate. */ + +static rtx +ix86_expand_binop_imm_builtin (enum insn_code icode, tree exp, + rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + enum machine_mode tmode = insn_data[icode].operand[0].mode; + enum machine_mode mode0 = insn_data[icode].operand[1].mode; + enum machine_mode mode1 = insn_data[icode].operand[2].mode; + + if (! (*insn_data[icode].operand[1].predicate) (op0, mode0)) + { + op0 = copy_to_reg (op0); + op0 = simplify_gen_subreg (mode0, op0, GET_MODE (op0), 0); + } + + if (! (*insn_data[icode].operand[2].predicate) (op1, mode1)) + { + error ("the last operand must be an immediate"); + return const0_rtx; + } + + target = gen_reg_rtx (V2DImode); + pat = GEN_FCN (icode) (simplify_gen_subreg (tmode, target, + V2DImode, 0), + op0, op1); + if (! pat) + return 0; + emit_insn (pat); + return target; +} + /* Subroutine of ix86_expand_builtin to take care of binop insns. */ static rtx @@ -19976,28 +20315,7 @@ ix86_expand_unop_builtin (enum insn_code icode, tree exp, op0 = copy_to_mode_reg (mode0, op0); } - switch (icode) - { - case CODE_FOR_sse4_1_roundpd: - case CODE_FOR_sse4_1_roundps: - { - tree arg1 = CALL_EXPR_ARG (exp, 1); - rtx op1 = expand_normal (arg1); - enum machine_mode mode1 = insn_data[icode].operand[2].mode; - - if (! (*insn_data[icode].operand[2].predicate) (op1, mode1)) - { - error ("the second argument must be a 4-bit immediate"); - return const0_rtx; - } - pat = GEN_FCN (icode) (target, op0, op1); - } - break; - default: - pat = GEN_FCN (icode) (target, op0); - break; - } - + pat = GEN_FCN (icode) (target, op0); if (! pat) return 0; emit_insn (pat); @@ -20651,43 +20969,6 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, emit_insn (gen_sse_stmxcsr (target)); return copy_to_mode_reg (SImode, target); - case IX86_BUILTIN_SHUFPS: - case IX86_BUILTIN_SHUFPD: - icode = (fcode == IX86_BUILTIN_SHUFPS - ? CODE_FOR_sse_shufps - : CODE_FOR_sse2_shufpd); - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - arg2 = CALL_EXPR_ARG (exp, 2); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - op2 = expand_normal (arg2); - tmode = insn_data[icode].operand[0].mode; - mode0 = insn_data[icode].operand[1].mode; - mode1 = insn_data[icode].operand[2].mode; - mode2 = insn_data[icode].operand[3].mode; - - if (! 
(*insn_data[icode].operand[1].predicate) (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); - if ((optimize && !register_operand (op1, mode1)) - || !(*insn_data[icode].operand[2].predicate) (op1, mode1)) - op1 = copy_to_mode_reg (mode1, op1); - if (! (*insn_data[icode].operand[3].predicate) (op2, mode2)) - { - /* @@@ better error message */ - error ("mask must be an immediate"); - return gen_reg_rtx (tmode); - } - if (optimize || target == 0 - || GET_MODE (target) != tmode - || ! (*insn_data[icode].operand[0].predicate) (target, tmode)) - target = gen_reg_rtx (tmode); - pat = GEN_FCN (icode) (target, op0, op1, op2); - if (! pat) - return 0; - emit_insn (pat); - return target; - case IX86_BUILTIN_PSHUFW: case IX86_BUILTIN_PSHUFD: case IX86_BUILTIN_PSHUFHW: @@ -20722,6 +21003,39 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, emit_insn (pat); return target; + case IX86_BUILTIN_PSLLW: + case IX86_BUILTIN_PSLLWI: + icode = CODE_FOR_mmx_ashlv4hi3; + goto do_pshift; + case IX86_BUILTIN_PSLLD: + case IX86_BUILTIN_PSLLDI: + icode = CODE_FOR_mmx_ashlv2si3; + goto do_pshift; + case IX86_BUILTIN_PSLLQ: + case IX86_BUILTIN_PSLLQI: + icode = CODE_FOR_mmx_ashlv1di3; + goto do_pshift; + case IX86_BUILTIN_PSRAW: + case IX86_BUILTIN_PSRAWI: + icode = CODE_FOR_mmx_ashrv4hi3; + goto do_pshift; + case IX86_BUILTIN_PSRAD: + case IX86_BUILTIN_PSRADI: + icode = CODE_FOR_mmx_ashrv2si3; + goto do_pshift; + case IX86_BUILTIN_PSRLW: + case IX86_BUILTIN_PSRLWI: + icode = CODE_FOR_mmx_lshrv4hi3; + goto do_pshift; + case IX86_BUILTIN_PSRLD: + case IX86_BUILTIN_PSRLDI: + icode = CODE_FOR_mmx_lshrv2si3; + goto do_pshift; + case IX86_BUILTIN_PSRLQ: + case IX86_BUILTIN_PSRLQI: + icode = CODE_FOR_mmx_lshrv1di3; + goto do_pshift; + case IX86_BUILTIN_PSLLW128: case IX86_BUILTIN_PSLLWI128: icode = CODE_FOR_ashlv8hi3; @@ -20780,34 +21094,14 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, return target; case IX86_BUILTIN_PSLLDQI128: - case IX86_BUILTIN_PSRLDQI128: - icode = (fcode == IX86_BUILTIN_PSLLDQI128 ? CODE_FOR_sse2_ashlti3 - : CODE_FOR_sse2_lshrti3); - arg0 = CALL_EXPR_ARG (exp, 0); - arg1 = CALL_EXPR_ARG (exp, 1); - op0 = expand_normal (arg0); - op1 = expand_normal (arg1); - tmode = insn_data[icode].operand[0].mode; - mode1 = insn_data[icode].operand[1].mode; - mode2 = insn_data[icode].operand[2].mode; + return ix86_expand_binop_imm_builtin (CODE_FOR_sse2_ashlti3, + exp, target); + break; - if (! (*insn_data[icode].operand[1].predicate) (op0, mode1)) - { - op0 = copy_to_reg (op0); - op0 = simplify_gen_subreg (mode1, op0, GET_MODE (op0), 0); - } - if (! (*insn_data[icode].operand[2].predicate) (op1, mode2)) - { - error ("shift must be an immediate"); - return const0_rtx; - } - target = gen_reg_rtx (V2DImode); - pat = GEN_FCN (icode) (simplify_gen_subreg (tmode, target, V2DImode, 0), - op0, op1); - if (! 
pat) - return 0; - emit_insn (pat); - return target; + case IX86_BUILTIN_PSRLDQI128: + return ix86_expand_binop_imm_builtin (CODE_FOR_sse2_lshrti3, + exp, target); + break; case IX86_BUILTIN_FEMMS: emit_insn (gen_mmx_femms ()); @@ -21167,12 +21461,15 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, break; } - for (i = 0, d = bdesc_sse_3arg; - i < ARRAY_SIZE (bdesc_sse_3arg); + for (i = 0, d = bdesc_sse_args; + i < ARRAY_SIZE (bdesc_sse_args); i++, d++) if (d->code == fcode) - return ix86_expand_sse_4_operands_builtin (d->icode, exp, - target); + { + enum sse_builtin_type type = (enum sse_builtin_type) d->flag; + return ix86_expand_sse_operands_builtin (d->icode, exp, + type, target); + } for (i = 0, d = bdesc_2arg; i < ARRAY_SIZE (bdesc_2arg); i++, d++) if (d->code == fcode) @@ -21255,7 +21552,7 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out, case BUILT_IN_SQRTF: if (out_mode == SFmode && out_n == 4 && in_mode == SFmode && in_n == 4) - return ix86_builtins[IX86_BUILTIN_SQRTPS]; + return ix86_builtins[IX86_BUILTIN_SQRTPS_NR]; break; case BUILT_IN_LRINT: @@ -21281,8 +21578,120 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out, return NULL_TREE; } -/* Handler for an ACML-style interface to a library with vectorized - intrinsics. */ +/* Handler for an SVML-style interface to + a library with vectorized intrinsics. */ + +static tree +ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in) +{ + char name[20]; + tree fntype, new_fndecl, args; + unsigned arity; + const char *bname; + enum machine_mode el_mode, in_mode; + int n, in_n; + + /* The SVML is suitable for unsafe math only. */ + if (!flag_unsafe_math_optimizations) + return NULL_TREE; + + el_mode = TYPE_MODE (TREE_TYPE (type_out)); + n = TYPE_VECTOR_SUBPARTS (type_out); + in_mode = TYPE_MODE (TREE_TYPE (type_in)); + in_n = TYPE_VECTOR_SUBPARTS (type_in); + if (el_mode != in_mode + || n != in_n) + return NULL_TREE; + + switch (fn) + { + case BUILT_IN_EXP: + case BUILT_IN_LOG: + case BUILT_IN_LOG10: + case BUILT_IN_POW: + case BUILT_IN_TANH: + case BUILT_IN_TAN: + case BUILT_IN_ATAN: + case BUILT_IN_ATAN2: + case BUILT_IN_ATANH: + case BUILT_IN_CBRT: + case BUILT_IN_SINH: + case BUILT_IN_SIN: + case BUILT_IN_ASINH: + case BUILT_IN_ASIN: + case BUILT_IN_COSH: + case BUILT_IN_COS: + case BUILT_IN_ACOSH: + case BUILT_IN_ACOS: + if (el_mode != DFmode || n != 2) + return NULL_TREE; + break; + + case BUILT_IN_EXPF: + case BUILT_IN_LOGF: + case BUILT_IN_LOG10F: + case BUILT_IN_POWF: + case BUILT_IN_TANHF: + case BUILT_IN_TANF: + case BUILT_IN_ATANF: + case BUILT_IN_ATAN2F: + case BUILT_IN_ATANHF: + case BUILT_IN_CBRTF: + case BUILT_IN_SINHF: + case BUILT_IN_SINF: + case BUILT_IN_ASINHF: + case BUILT_IN_ASINF: + case BUILT_IN_COSHF: + case BUILT_IN_COSF: + case BUILT_IN_ACOSHF: + case BUILT_IN_ACOSF: + if (el_mode != SFmode || n != 4) + return NULL_TREE; + break; + + default: + return NULL_TREE; + } + + bname = IDENTIFIER_POINTER (DECL_NAME (implicit_built_in_decls[fn])); + + if (fn == BUILT_IN_LOGF) + strcpy (name, "vmlsLn4"); + else if (fn == BUILT_IN_LOG) + strcpy (name, "vmldLn2"); + else if (n == 4) + { + sprintf (name, "vmls%s", bname+10); + name[strlen (name)-1] = '4'; + } + else + sprintf (name, "vmld%s2", bname+10); + + /* Convert to uppercase. 
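+   name[4] is the first letter of the math function proper, directly
+   after the vmls/vmld prefix, and clearing bit 5 of an ASCII lowercase
+   letter upcases it; for BUILT_IN_SINF, for example, the buffer holds
+   "vmlssin4" at this point and 's' (0x73) & ~0x20 gives 'S' (0x53),
+   producing the SVML entry point vmlsSin4.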
*/ + name[4] &= ~0x20; + + arity = 0; + for (args = DECL_ARGUMENTS (implicit_built_in_decls[fn]); args; + args = TREE_CHAIN (args)) + arity++; + + if (arity == 1) + fntype = build_function_type_list (type_out, type_in, NULL); + else + fntype = build_function_type_list (type_out, type_in, type_in, NULL); + + /* Build a function declaration for the vectorized function. */ + new_fndecl = build_decl (FUNCTION_DECL, get_identifier (name), fntype); + TREE_PUBLIC (new_fndecl) = 1; + DECL_EXTERNAL (new_fndecl) = 1; + DECL_IS_NOVOPS (new_fndecl) = 1; + TREE_READONLY (new_fndecl) = 1; + + return new_fndecl; +} + +/* Handler for an ACML-style interface to + a library with vectorized intrinsics. */ static tree ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in) @@ -21417,8 +21826,8 @@ ix86_builtin_reciprocal (unsigned int fn, bool md_fn, switch (fn) { /* Vectorized version of sqrt to rsqrt conversion. */ - case IX86_BUILTIN_SQRTPS: - return ix86_builtins[IX86_BUILTIN_RSQRTPS]; + case IX86_BUILTIN_SQRTPS_NR: + return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR]; default: return NULL_TREE; @@ -21914,7 +22323,7 @@ ix86_register_move_cost (enum machine_mode mode, enum reg_class class1, where integer modes in MMX/SSE registers are not tieable because of missing QImode and HImode moves to, from or between MMX/SSE registers. */ - return MAX (ix86_cost->mmxsse_to_integer, 8); + return MAX (8, ix86_cost->mmxsse_to_integer); if (MAYBE_FLOAT_CLASS_P (class1)) return ix86_cost->fp_move; @@ -22202,7 +22611,7 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int *total) nbits = 7; /* Compute costs correctly for widening multiplication. */ - if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op1) == ZERO_EXTEND) + if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND) && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2 == GET_MODE_SIZE (mode)) { @@ -22573,6 +22982,7 @@ x86_this_parameter (tree function) { tree type = TREE_TYPE (function); bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0; + int nregs; if (TARGET_64BIT) { @@ -22585,11 +22995,25 @@ x86_this_parameter (tree function) return gen_rtx_REG (DImode, parm_regs[aggr]); } - if (ix86_function_regparm (type, function) > 0 && !stdarg_p (type)) + nregs = ix86_function_regparm (type, function); + + if (nregs > 0 && !stdarg_p (type)) { - int regno = AX_REG; + int regno; + if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type))) - regno = CX_REG; + regno = aggr ? 
DX_REG : CX_REG; + else + { + regno = AX_REG; + if (aggr) + { + regno = DX_REG; + if (nregs == 1) + return gen_rtx_MEM (SImode, + plus_constant (stack_pointer_rtx, 4)); + } + } return gen_rtx_REG (SImode, regno); } @@ -23551,7 +23975,9 @@ ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals) for (i = 0; i < n_elts; ++i) { x = XVECEXP (vals, 0, i); - if (!CONSTANT_P (x)) + if (!(CONST_INT_P (x) + || GET_CODE (x) == CONST_DOUBLE + || GET_CODE (x) == CONST_FIXED)) n_var++, one_var = i; else if (x != CONST0_RTX (inner_mode)) all_const_zero = false; @@ -24128,7 +24554,7 @@ void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode) /* a / b = a * rcp(b) * (2.0 - b * rcp(b)) */ - /* x0 = 1./b estimate */ + /* x0 = rcp(b) estimate */ emit_insn (gen_rtx_SET (VOIDmode, x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b), UNSPEC_RCP))); @@ -24152,7 +24578,8 @@ void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode) void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode, bool recip) { - rtx x0, e0, e1, e2, e3, three, half, zero, mask; + rtx x0, e0, e1, e2, e3, mthree, mhalf; + REAL_VALUE_TYPE r; x0 = gen_reg_rtx (mode); e0 = gen_reg_rtx (mode); @@ -24160,42 +24587,41 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode, e2 = gen_reg_rtx (mode); e3 = gen_reg_rtx (mode); - three = CONST_DOUBLE_FROM_REAL_VALUE (dconst3, SFmode); - half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, SFmode); + real_from_integer (&r, VOIDmode, -3, -1, 0); + mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode); - mask = gen_reg_rtx (mode); + real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL); + mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode); if (VECTOR_MODE_P (mode)) { - three = ix86_build_const_vector (SFmode, true, three); - half = ix86_build_const_vector (SFmode, true, half); + mthree = ix86_build_const_vector (SFmode, true, mthree); + mhalf = ix86_build_const_vector (SFmode, true, mhalf); } - three = force_reg (mode, three); - half = force_reg (mode, half); - - zero = force_reg (mode, CONST0_RTX(mode)); + /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) + rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */ - /* sqrt(a) = 0.5 * a * rsqrtss(a) * (3.0 - a * rsqrtss(a) * rsqrtss(a)) - 1.0 / sqrt(a) = 0.5 * rsqrtss(a) * (3.0 - a * rsqrtss(a) * rsqrtss(a)) */ - - /* Compare a to zero. */ - emit_insn (gen_rtx_SET (VOIDmode, mask, - gen_rtx_NE (mode, a, zero))); - - /* x0 = 1./sqrt(a) estimate */ + /* x0 = rsqrt(a) estimate */ emit_insn (gen_rtx_SET (VOIDmode, x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a), UNSPEC_RSQRT))); - /* Filter out infinity. */ - if (VECTOR_MODE_P (mode)) - emit_insn (gen_rtx_SET (VOIDmode, gen_lowpart (V4SFmode, x0), - gen_rtx_AND (mode, - gen_lowpart (V4SFmode, x0), - gen_lowpart (V4SFmode, mask)))); - else - emit_insn (gen_rtx_SET (VOIDmode, x0, - gen_rtx_AND (mode, x0, mask))); + + /* If a == 0.0, mask out the infinite rsqrt(0.0) estimate so that + sqrt(0.0) yields 0.0 rather than NaN. */ + if (!recip) + { + rtx zero, mask; + + mask = gen_reg_rtx (mode); + + zero = force_reg (mode, CONST0_RTX (mode)); + emit_insn (gen_rtx_SET (VOIDmode, mask, + gen_rtx_NE (mode, zero, a))); + + emit_insn (gen_rtx_SET (VOIDmode, x0, + gen_rtx_AND (mode, x0, mask))); + } /* e0 = x0 * a */ emit_insn (gen_rtx_SET (VOIDmode, e0, @@ -24203,17 +24629,21 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode, /* e1 = e0 * x0 */ emit_insn (gen_rtx_SET (VOIDmode, e1, gen_rtx_MULT (mode, e0, x0))); - /* e2 = 3. - e1 */ + + /* e2 = e1 - 3. 
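+     Here e1 = a*x0*x0, so e2 = a*x0*x0 - 3.0, and the two negated
+     constants cancel in the final product:
+       res = e3*e2 = -0.5*e0 * (a*x0*x0 - 3.0)
+                   = 0.5*a*x0 * (3.0 - a*x0*x0),
+     one Newton-Raphson step for sqrt; for rsqrt, e3 = -0.5*x0 drops
+     the leading factor of a.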
*/ + mthree = force_reg (mode, mthree); emit_insn (gen_rtx_SET (VOIDmode, e2, - gen_rtx_MINUS (mode, three, e1))); + gen_rtx_PLUS (mode, e1, mthree))); + + mhalf = force_reg (mode, mhalf); if (recip) - /* e3 = .5 * x0 */ + /* e3 = -.5 * x0 */ emit_insn (gen_rtx_SET (VOIDmode, e3, - gen_rtx_MULT (mode, half, x0))); + gen_rtx_MULT (mode, x0, mhalf))); else - /* e3 = .5 * e0 */ + /* e3 = -.5 * e0 */ emit_insn (gen_rtx_SET (VOIDmode, e3, - gen_rtx_MULT (mode, half, e0))); + gen_rtx_MULT (mode, e0, mhalf))); /* ret = e2 * e3 */ emit_insn (gen_rtx_SET (VOIDmode, res, gen_rtx_MULT (mode, e2, e3))); @@ -25235,6 +25665,9 @@ x86_builtin_vectorization_cost (bool runtime_test) #undef TARGET_BUILD_BUILTIN_VA_LIST #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list +#undef TARGET_EXPAND_BUILTIN_VA_START +#define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start + #undef TARGET_MD_ASM_CLOBBERS #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
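/* For reference, a scalar C rendition of the sequence ix86_emit_swsqrtsf
   builds, following the patch's own step comments; a sketch rather than
   code from the patch, with rsqrt_estimate standing in for the hardware
   rsqrtss/rsqrtps approximation (roughly 12 bits of precision).  The new
   SQRTPS_NR/RSQRTPS_NR builtins are the vectorizer's entry points into
   this expansion (see ix86_builtin_vectorized_function and
   ix86_builtin_reciprocal above).

     static float
     swsqrt (float a, int recip)
     {
       float x0, e0, e1, e2, e3;

       x0 = rsqrt_estimate (a);
       if (!recip && a == 0.0f)
         x0 = 0.0f;
       e0 = x0 * a;
       e1 = e0 * x0;
       e2 = e1 - 3.0f;
       e3 = (recip ? x0 : e0) * -0.5f;
       return e2 * e3;
     }
*/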