/* Subroutines used for code generation on IA-32.
Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
- 2002, 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
+ 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ Free Software Foundation, Inc.
This file is part of GCC.
to limit number of prefetches at all, as their execution also takes some
time). */
100, /* number of parallel prefetches */
- 5, /* Branch cost */
+ 3, /* Branch cost */
COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (4), /* cost of FMUL instruction. */
COSTS_N_INSNS (19), /* cost of FDIV instruction. */
2, /* vec_align_load_cost. */
3, /* vec_unalign_load_cost. */
3, /* vec_store_cost. */
- 6, /* cond_taken_branch_cost. */
- 1, /* cond_not_taken_branch_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 2, /* cond_not_taken_branch_cost. */
};
struct processor_costs amdfam10_cost = {
to limit number of prefetches at all, as their execution also takes some
time). */
100, /* number of parallel prefetches */
- 5, /* Branch cost */
+ 2, /* Branch cost */
COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
COSTS_N_INSNS (4), /* cost of FMUL instruction. */
COSTS_N_INSNS (19), /* cost of FDIV instruction. */
2, /* vec_align_load_cost. */
2, /* vec_unalign_load_cost. */
2, /* vec_store_cost. */
- 6, /* cond_taken_branch_cost. */
+ 2, /* cond_taken_branch_cost. */
1, /* cond_not_taken_branch_cost. */
};
replacement is long decoded, so this split helps here as well. */
m_K6,
- /* X86_TUNE_USE_VECTOR_CONVERTS: Preffer vector packed SSE conversion
+ /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
from integer to FP. */
m_AMDFAM10,
};
was set or cleared on the command line. */
static int ix86_isa_flags_explicit;
-/* Define a set of ISAs which aren't available for a given ISA. MMX
- and SSE ISAs are handled separately. */
+/* Define a set of ISAs which are available when a given ISA is
+ enabled. MMX and SSE ISAs are handled separately. */
+
+#define OPTION_MASK_ISA_MMX_SET OPTION_MASK_ISA_MMX
+#define OPTION_MASK_ISA_3DNOW_SET \
+ (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_MMX_SET)
+
+#define OPTION_MASK_ISA_SSE_SET OPTION_MASK_ISA_SSE
+#define OPTION_MASK_ISA_SSE2_SET \
+ (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE_SET)
+#define OPTION_MASK_ISA_SSE3_SET \
+ (OPTION_MASK_ISA_SSE3 | OPTION_MASK_ISA_SSE2_SET)
+#define OPTION_MASK_ISA_SSSE3_SET \
+ (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_SSE3_SET)
+#define OPTION_MASK_ISA_SSE4_1_SET \
+ (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSSE3_SET)
+#define OPTION_MASK_ISA_SSE4_2_SET \
+ (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_SSE4_1_SET)
+
+/* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
+ as -msse4.2. */
+#define OPTION_MASK_ISA_SSE4_SET OPTION_MASK_ISA_SSE4_2_SET
+
+#define OPTION_MASK_ISA_SSE4A_SET \
+ (OPTION_MASK_ISA_SSE4A | OPTION_MASK_ISA_SSE3_SET)
+#define OPTION_MASK_ISA_SSE5_SET \
+ (OPTION_MASK_ISA_SSE5 | OPTION_MASK_ISA_SSE4A_SET)
+
+/* Define a set of ISAs which aren't available when a given ISA is
+ disabled. MMX and SSE ISAs are handled separately. */
#define OPTION_MASK_ISA_MMX_UNSET \
- (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_3DNOW_UNSET)
-#define OPTION_MASK_ISA_3DNOW_UNSET OPTION_MASK_ISA_3DNOW_A
+ (OPTION_MASK_ISA_MMX | OPTION_MASK_ISA_3DNOW_UNSET)
+#define OPTION_MASK_ISA_3DNOW_UNSET \
+ (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_3DNOW_A_UNSET)
+#define OPTION_MASK_ISA_3DNOW_A_UNSET OPTION_MASK_ISA_3DNOW_A
#define OPTION_MASK_ISA_SSE_UNSET \
- (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE2_UNSET)
+ (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_SSE2_UNSET)
#define OPTION_MASK_ISA_SSE2_UNSET \
- (OPTION_MASK_ISA_SSE3 | OPTION_MASK_ISA_SSE3_UNSET)
+ (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE3_UNSET)
#define OPTION_MASK_ISA_SSE3_UNSET \
- (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_SSSE3_UNSET)
+ (OPTION_MASK_ISA_SSE3 \
+ | OPTION_MASK_ISA_SSSE3_UNSET \
+ | OPTION_MASK_ISA_SSE4A_UNSET )
#define OPTION_MASK_ISA_SSSE3_UNSET \
- (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSE4_1_UNSET)
+ (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_SSE4_1_UNSET)
#define OPTION_MASK_ISA_SSE4_1_UNSET \
- (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_SSE4_2_UNSET)
-#define OPTION_MASK_ISA_SSE4_2_UNSET OPTION_MASK_ISA_SSE4A
+ (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSE4_2_UNSET)
+#define OPTION_MASK_ISA_SSE4_2_UNSET OPTION_MASK_ISA_SSE4_2
-/* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
- as -msse4.1 -msse4.2. -mno-sse4 should the same as -mno-sse4.1. */
-#define OPTION_MASK_ISA_SSE4 \
- (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSE4_2)
+/* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same
+ as -mno-sse4.1. */
#define OPTION_MASK_ISA_SSE4_UNSET OPTION_MASK_ISA_SSE4_1_UNSET
-#define OPTION_MASK_ISA_SSE4A_UNSET OPTION_MASK_ISA_SSE4
+#define OPTION_MASK_ISA_SSE4A_UNSET \
+ (OPTION_MASK_ISA_SSE4A | OPTION_MASK_ISA_SSE5_UNSET)
-#define OPTION_MASK_ISA_SSE5_UNSET \
- (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_3DNOW_UNSET)
+#define OPTION_MASK_ISA_SSE5_UNSET OPTION_MASK_ISA_SSE5
/* Vectorization library interface and handlers. */
tree (*ix86_veclib_handler)(enum built_in_function, tree, tree) = NULL;
switch (code)
{
case OPT_mmmx:
- ix86_isa_flags_explicit |= OPTION_MASK_ISA_MMX;
- if (!value)
+ if (value)
+ {
+ ix86_isa_flags |= OPTION_MASK_ISA_MMX_SET;
+ ix86_isa_flags_explicit |= OPTION_MASK_ISA_MMX_SET;
+ }
+ else
{
ix86_isa_flags &= ~OPTION_MASK_ISA_MMX_UNSET;
ix86_isa_flags_explicit |= OPTION_MASK_ISA_MMX_UNSET;
return true;
case OPT_m3dnow:
- ix86_isa_flags_explicit |= OPTION_MASK_ISA_3DNOW;
- if (!value)
+ if (value)
+ {
+ ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_SET;
+ ix86_isa_flags_explicit |= OPTION_MASK_ISA_3DNOW_SET;
+ }
+ else
{
ix86_isa_flags &= ~OPTION_MASK_ISA_3DNOW_UNSET;
ix86_isa_flags_explicit |= OPTION_MASK_ISA_3DNOW_UNSET;
return false;
case OPT_msse:
- ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE;
- if (!value)
+ if (value)
+ {
+ ix86_isa_flags |= OPTION_MASK_ISA_SSE_SET;
+ ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE_SET;
+ }
+ else
{
ix86_isa_flags &= ~OPTION_MASK_ISA_SSE_UNSET;
ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE_UNSET;
return true;
case OPT_msse2:
- ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2;
- if (!value)
+ if (value)
+ {
+ ix86_isa_flags |= OPTION_MASK_ISA_SSE2_SET;
+ ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2_SET;
+ }
+ else
{
ix86_isa_flags &= ~OPTION_MASK_ISA_SSE2_UNSET;
ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2_UNSET;
return true;
case OPT_msse3:
- ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE3;
- if (!value)
+ if (value)
+ {
+ ix86_isa_flags |= OPTION_MASK_ISA_SSE3_SET;
+ ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE3_SET;
+ }
+ else
{
ix86_isa_flags &= ~OPTION_MASK_ISA_SSE3_UNSET;
ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE3_UNSET;
return true;
case OPT_mssse3:
- ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSSE3;
- if (!value)
+ if (value)
+ {
+ ix86_isa_flags |= OPTION_MASK_ISA_SSSE3_SET;
+ ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSSE3_SET;
+ }
+ else
{
ix86_isa_flags &= ~OPTION_MASK_ISA_SSSE3_UNSET;
ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSSE3_UNSET;
return true;
case OPT_msse4_1:
- ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_1;
- if (!value)
+ if (value)
+ {
+ ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1_SET;
+ ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_1_SET;
+ }
+ else
{
ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_1_UNSET;
ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_1_UNSET;
return true;
case OPT_msse4_2:
- ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_2;
- if (!value)
+ if (value)
+ {
+ ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2_SET;
+ ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_2_SET;
+ }
+ else
{
ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_2_UNSET;
ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_2_UNSET;
return true;
case OPT_msse4:
- ix86_isa_flags |= OPTION_MASK_ISA_SSE4;
- ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4;
+ ix86_isa_flags |= OPTION_MASK_ISA_SSE4_SET;
+ ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_SET;
return true;
case OPT_mno_sse4:
return true;
case OPT_msse4a:
- ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4A;
- if (!value)
+ if (value)
+ {
+ ix86_isa_flags |= OPTION_MASK_ISA_SSE4A_SET;
+ ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4A_SET;
+ }
+ else
{
ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4A_UNSET;
ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4A_UNSET;
return true;
case OPT_msse5:
- ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE5;
- if (!value)
+ if (value)
+ {
+ ix86_isa_flags |= OPTION_MASK_ISA_SSE5_SET;
+ ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE5_SET;
+ }
+ else
{
ix86_isa_flags &= ~OPTION_MASK_ISA_SSE5_UNSET;
ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE5_UNSET;
{&amdfam10_cost, 32, 24, 32, 7, 32}
};
- static const char * const cpu_names[] = TARGET_CPU_DEFAULT_NAMES;
+ static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
+ {
+ "generic",
+ "i386",
+ "i486",
+ "pentium",
+ "pentium-mmx",
+ "pentiumpro",
+ "pentium2",
+ "pentium3",
+ "pentium4",
+ "pentium-m",
+ "prescott",
+ "nocona",
+ "core2",
+ "geode",
+ "k6",
+ "k6-2",
+ "k6-3",
+ "athlon",
+ "athlon-4",
+ "k8",
+ "amdfam10"
+ };
+
enum pta_flags
{
PTA_SSE = 1 << 0,
ix86_tune_string = ix86_arch_string;
if (!ix86_tune_string)
{
- ix86_tune_string = cpu_names [TARGET_CPU_DEFAULT];
+ ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
ix86_tune_defaulted = 1;
}
if (!TARGET_80387)
target_flags |= MASK_NO_FANCY_MATH_387;
- /* Turn on SSE4A bultins for -msse5. */
- if (TARGET_SSE5)
- ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
-
- /* Turn on SSE4.1 builtins for -msse4.2. */
- if (TARGET_SSE4_2)
- ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
-
- /* Turn on SSSE3 builtins for -msse4.1. */
- if (TARGET_SSE4_1)
- ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
-
- /* Turn on SSE3 builtins for -mssse3. */
- if (TARGET_SSSE3)
- ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
-
- /* Turn on SSE3 builtins for -msse4a. */
- if (TARGET_SSE4A)
- ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
-
- /* Turn on SSE2 builtins for -msse3. */
- if (TARGET_SSE3)
- ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
-
- /* Turn on SSE builtins for -msse2. */
- if (TARGET_SSE2)
- ix86_isa_flags |= OPTION_MASK_ISA_SSE;
-
/* Turn on MMX builtins for -msse. */
if (TARGET_SSE)
{
x86_prefetch_sse = true;
}
- /* Turn on MMX builtins for 3Dnow. */
- if (TARGET_3DNOW)
- ix86_isa_flags |= OPTION_MASK_ISA_MMX;
-
/* Turn on popcnt instruction for -msse4.2 or -mabm. */
if (TARGET_SSE4_2 || TARGET_ABM)
x86_popcnt = true;
target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
}
+ /* If stack probes are required, the space used for large function
+ arguments on the stack must also be probed, so enable
+ -maccumulate-outgoing-args so this happens in the prologue. */
+ if (TARGET_STACK_PROBE
+ && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
+ {
+ if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
+ warning (0, "stack probing requires -maccumulate-outgoing-args "
+ "for correctness");
+ target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
+ }
+
/* For sane SSE instruction set generation we need fcomi instruction.
It is safe to enable all CMOVE instructions. */
if (TARGET_SSE)
set_param_value ("l1-cache-size", ix86_cost->l1_cache_size);
if (!PARAM_SET_P (PARAM_L2_CACHE_SIZE))
set_param_value ("l2-cache-size", ix86_cost->l2_cache_size);
+
+ /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
+ can be optimized to ap = __builtin_next_arg (0). */
+ if (!TARGET_64BIT || TARGET_64BIT_MS_ABI)
+ targetm.expand_builtin_va_start = NULL;
}
\f
/* Return true if this goes in large data/bss. */
flag_omit_frame_pointer = 2;
flag_pcc_struct_return = 2;
flag_asynchronous_unwind_tables = 2;
+ flag_vect_cost_model = 1;
#ifdef SUBTARGET_OPTIMIZATION_OPTIONS
SUBTARGET_OPTIMIZATION_OPTIONS;
#endif
struct function *f;
/* Make sure no regparm register is taken by a
- global register variable. */
- for (local_regparm = 0; local_regparm < 3; local_regparm++)
- if (global_regs[local_regparm])
+ fixed register variable. */
+ for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
+ if (fixed_regs[local_regparm])
break;
/* We can't use regparm(3) for nested functions as these use
TYPE_ATTRIBUTES (TREE_TYPE (decl)))))
local_regparm = 2;
- /* Each global register variable increases register preassure,
- so the more global reg vars there are, the smaller regparm
- optimization use, unless requested by the user explicitly. */
- for (regno = 0; regno < 6; regno++)
- if (global_regs[regno])
+ /* Each fixed register usage increases register pressure,
+ so less registers should be used for argument passing.
+ This functionality can be overriden by an explicit
+ regparm value. */
+ for (regno = 0; regno <= DI_REG; regno++)
+ if (fixed_regs[regno])
globals++;
+
local_regparm
= globals < local_regparm ? local_regparm - globals : 0;
indirectly or considering a libcall. Otherwise return 0. */
static int
-ix86_function_sseregparm (const_tree type, const_tree decl)
+ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
{
gcc_assert (!TARGET_64BIT);
{
if (!TARGET_SSE)
{
- if (decl)
- error ("Calling %qD with attribute sseregparm without "
- "SSE/SSE2 enabled", decl);
- else
- error ("Calling %qT with attribute sseregparm without "
- "SSE/SSE2 enabled", type);
+ if (warn)
+ {
+ if (decl)
+ error ("Calling %qD with attribute sseregparm without "
+ "SSE/SSE2 enabled", decl);
+ else
+ error ("Calling %qT with attribute sseregparm without "
+ "SSE/SSE2 enabled", type);
+ }
return 0;
}
rtx libname, /* SYMBOL_REF of library name or 0 */
tree fndecl)
{
+ struct cgraph_local_info *i = fndecl ? cgraph_local_info (fndecl) : NULL;
memset (cum, 0, sizeof (*cum));
/* Set up the number of registers to use for passing arguments. */
cum->mmx_nregs = MMX_REGPARM_MAX;
cum->warn_sse = true;
cum->warn_mmx = true;
+
+ /* Because type might mismatch in between caller and callee, we need to
+ use actual type of function for local calls.
+ FIXME: cgraph_analyze can be told to actually record if function uses
+ va_start so for local functions maybe_vaarg can be made aggressive
+ helping K&R code.
+ FIXME: once typesytem is fixed, we won't need this code anymore. */
+ if (i && i->local)
+ fntype = TREE_TYPE (fndecl);
cum->maybe_vaarg = (fntype
? (!prototype_p (fntype) || stdarg_p (fntype))
: !libname);
/* Set up the number of SSE registers used for passing SFmode
and DFmode arguments. Warn for mismatching ABI. */
- cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl);
+ cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
}
}
}
/* for V1xx modes, just use the base mode */
- if (VECTOR_MODE_P (mode)
+ if (VECTOR_MODE_P (mode) && mode != V1DImode
&& GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
mode = GET_MODE_INNER (mode);
classes[0] = X86_64_SSE_CLASS;
classes[1] = X86_64_SSEUP_CLASS;
return 2;
+ case V1DImode:
case V2SFmode:
case V2SImode:
case V4HImode:
case V4HImode:
case V2SImode:
case V2SFmode:
+ case V1DImode:
if (!type || !AGGREGATE_TYPE_P (type))
{
cum->mmx_words += words;
int regno = cum->regno;
/* Fastcall allocates the first two DWORD (SImode) or
- smaller arguments to ECX and EDX. */
+ smaller arguments to ECX and EDX if it isn't an
+ aggregate type . */
if (cum->fastcall)
{
- if (mode == BLKmode || mode == DImode)
+ if (mode == BLKmode
+ || mode == DImode
+ || (type && AGGREGATE_TYPE_P (type)))
break;
/* ECX not EAX is the first allocated register. */
case V4HImode:
case V2SImode:
case V2SFmode:
+ case V1DImode:
if (!type || !AGGREGATE_TYPE_P (type))
{
if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
SSE math is enabled or for functions with sseregparm attribute. */
if ((fn || fntype) && (mode == SFmode || mode == DFmode))
{
- int sse_level = ix86_function_sseregparm (fntype, fn);
+ int sse_level = ix86_function_sseregparm (fntype, fn, false);
if ((sse_level >= 1 && mode == SFmode)
|| (sse_level == 2 && mode == DFmode))
regno = FIRST_SSE_REG;
/* Implement va_start. */
-void
+static void
ix86_va_start (tree valist, rtx nextarg)
{
HOST_WIDE_INT words, n_gpr, n_fpr;
allocate += frame.nregs * UNITS_PER_WORD;
/* When using red zone we may start register saving before allocating
- the stack frame saving one cycle of the prologue. */
- if (TARGET_RED_ZONE && frame.save_regs_using_mov)
+ the stack frame saving one cycle of the prologue. However I will
+ avoid doing this if I am going to have to probe the stack since
+ at least on x86_64 the stack probe can turn into a call that clobbers
+ a red zone location */
+ if (TARGET_RED_ZONE && frame.save_regs_using_mov
+ && (! TARGET_STACK_PROBE || allocate < CHECK_STACK_LIMIT))
ix86_emit_save_regs_using_mov (frame_pointer_needed ? hard_frame_pointer_rtx
: stack_pointer_rtx,
-frame.nregs * UNITS_PER_WORD);
}
}
- if (frame.save_regs_using_mov && !TARGET_RED_ZONE)
+ if (frame.save_regs_using_mov
+ && !(TARGET_RED_ZONE
+ && (! TARGET_STACK_PROBE || allocate < CHECK_STACK_LIMIT)))
{
if (!frame_pointer_needed || !frame.to_allocate)
ix86_emit_save_regs_using_mov (stack_pointer_rtx, frame.to_allocate);
/* Special case: on K6, [%esi] makes the instruction vector decoded.
Avoid this by transforming to [%esi+0]. */
- if (ix86_tune == PROCESSOR_K6 && !optimize_size
+ if (TARGET_K6 && !optimize_size
&& base_reg && !index_reg && !disp
&& REG_P (base_reg)
&& REGNO_REG_CLASS (REGNO (base_reg)) == SIREG)
fputs ("@PLTOFF", file);
break;
case UNSPEC_GOTPCREL:
- fputs ("@GOTPCREL(%rip)", file);
+ fputs (ASSEMBLER_DIALECT == ASM_ATT ?
+ "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
break;
case UNSPEC_GOTTPOFF:
/* FIXME: This might be @TPOFF in Sun ld too. */
break;
case UNSPEC_GOTNTPOFF:
if (TARGET_64BIT)
- fputs ("@GOTTPOFF(%rip)", file);
+ fputs (ASSEMBLER_DIALECT == ASM_ATT ?
+ "@GOTTPOFF(%rip)": "@GOTTPOFF[rip]", file);
else
fputs ("@GOTNTPOFF", file);
break;
void
print_reg (rtx x, int code, FILE *file)
{
- gcc_assert (REGNO (x) != ARG_POINTER_REGNUM
- && REGNO (x) != FRAME_POINTER_REGNUM
- && REGNO (x) != FLAGS_REG
- && REGNO (x) != FPSR_REG
- && REGNO (x) != FPCR_REG);
+ gcc_assert (x == pc_rtx
+ || (REGNO (x) != ARG_POINTER_REGNUM
+ && REGNO (x) != FRAME_POINTER_REGNUM
+ && REGNO (x) != FLAGS_REG
+ && REGNO (x) != FPSR_REG
+ && REGNO (x) != FPCR_REG));
- if (ASSEMBLER_DIALECT == ASM_ATT || USER_LABEL_PREFIX[0] == 0)
+ if (ASSEMBLER_DIALECT == ASM_ATT)
putc ('%', file);
+ if (x == pc_rtx)
+ {
+ gcc_assert (TARGET_64BIT);
+ fputs ("rip", file);
+ return;
+ }
+
if (code == 'w' || MMX_REG_P (x))
code = 2;
else if (code == 'b')
else if (MEM_P (x))
{
- /* No `byte ptr' prefix for call instructions. */
- if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
+ /* No `byte ptr' prefix for call instructions or BLKmode operands. */
+ if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
+ && GET_MODE (x) != BLKmode)
{
const char * size;
switch (GET_MODE_SIZE (GET_MODE (x)))
case 4: size = "DWORD"; break;
case 8: size = "QWORD"; break;
case 12: size = "XWORD"; break;
- case 16: size = "XMMWORD"; break;
+ case 16:
+ if (GET_MODE (x) == XFmode)
+ size = "XWORD";
+ else
+ size = "XMMWORD";
+ break;
default:
gcc_unreachable ();
}
break;
case SEG_FS:
case SEG_GS:
- if (USER_LABEL_PREFIX[0] == 0)
+ if (ASSEMBLER_DIALECT == ASM_ATT)
putc ('%', file);
fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
break;
gcc_unreachable ();
}
+ /* Use one byte shorter RIP relative addressing for 64bit mode. */
+ if (TARGET_64BIT && !base && !index)
+ {
+ rtx symbol = disp;
+
+ if (GET_CODE (disp) == CONST
+ && GET_CODE (XEXP (disp, 0)) == PLUS
+ && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
+ symbol = XEXP (XEXP (disp, 0), 0);
+
+ if (GET_CODE (symbol) == LABEL_REF
+ || (GET_CODE (symbol) == SYMBOL_REF
+ && SYMBOL_REF_TLS_MODEL (symbol) == 0))
+ base = pc_rtx;
+ }
if (!base && !index)
{
/* Displacement only requires special attention. */
if (CONST_INT_P (disp))
{
if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
- {
- if (USER_LABEL_PREFIX[0] == 0)
- putc ('%', file);
- fputs ("ds:", file);
- }
+ fputs ("ds:", file);
fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
}
else if (flag_pic)
output_pic_addr_const (file, disp, 0);
else
output_addr_const (file, disp);
-
- /* Use one byte shorter RIP relative addressing for 64bit mode. */
- if (TARGET_64BIT)
- {
- if (GET_CODE (disp) == CONST
- && GET_CODE (XEXP (disp, 0)) == PLUS
- && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
- disp = XEXP (XEXP (disp, 0), 0);
- if (GET_CODE (disp) == LABEL_REF
- || (GET_CODE (disp) == SYMBOL_REF
- && SYMBOL_REF_TLS_MODEL (disp) == 0))
- fputs ("(%rip)", file);
- }
}
else
{
case UNSPEC_GOTNTPOFF:
output_addr_const (file, op);
if (TARGET_64BIT)
- fputs ("@GOTTPOFF(%rip)", file);
+ fputs (ASSEMBLER_DIALECT == ASM_ATT ?
+ "@GOTTPOFF(%rip)" : "@GOTTPOFF[rip]", file);
else
fputs ("@GOTNTPOFF", file);
break;
ix86_expand_vector_extract (false, target, fp_xmm, 0);
}
+/* Not used, but eases macroization of patterns. */
+void
+ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
+ rtx input ATTRIBUTE_UNUSED)
+{
+ gcc_unreachable ();
+}
+
/* Convert an unsigned SImode value into a DFmode. Only currently used
for SSE, but applicable anywhere. */
rtx operands[])
{
rtx mask, set, use, clob, dst, src;
- bool matching_memory;
bool use_sse = false;
bool vector_mode = VECTOR_MODE_P (mode);
enum machine_mode elt_mode = mode;
dst = operands[0];
src = operands[1];
- /* If the destination is memory, and we don't have matching source
- operands or we're using the x87, do things in registers. */
- matching_memory = false;
- if (MEM_P (dst))
- {
- if (use_sse && rtx_equal_p (dst, src))
- matching_memory = true;
- else
- dst = gen_reg_rtx (mode);
- }
- if (MEM_P (src) && !matching_memory)
- src = force_reg (mode, src);
-
if (vector_mode)
{
set = gen_rtx_fmt_ee (code == NEG ? XOR : AND, mode, src, mask);
else
emit_insn (set);
}
-
- if (dst != operands[0])
- emit_move_insn (operands[0], dst);
}
/* Expand a copysign operation. Special case operand 0 being a constant. */
/* Otherwise, if we are doing less-than or greater-or-equal-than,
op1 is a constant and the low word is zero, then we can just
- examine the high word. */
+ examine the high word. Similarly for low word -1 and
+ less-or-equal-than or greater-than. */
- if (CONST_INT_P (hi[1]) && lo[1] == const0_rtx)
+ if (CONST_INT_P (hi[1]))
switch (code)
{
case LT: case LTU: case GE: case GEU:
- ix86_compare_op0 = hi[0];
- ix86_compare_op1 = hi[1];
- ix86_expand_branch (code, label);
- return;
+ if (lo[1] == const0_rtx)
+ {
+ ix86_compare_op0 = hi[0];
+ ix86_compare_op1 = hi[1];
+ ix86_expand_branch (code, label);
+ return;
+ }
+ break;
+ case LE: case LEU: case GT: case GTU:
+ if (lo[1] == constm1_rtx)
+ {
+ ix86_compare_op0 = hi[0];
+ ix86_compare_op1 = hi[1];
+ ix86_expand_branch (code, label);
+ return;
+ }
+ break;
default:
break;
}
#define PPERM_REV_INV 0x60 /* bit reverse & invert src */
#define PPERM_ZERO 0x80 /* all 0's */
#define PPERM_ONES 0xa0 /* all 1's */
-#define PPERM_SIGN 0xc0 /* propigate sign bit */
-#define PPERM_INV_SIGN 0xe0 /* invert & propigate sign */
+#define PPERM_SIGN 0xc0 /* propagate sign bit */
+#define PPERM_INV_SIGN 0xe0 /* invert & propagate sign */
#define PPERM_SRC1 0x00 /* use first source byte */
#define PPERM_SRC2 0x10 /* use second source byte */
additionally, memset wants eax and memcpy wants esi. Don't
consider such algorithms if the user has appropriated those
registers for their own purposes. */
- bool rep_prefix_usable = !(global_regs[2] || global_regs[5]
- || (memset ? global_regs[0] : global_regs[4]));
+ bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
+ || (memset
+ ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
#define ALG_USABLE_P(alg) (rep_prefix_usable \
|| (alg != rep_prefix_1_byte \
}
/* Expand string move (memcpy) operation. Use i386 string operations when
- profitable. expand_clrmem contains similar code. The code depends upon
+ profitable. expand_setmem contains similar code. The code depends upon
architecture, block size and alignment, but always has the same
overall structure:
if (CONST_INT_P (expected_size_exp) && count == 0)
expected_size = INTVAL (expected_size_exp);
+ /* Make sure we don't need to care about overflow later on. */
+ if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
+ return 0;
+
/* Step 0: Decide on preferred algorithm, desired alignment and
size of chunks to be copied by main loop. */
/* Alignment code needs count to be in register. */
if (CONST_INT_P (count_exp) && desired_align > align)
- {
- enum machine_mode mode = SImode;
- if (TARGET_64BIT && (count & ~0xffffffff))
- mode = DImode;
- count_exp = force_reg (mode, count_exp);
- }
+ count_exp = force_reg (counter_mode (count_exp), count_exp);
gcc_assert (desired_align >= 1 && align >= 1);
/* Ensure that alignment prologue won't copy past end of block. */
Make sure it is power of 2. */
epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
- label = gen_label_rtx ();
- emit_cmp_and_jump_insns (count_exp,
- GEN_INT (epilogue_size_needed),
- LTU, 0, counter_mode (count_exp), 1, label);
- if (GET_CODE (count_exp) == CONST_INT)
- ;
- else if (expected_size == -1 || expected_size < epilogue_size_needed)
- predict_jump (REG_BR_PROB_BASE * 60 / 100);
+ if (CONST_INT_P (count_exp))
+ {
+ if (UINTVAL (count_exp) < (unsigned HOST_WIDE_INT)epilogue_size_needed)
+ goto epilogue;
+ }
else
- predict_jump (REG_BR_PROB_BASE * 20 / 100);
+ {
+ label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (count_exp,
+ GEN_INT (epilogue_size_needed),
+ LTU, 0, counter_mode (count_exp), 1, label);
+ if (expected_size == -1 || expected_size < epilogue_size_needed)
+ predict_jump (REG_BR_PROB_BASE * 60 / 100);
+ else
+ predict_jump (REG_BR_PROB_BASE * 20 / 100);
+ }
}
+
/* Emit code to decide on runtime whether library call or inline should be
used. */
if (dynamic_check != -1)
{
- rtx hot_label = gen_label_rtx ();
- jump_around_label = gen_label_rtx ();
- emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
- LEU, 0, GET_MODE (count_exp), 1, hot_label);
- predict_jump (REG_BR_PROB_BASE * 90 / 100);
- emit_block_move_via_libcall (dst, src, count_exp, false);
- emit_jump (jump_around_label);
- emit_label (hot_label);
+ if (CONST_INT_P (count_exp))
+ {
+ if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
+ {
+ emit_block_move_via_libcall (dst, src, count_exp, false);
+ count_exp = const0_rtx;
+ goto epilogue;
+ }
+ }
+ else
+ {
+ rtx hot_label = gen_label_rtx ();
+ jump_around_label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
+ LEU, 0, GET_MODE (count_exp), 1, hot_label);
+ predict_jump (REG_BR_PROB_BASE * 90 / 100);
+ emit_block_move_via_libcall (dst, src, count_exp, false);
+ emit_jump (jump_around_label);
+ emit_label (hot_label);
+ }
}
/* Step 2: Alignment prologue. */
}
/* Step 4: Epilogue to copy the remaining bytes. */
-
+ epilogue:
if (label)
{
/* When the main loop is done, COUNT_EXP might hold original count,
if (CONST_INT_P (expected_size_exp) && count == 0)
expected_size = INTVAL (expected_size_exp);
+ /* Make sure we don't need to care about overflow later on. */
+ if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
+ return 0;
+
/* Step 0: Decide on preferred algorithm, desired alignment and
size of chunks to be copied by main loop. */
rtx unspec;
/* Can't use this if the user has appropriated eax, ecx, or edi. */
- if (global_regs[0] || global_regs[2] || global_regs[5])
+ if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
return false;
scratch2 = gen_reg_rtx (Pmode);
assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
{
struct stack_local_entry *s;
+ int align;
gcc_assert (n < MAX_386_STACK_LOCALS);
if (s->mode == mode && s->n == n)
return copy_rtx (s->rtl);
+ /* Align DImode slots to their natural alignment
+ to avoid store forwarding stalls. */
+ if (mode == DImode
+ && (GET_MODE_BITSIZE (mode) > GET_MODE_ALIGNMENT (mode)))
+ align = GET_MODE_BITSIZE (mode);
+ else
+ align = 0;
+
s = (struct stack_local_entry *)
ggc_alloc (sizeof (struct stack_local_entry));
s->n = n;
s->mode = mode;
- s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
+ s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), align);
s->next = ix86_stack_locals;
ix86_stack_locals = s;
static int
ia32_multipass_dfa_lookahead (void)
{
- if (ix86_tune == PROCESSOR_PENTIUM)
- return 2;
+ switch (ix86_tune)
+ {
+ case PROCESSOR_PENTIUM:
+ return 2;
- if (ix86_tune == PROCESSOR_PENTIUMPRO
- || ix86_tune == PROCESSOR_K6)
- return 1;
+ case PROCESSOR_PENTIUMPRO:
+ case PROCESSOR_K6:
+ return 1;
- else
- return 0;
+ default:
+ return 0;
+ }
}
\f
int
ix86_constant_alignment (tree exp, int align)
{
- if (TREE_CODE (exp) == REAL_CST)
+ if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
+ || TREE_CODE (exp) == INTEGER_CST)
{
if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
return 64;
IX86_BUILTIN_RCPPS,
IX86_BUILTIN_RCPSS,
IX86_BUILTIN_RSQRTPS,
+ IX86_BUILTIN_RSQRTPS_NR,
IX86_BUILTIN_RSQRTSS,
IX86_BUILTIN_RSQRTF,
IX86_BUILTIN_SQRTPS,
+ IX86_BUILTIN_SQRTPS_NR,
IX86_BUILTIN_SQRTSS,
IX86_BUILTIN_UNPCKHPS,
{ OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, 0 },
- { OPTION_MASK_ISA_SSE, CODE_FOR_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, 0 },
+ { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, 0 },
{ OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, 0 },
{ OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, 0 },
{ OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, 0 },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_adddi3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, 0 },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, 0 },
{ OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, 0 },
{ OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, 0 },
{ OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, 0 },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subdi3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, 0 },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, 0 },
{ OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, 0 },
{ OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, 0, IX86_BUILTIN_CVTSI2SS, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, 0, IX86_BUILTIN_CVTSI642SS, UNKNOWN, 0 },
- { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, 0, IX86_BUILTIN_PSLLW, UNKNOWN, 0 },
- { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, 0, IX86_BUILTIN_PSLLWI, UNKNOWN, 0 },
- { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, 0, IX86_BUILTIN_PSLLD, UNKNOWN, 0 },
- { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, 0, IX86_BUILTIN_PSLLDI, UNKNOWN, 0 },
- { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashldi3, 0, IX86_BUILTIN_PSLLQ, UNKNOWN, 0 },
- { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashldi3, 0, IX86_BUILTIN_PSLLQI, UNKNOWN, 0 },
-
- { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, 0, IX86_BUILTIN_PSRLW, UNKNOWN, 0 },
- { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, 0, IX86_BUILTIN_PSRLWI, UNKNOWN, 0 },
- { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, 0, IX86_BUILTIN_PSRLD, UNKNOWN, 0 },
- { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, 0, IX86_BUILTIN_PSRLDI, UNKNOWN, 0 },
- { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrdi3, 0, IX86_BUILTIN_PSRLQ, UNKNOWN, 0 },
- { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrdi3, 0, IX86_BUILTIN_PSRLQI, UNKNOWN, 0 },
-
- { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, 0, IX86_BUILTIN_PSRAW, UNKNOWN, 0 },
- { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, 0, IX86_BUILTIN_PSRAWI, UNKNOWN, 0 },
- { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, 0, IX86_BUILTIN_PSRAD, UNKNOWN, 0 },
- { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, 0, IX86_BUILTIN_PSRADI, UNKNOWN, 0 },
-
{ OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, 0, IX86_BUILTIN_PSADBW, UNKNOWN, 0 },
{ OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, 0, IX86_BUILTIN_PMADDWD, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, 0, IX86_BUILTIN_PSADBW128, UNKNOWN, 0 },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulsidi3, 0, IX86_BUILTIN_PMULUDQ, UNKNOWN, 0 },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, 0, IX86_BUILTIN_PMULUDQ, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, 0, IX86_BUILTIN_PMULUDQ128, UNKNOWN, 0 },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, 0, IX86_BUILTIN_PSLLWI128, UNKNOWN, 0 },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, 0, IX86_BUILTIN_PSLLDI128, UNKNOWN, 0 },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, 0, IX86_BUILTIN_PSLLQI128, UNKNOWN, 0 },
-
- { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, 0, IX86_BUILTIN_PSRLWI128, UNKNOWN, 0 },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, 0, IX86_BUILTIN_PSRLDI128, UNKNOWN, 0 },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, 0, IX86_BUILTIN_PSRLQI128, UNKNOWN, 0 },
-
- { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, 0, IX86_BUILTIN_PSRAWI128, UNKNOWN, 0 },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, 0, IX86_BUILTIN_PSRADI128, UNKNOWN, 0 },
-
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, 0, IX86_BUILTIN_PMADDWD128, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, 0, IX86_BUILTIN_CVTSI2SD, UNKNOWN, 0 },
static const struct builtin_description bdesc_1arg[] =
{
+ /* SSE */
{ OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, 0, IX86_BUILTIN_PMOVMSKB, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, 0, IX86_BUILTIN_MOVMSKPS, UNKNOWN, 0 },
- { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, 0, IX86_BUILTIN_SQRTPS, UNKNOWN, 0 },
+ { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, 0, IX86_BUILTIN_SQRTPS, UNKNOWN, 0 },
+ { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, 0, IX86_BUILTIN_SQRTPS_NR, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, 0, IX86_BUILTIN_RSQRTPS, UNKNOWN, 0 },
+ { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, 0, IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, 0, IX86_BUILTIN_RCPPS, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, 0, IX86_BUILTIN_CVTPS2PI, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, 0, IX86_BUILTIN_CVTTSS2SI, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, 0, IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, 0 },
+ /* SSE2 */
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, 0, IX86_BUILTIN_PMOVMSKB128, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, 0, IX86_BUILTIN_MOVMSKPD, UNKNOWN, 0 },
tree V16QI_type_node = build_vector_type_for_mode (char_type_node, V16QImode);
tree V2SI_type_node = build_vector_type_for_mode (intSI_type_node, V2SImode);
+ tree V1DI_type_node
+ = build_vector_type_for_mode (long_long_integer_type_node, V1DImode);
tree V2SF_type_node = build_vector_type_for_mode (float_type_node, V2SFmode);
tree V2DI_type_node
= build_vector_type_for_mode (long_long_integer_type_node, V2DImode);
tree v4hi_ftype_v4hi_int
= build_function_type_list (V4HI_type_node,
V4HI_type_node, integer_type_node, NULL_TREE);
- tree v4hi_ftype_v4hi_di
- = build_function_type_list (V4HI_type_node,
- V4HI_type_node, long_long_unsigned_type_node,
- NULL_TREE);
- tree v2si_ftype_v2si_di
+ tree v2si_ftype_v2si_int
= build_function_type_list (V2SI_type_node,
- V2SI_type_node, long_long_unsigned_type_node,
- NULL_TREE);
+ V2SI_type_node, integer_type_node, NULL_TREE);
+ tree v1di_ftype_v1di_int
+ = build_function_type_list (V1DI_type_node,
+ V1DI_type_node, integer_type_node, NULL_TREE);
+
tree void_ftype_void
= build_function_type (void_type_node, void_list_node);
tree void_ftype_unsigned
tree v2si_ftype_v2si_v2si
= build_function_type_list (V2SI_type_node,
V2SI_type_node, V2SI_type_node, NULL_TREE);
- tree di_ftype_di_di
- = build_function_type_list (long_long_unsigned_type_node,
- long_long_unsigned_type_node,
- long_long_unsigned_type_node, NULL_TREE);
+ tree v1di_ftype_v1di_v1di
+ = build_function_type_list (V1DI_type_node,
+ V1DI_type_node, V1DI_type_node, NULL_TREE);
tree di_ftype_di_di_int
= build_function_type_list (long_long_unsigned_type_node,
tree v4si_ftype_v8hi_v8hi
= build_function_type_list (V4SI_type_node,
V8HI_type_node, V8HI_type_node, NULL_TREE);
- tree di_ftype_v8qi_v8qi
- = build_function_type_list (long_long_unsigned_type_node,
+ tree v1di_ftype_v8qi_v8qi
+ = build_function_type_list (V1DI_type_node,
V8QI_type_node, V8QI_type_node, NULL_TREE);
- tree di_ftype_v2si_v2si
- = build_function_type_list (long_long_unsigned_type_node,
+ tree v1di_ftype_v2si_v2si
+ = build_function_type_list (V1DI_type_node,
V2SI_type_node, V2SI_type_node, NULL_TREE);
tree v2di_ftype_v16qi_v16qi
= build_function_type_list (V2DI_type_node,
case V2SImode:
type = v2si_ftype_v2si_v2si;
break;
- case DImode:
- type = di_ftype_di_di;
+ case V1DImode:
+ type = v1di_ftype_v1di_v1di;
break;
default:
/* Add the remaining MMX insns with somewhat more complicated types. */
def_builtin (OPTION_MASK_ISA_MMX, "__builtin_ia32_emms", void_ftype_void, IX86_BUILTIN_EMMS);
- def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSLLW);
- def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_pslld", v2si_ftype_v2si_di, IX86_BUILTIN_PSLLD);
- def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllq", di_ftype_di_di, IX86_BUILTIN_PSLLQ);
-
- def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSRLW);
- def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrld", v2si_ftype_v2si_di, IX86_BUILTIN_PSRLD);
- def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlq", di_ftype_di_di, IX86_BUILTIN_PSRLQ);
- def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psraw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSRAW);
- def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrad", v2si_ftype_v2si_di, IX86_BUILTIN_PSRAD);
+ def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllwi", v4hi_ftype_v4hi_int, IX86_BUILTIN_PSLLWI);
+ def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_pslldi", v2si_ftype_v2si_int, IX86_BUILTIN_PSLLDI);
+ def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllqi", v1di_ftype_v1di_int, IX86_BUILTIN_PSLLQI);
+ def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllw", v4hi_ftype_v4hi_v4hi, IX86_BUILTIN_PSLLW);
+ def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_pslld", v2si_ftype_v2si_v2si, IX86_BUILTIN_PSLLD);
+ def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllq", v1di_ftype_v1di_v1di, IX86_BUILTIN_PSLLQ);
+
+ def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlwi", v4hi_ftype_v4hi_int, IX86_BUILTIN_PSRLWI);
+ def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrldi", v2si_ftype_v2si_int, IX86_BUILTIN_PSRLDI);
+ def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlqi", v1di_ftype_v1di_int, IX86_BUILTIN_PSRLQI);
+ def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlw", v4hi_ftype_v4hi_v4hi, IX86_BUILTIN_PSRLW);
+ def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrld", v2si_ftype_v2si_v2si, IX86_BUILTIN_PSRLD);
+ def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlq", v1di_ftype_v1di_v1di, IX86_BUILTIN_PSRLQ);
+
+ def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrawi", v4hi_ftype_v4hi_int, IX86_BUILTIN_PSRAWI);
+ def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psradi", v2si_ftype_v2si_int, IX86_BUILTIN_PSRADI);
+ def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psraw", v4hi_ftype_v4hi_v4hi, IX86_BUILTIN_PSRAW);
+ def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrad", v2si_ftype_v2si_v2si, IX86_BUILTIN_PSRAD);
def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, "__builtin_ia32_pshufw", v4hi_ftype_v4hi_int, IX86_BUILTIN_PSHUFW);
def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_pmaddwd", v2si_ftype_v4hi_v4hi, IX86_BUILTIN_PMADDWD);
def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, "__builtin_ia32_sfence", void_ftype_void, IX86_BUILTIN_SFENCE);
- def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, "__builtin_ia32_psadbw", di_ftype_v8qi_v8qi, IX86_BUILTIN_PSADBW);
+ def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, "__builtin_ia32_psadbw", v1di_ftype_v8qi_v8qi, IX86_BUILTIN_PSADBW);
def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rcpps", v4sf_ftype_v4sf, IX86_BUILTIN_RCPPS);
def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rcpss", v4sf_ftype_v4sf, IX86_BUILTIN_RCPSS);
def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTPS);
+ def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtps_nr", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTPS_NR);
def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTSS);
ftype = build_function_type_list (float_type_node,
float_type_node,
NULL_TREE);
def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtf", ftype, IX86_BUILTIN_RSQRTF);
def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_sqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTPS);
+ def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_sqrtps_nr", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTPS_NR);
def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_sqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTSS);
def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_shufps", v4sf_ftype_v4sf_v4sf_int, IX86_BUILTIN_SHUFPS);
def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_loaddqu", v16qi_ftype_pcchar, IX86_BUILTIN_LOADDQU);
def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_storedqu", void_ftype_pchar_v16qi, IX86_BUILTIN_STOREDQU);
- def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_pmuludq", di_ftype_v2si_v2si, IX86_BUILTIN_PMULUDQ);
+ def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_pmuludq", v1di_ftype_v2si_v2si, IX86_BUILTIN_PMULUDQ);
def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_pmuludq128", v2di_ftype_v4si_v4si, IX86_BUILTIN_PMULUDQ128);
def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_pslldqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSLLDQI128);
emit_insn (pat);
return target;
- case IX86_BUILTIN_PSLLWI128:
- icode = CODE_FOR_ashlv8hi3;
- goto do_pshifti;
- case IX86_BUILTIN_PSLLDI128:
- icode = CODE_FOR_ashlv4si3;
- goto do_pshifti;
- case IX86_BUILTIN_PSLLQI128:
- icode = CODE_FOR_ashlv2di3;
- goto do_pshifti;
- case IX86_BUILTIN_PSRAWI128:
- icode = CODE_FOR_ashrv8hi3;
- goto do_pshifti;
- case IX86_BUILTIN_PSRADI128:
- icode = CODE_FOR_ashrv4si3;
- goto do_pshifti;
- case IX86_BUILTIN_PSRLWI128:
- icode = CODE_FOR_lshrv8hi3;
- goto do_pshifti;
- case IX86_BUILTIN_PSRLDI128:
- icode = CODE_FOR_lshrv4si3;
- goto do_pshifti;
- case IX86_BUILTIN_PSRLQI128:
- icode = CODE_FOR_lshrv2di3;
- goto do_pshifti;
- do_pshifti:
- arg0 = CALL_EXPR_ARG (exp, 0);
- arg1 = CALL_EXPR_ARG (exp, 1);
- op0 = expand_normal (arg0);
- op1 = expand_normal (arg1);
-
- if (!CONST_INT_P (op1))
- {
- error ("shift must be an immediate");
- return const0_rtx;
- }
- if (INTVAL (op1) < 0 || INTVAL (op1) > 255)
- op1 = GEN_INT (255);
-
- tmode = insn_data[icode].operand[0].mode;
- mode1 = insn_data[icode].operand[1].mode;
- if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
- op0 = copy_to_reg (op0);
-
- target = gen_reg_rtx (tmode);
- pat = GEN_FCN (icode) (target, op0, op1);
- if (!pat)
- return 0;
- emit_insn (pat);
- return target;
+ case IX86_BUILTIN_PSLLW:
+ case IX86_BUILTIN_PSLLWI:
+ icode = CODE_FOR_mmx_ashlv4hi3;
+ goto do_pshift;
+ case IX86_BUILTIN_PSLLD:
+ case IX86_BUILTIN_PSLLDI:
+ icode = CODE_FOR_mmx_ashlv2si3;
+ goto do_pshift;
+ case IX86_BUILTIN_PSLLQ:
+ case IX86_BUILTIN_PSLLQI:
+ icode = CODE_FOR_mmx_ashlv1di3;
+ goto do_pshift;
+ case IX86_BUILTIN_PSRAW:
+ case IX86_BUILTIN_PSRAWI:
+ icode = CODE_FOR_mmx_ashrv4hi3;
+ goto do_pshift;
+ case IX86_BUILTIN_PSRAD:
+ case IX86_BUILTIN_PSRADI:
+ icode = CODE_FOR_mmx_ashrv2si3;
+ goto do_pshift;
+ case IX86_BUILTIN_PSRLW:
+ case IX86_BUILTIN_PSRLWI:
+ icode = CODE_FOR_mmx_lshrv4hi3;
+ goto do_pshift;
+ case IX86_BUILTIN_PSRLD:
+ case IX86_BUILTIN_PSRLDI:
+ icode = CODE_FOR_mmx_lshrv2si3;
+ goto do_pshift;
+ case IX86_BUILTIN_PSRLQ:
+ case IX86_BUILTIN_PSRLQI:
+ icode = CODE_FOR_mmx_lshrv1di3;
+ goto do_pshift;
case IX86_BUILTIN_PSLLW128:
+ case IX86_BUILTIN_PSLLWI128:
icode = CODE_FOR_ashlv8hi3;
goto do_pshift;
case IX86_BUILTIN_PSLLD128:
+ case IX86_BUILTIN_PSLLDI128:
icode = CODE_FOR_ashlv4si3;
goto do_pshift;
case IX86_BUILTIN_PSLLQ128:
+ case IX86_BUILTIN_PSLLQI128:
icode = CODE_FOR_ashlv2di3;
goto do_pshift;
case IX86_BUILTIN_PSRAW128:
+ case IX86_BUILTIN_PSRAWI128:
icode = CODE_FOR_ashrv8hi3;
goto do_pshift;
case IX86_BUILTIN_PSRAD128:
+ case IX86_BUILTIN_PSRADI128:
icode = CODE_FOR_ashrv4si3;
goto do_pshift;
case IX86_BUILTIN_PSRLW128:
+ case IX86_BUILTIN_PSRLWI128:
icode = CODE_FOR_lshrv8hi3;
goto do_pshift;
case IX86_BUILTIN_PSRLD128:
+ case IX86_BUILTIN_PSRLDI128:
icode = CODE_FOR_lshrv4si3;
goto do_pshift;
case IX86_BUILTIN_PSRLQ128:
+ case IX86_BUILTIN_PSRLQI128:
icode = CODE_FOR_lshrv2di3;
- goto do_pshift;
+
do_pshift:
arg0 = CALL_EXPR_ARG (exp, 0);
arg1 = CALL_EXPR_ARG (exp, 1);
if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
op0 = copy_to_reg (op0);
- op1 = simplify_gen_subreg (SImode, op1, GET_MODE (op1), 0);
+ if (!CONST_INT_P (op1))
+ op1 = simplify_gen_subreg (SImode, op1, GET_MODE (op1), 0);
+
if (! (*insn_data[icode].operand[2].predicate) (op1, SImode))
op1 = copy_to_reg (op1);
case BUILT_IN_SQRTF:
if (out_mode == SFmode && out_n == 4
&& in_mode == SFmode && in_n == 4)
- return ix86_builtins[IX86_BUILTIN_SQRTPS];
+ return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
break;
case BUILT_IN_LRINT:
switch (fn)
{
/* Vectorized version of sqrt to rsqrt conversion. */
- case IX86_BUILTIN_SQRTPS:
- return ix86_builtins[IX86_BUILTIN_RSQRTPS];
+ case IX86_BUILTIN_SQRTPS_NR:
+ return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
default:
return NULL_TREE;
where integer modes in MMX/SSE registers are not tieable
because of missing QImode and HImode moves to, from or between
MMX/SSE registers. */
- return MAX (ix86_cost->mmxsse_to_integer, 8);
+ return MAX (8, ix86_cost->mmxsse_to_integer);
if (MAYBE_FLOAT_CLASS_P (class1))
return ix86_cost->fp_move;
nbits = 7;
/* Compute costs correctly for widening multiplication. */
- if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op1) == ZERO_EXTEND)
+ if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
&& GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
== GET_MODE_SIZE (mode))
{
{
tree type = TREE_TYPE (function);
bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
+ int nregs;
if (TARGET_64BIT)
{
return gen_rtx_REG (DImode, parm_regs[aggr]);
}
- if (ix86_function_regparm (type, function) > 0 && !stdarg_p (type))
+ nregs = ix86_function_regparm (type, function);
+
+ if (nregs > 0 && !stdarg_p (type))
{
- int regno = AX_REG;
+ int regno;
+
if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type)))
- regno = CX_REG;
+ regno = aggr ? DX_REG : CX_REG;
+ else
+ {
+ regno = AX_REG;
+ if (aggr)
+ {
+ regno = DX_REG;
+ if (nregs == 1)
+ return gen_rtx_MEM (SImode,
+ plus_constant (stack_pointer_rtx, 4));
+ }
+ }
return gen_rtx_REG (SImode, regno);
}
if (X86_FILE_START_FLTUSED)
fputs ("\t.global\t__fltused\n", asm_out_file);
if (ix86_asm_dialect == ASM_INTEL)
- fputs ("\t.intel_syntax\n", asm_out_file);
+ fputs ("\t.intel_syntax noprefix\n", asm_out_file);
}
int
for (i = 0; i < n_elts; ++i)
{
x = XVECEXP (vals, 0, i);
- if (!CONSTANT_P (x))
+ if (!(CONST_INT_P (x)
+ || GET_CODE (x) == CONST_DOUBLE
+ || GET_CODE (x) == CONST_FIXED))
n_var++, one_var = i;
else if (x != CONST0_RTX (inner_mode))
all_const_zero = false;
/* a / b = a * rcp(b) * (2.0 - b * rcp(b)) */
- /* x0 = 1./b estimate */
+ /* x0 = rcp(b) estimate */
emit_insn (gen_rtx_SET (VOIDmode, x0,
gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
UNSPEC_RCP)));
void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
bool recip)
{
- rtx x0, e0, e1, e2, e3, three, half, zero, mask;
+ rtx x0, e0, e1, e2, e3, mthree, mhalf;
+ REAL_VALUE_TYPE r;
x0 = gen_reg_rtx (mode);
e0 = gen_reg_rtx (mode);
e2 = gen_reg_rtx (mode);
e3 = gen_reg_rtx (mode);
- three = CONST_DOUBLE_FROM_REAL_VALUE (dconst3, SFmode);
- half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, SFmode);
+ real_arithmetic (&r, NEGATE_EXPR, &dconst3, NULL);
+ mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
- mask = gen_reg_rtx (mode);
+ real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
+ mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
if (VECTOR_MODE_P (mode))
{
- three = ix86_build_const_vector (SFmode, true, three);
- half = ix86_build_const_vector (SFmode, true, half);
+ mthree = ix86_build_const_vector (SFmode, true, mthree);
+ mhalf = ix86_build_const_vector (SFmode, true, mhalf);
}
- three = force_reg (mode, three);
- half = force_reg (mode, half);
-
- zero = force_reg (mode, CONST0_RTX(mode));
+ /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
+ rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
- /* sqrt(a) = 0.5 * a * rsqrtss(a) * (3.0 - a * rsqrtss(a) * rsqrtss(a))
- 1.0 / sqrt(a) = 0.5 * rsqrtss(a) * (3.0 - a * rsqrtss(a) * rsqrtss(a)) */
-
- /* Compare a to zero. */
- emit_insn (gen_rtx_SET (VOIDmode, mask,
- gen_rtx_NE (mode, a, zero)));
-
- /* x0 = 1./sqrt(a) estimate */
+ /* x0 = rsqrt(a) estimate */
emit_insn (gen_rtx_SET (VOIDmode, x0,
gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
UNSPEC_RSQRT)));
- /* Filter out infinity. */
- if (VECTOR_MODE_P (mode))
- emit_insn (gen_rtx_SET (VOIDmode, gen_lowpart (V4SFmode, x0),
- gen_rtx_AND (mode,
- gen_lowpart (V4SFmode, x0),
- gen_lowpart (V4SFmode, mask))));
- else
- emit_insn (gen_rtx_SET (VOIDmode, x0,
- gen_rtx_AND (mode, x0, mask)));
+
+ /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */
+ if (!recip)
+ {
+ rtx zero, mask;
+
+ zero = gen_reg_rtx (mode);
+ mask = gen_reg_rtx (mode);
+
+ zero = force_reg (mode, CONST0_RTX(mode));
+ emit_insn (gen_rtx_SET (VOIDmode, mask,
+ gen_rtx_NE (mode, zero, a)));
+
+ emit_insn (gen_rtx_SET (VOIDmode, x0,
+ gen_rtx_AND (mode, x0, mask)));
+ }
/* e0 = x0 * a */
emit_insn (gen_rtx_SET (VOIDmode, e0,
/* e1 = e0 * x0 */
emit_insn (gen_rtx_SET (VOIDmode, e1,
gen_rtx_MULT (mode, e0, x0)));
- /* e2 = 3. - e1 */
+
+ /* e2 = e1 - 3. */
+ mthree = force_reg (mode, mthree);
emit_insn (gen_rtx_SET (VOIDmode, e2,
- gen_rtx_MINUS (mode, three, e1)));
+ gen_rtx_PLUS (mode, e1, mthree)));
+
+ mhalf = force_reg (mode, mhalf);
if (recip)
- /* e3 = .5 * x0 */
+ /* e3 = -.5 * x0 */
emit_insn (gen_rtx_SET (VOIDmode, e3,
- gen_rtx_MULT (mode, half, x0)));
+ gen_rtx_MULT (mode, x0, mhalf)));
else
- /* e3 = .5 * e0 */
+ /* e3 = -.5 * e0 */
emit_insn (gen_rtx_SET (VOIDmode, e3,
- gen_rtx_MULT (mode, half, e0)));
+ gen_rtx_MULT (mode, e0, mhalf)));
/* ret = e2 * e3 */
emit_insn (gen_rtx_SET (VOIDmode, res,
gen_rtx_MULT (mode, e2, e3)));
/* Validate whether a SSE5 instruction is valid or not.
OPERANDS is the array of operands.
NUM is the number of operands.
- USES_OC0 is true if the instruction uses OC0 and provides 4 varients.
+ USES_OC0 is true if the instruction uses OC0 and provides 4 variants.
NUM_MEMORY is the maximum number of memory operands to accept. */
-bool ix86_sse5_valid_op_p (rtx operands[], rtx insn, int num, bool uses_oc0, int num_memory)
+bool
+ix86_sse5_valid_op_p (rtx operands[], rtx insn, int num, bool uses_oc0, int num_memory)
{
int mem_mask;
int mem_count;
else if (num == 4 && num_memory == 2)
{
/* If there are two memory operations, we can load one of the memory ops
- into the destination register. This is for optimizating the
+ into the destination register. This is for optimizing the
multiply/add ops, which the combiner has optimized both the multiply
and the add insns to have a memory operation. We have to be careful
that the destination doesn't overlap with the inputs. */
#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
+#undef TARGET_EXPAND_BUILTIN_VA_START
+#define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start
+
#undef TARGET_MD_ASM_CLOBBERS
#define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers