#include "tm-constrs.h"
#include "params.h"
#include "cselib.h"
+#include "debug.h"
+#include "dwarf2out.h"
static rtx legitimize_dllimport_symbol (rtx, bool);
1, /* cond_not_taken_branch_cost. */
};
+struct processor_costs bdver1_cost = {
+ COSTS_N_INSNS (1), /* cost of an add instruction */
+ COSTS_N_INSNS (2), /* cost of a lea instruction */
+ COSTS_N_INSNS (1), /* variable shift costs */
+ COSTS_N_INSNS (1), /* constant shift costs */
+ {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
+ COSTS_N_INSNS (4), /* HI */
+ COSTS_N_INSNS (3), /* SI */
+ COSTS_N_INSNS (4), /* DI */
+ COSTS_N_INSNS (5)}, /* other */
+ 0, /* cost of multiply per each bit set */
+ {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
+ COSTS_N_INSNS (35), /* HI */
+ COSTS_N_INSNS (51), /* SI */
+ COSTS_N_INSNS (83), /* DI */
+ COSTS_N_INSNS (83)}, /* other */
+ COSTS_N_INSNS (1), /* cost of movsx */
+ COSTS_N_INSNS (1), /* cost of movzx */
+ 8, /* "large" insn */
+ 9, /* MOVE_RATIO */
+ 4, /* cost for loading QImode using movzbl */
+ {3, 4, 3}, /* cost of loading integer registers
+ in QImode, HImode and SImode.
+ Relative to reg-reg move (2). */
+ {3, 4, 3}, /* cost of storing integer registers */
+ 4, /* cost of reg,reg fld/fst */
+ {4, 4, 12}, /* cost of loading fp registers
+ in SFmode, DFmode and XFmode */
+ {6, 6, 8}, /* cost of storing fp registers
+ in SFmode, DFmode and XFmode */
+ 2, /* cost of moving MMX register */
+ {3, 3}, /* cost of loading MMX registers
+ in SImode and DImode */
+ {4, 4}, /* cost of storing MMX registers
+ in SImode and DImode */
+ 2, /* cost of moving SSE register */
+ {4, 4, 3}, /* cost of loading SSE registers
+ in SImode, DImode and TImode */
+ {4, 4, 5}, /* cost of storing SSE registers
+ in SImode, DImode and TImode */
+ 3, /* MMX or SSE register to integer */
+ /* On K8
+ MOVD reg64, xmmreg Double FSTORE 4
+ MOVD reg32, xmmreg Double FSTORE 4
+ On AMDFAM10
+ MOVD reg64, xmmreg Double FADD 3
+ 1/1 1/1
+ MOVD reg32, xmmreg Double FADD 3
+ 1/1 1/1 */
+ 64, /* size of l1 cache. */
+ 1024, /* size of l2 cache. */
+ 64, /* size of prefetch block */
+  /* New AMD processors never drop prefetches; if they cannot be performed
+     immediately, they are queued.  We set the number of simultaneous prefetches
+     to a large constant to reflect this (leaving the number of prefetches
+     entirely unlimited is probably not a good idea either, as their execution
+     also takes some time).  */
+ 100, /* number of parallel prefetches */
+ 2, /* Branch cost */
+ COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
+ COSTS_N_INSNS (4), /* cost of FMUL instruction. */
+ COSTS_N_INSNS (19), /* cost of FDIV instruction. */
+ COSTS_N_INSNS (2), /* cost of FABS instruction. */
+ COSTS_N_INSNS (2), /* cost of FCHS instruction. */
+ COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
+
+  /* BDVER1 has optimized REP instruction for medium sized blocks, but for
+     very small blocks it is better to use a loop.  For large blocks, a libcall
+     can do nontemporal accesses and beat inline code considerably.  */
+ {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
+ {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ {{libcall, {{8, loop}, {24, unrolled_loop},
+ {2048, rep_prefix_4_byte}, {-1, libcall}}},
+ {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ 4, /* scalar_stmt_cost. */
+ 2, /* scalar load_cost. */
+ 2, /* scalar_store_cost. */
+ 6, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 2, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 2, /* vec_store_cost. */
+ 2, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
+};
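A decoding note for the two stringop tables just above: in each table the leading algorithm handles dynamically sized blocks, each {max, alg} pair applies the algorithm up to max bytes with -1 meaning no upper bound, and the first initializer of each pair is consulted in 32-bit mode, the second in 64-bit mode (memcpy first, then memset). A reading of the memcpy entry, not part of the patch:

    /* Sketch: in 32-bit mode, use a loop up to 6 bytes, an unrolled loop
       up to 14, then rep movsl for everything else; in 64-bit mode, use a
       loop up to 16 bytes, rep movsq up to 8192, then fall back to a
       libcall.  */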
+
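For reference, the scalar entries in the table are in COSTS_N_INSNS units and are consumed by the backend's rtx-cost machinery. A minimal sketch of that consumer, not part of the patch, simplified from ix86_rtx_costs and assuming the backend's ix86_cost pointer and MODE_INDEX macro:

    /* Sketch: estimate a multiply's cost from the -mtune-selected table
       (e.g. &bdver1_cost).  NBITS is the number of set bits in a constant
       multiplier, charged at mult_bit each on top of mult_init.  */
    static int
    mult_cost_estimate (enum machine_mode mode, int nbits)
    {
      return ix86_cost->mult_init[MODE_INDEX (mode)]
	     + nbits * ix86_cost->mult_bit;
    }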
static const
struct processor_costs pentium4_cost = {
COSTS_N_INSNS (1), /* cost of an add instruction */
#define m_ATHLON (1<<PROCESSOR_ATHLON)
#define m_ATHLON_K8 (m_K8 | m_ATHLON)
#define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
-#define m_AMD_MULTIPLE (m_K8 | m_ATHLON | m_AMDFAM10)
+#define m_BDVER1 (1<<PROCESSOR_BDVER1)
+#define m_AMD_MULTIPLE (m_K8 | m_ATHLON | m_AMDFAM10 | m_BDVER1)
#define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
#define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
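For context on how these m_* masks take effect: every X86_TUNE_* entry below is a bitmask over processor ids, and option handling narrows it to one boolean per feature for the selected -mtune CPU, so adding m_BDVER1 to an entry enables that tuning under -mtune=bdver1. A sketch, not part of the patch, simplified from override_options:

    /* Sketch: reduce the per-processor masks to per-feature booleans.  */
    static void
    apply_tune_masks (void)
    {
      unsigned int i;
      unsigned int tune_mask = 1u << ix86_tune; /* e.g. 1 << PROCESSOR_BDVER1 */

      for (i = 0; i < X86_TUNE_LAST; ++i)
	ix86_tune_features[i]
	  = !!(initial_ix86_tune_features[i] & tune_mask);
    }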
~m_386,
/* X86_TUNE_USE_SAHF */
- m_ATOM | m_PPRO | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_PENT4
+ m_ATOM | m_PPRO | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER1 | m_PENT4
| m_NOCONA | m_CORE2 | m_GENERIC,
/* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
while enabling it on K8 brings roughly 2.4% regression that can be partly
masked by careful scheduling of moves. */
m_ATOM | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC
- | m_AMDFAM10,
+ | m_AMDFAM10 | m_BDVER1,
- /* X86_TUNE_SSE_UNALIGNED_MOVE_OPTIMAL */
- m_AMDFAM10,
+ /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
+ m_AMDFAM10 | m_BDVER1,
+
+ /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
+ m_BDVER1,
+
+ /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
+ m_BDVER1,
/* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
are resolved on SSE register parts instead of whole registers, so we may
m_AMD_MULTIPLE,
/* X86_TUNE_INTER_UNIT_MOVES */
- ~(m_AMD_MULTIPLE | m_ATOM | m_GENERIC),
+ ~(m_AMD_MULTIPLE | m_GENERIC),
/* X86_TUNE_INTER_UNIT_CONVERSIONS */
- ~(m_AMDFAM10),
+ ~(m_AMDFAM10 | m_BDVER1),
/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
than 4 branch instructions in the 16 byte window. */
/* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
vector path on AMD machines. */
- m_K8 | m_GENERIC64 | m_AMDFAM10,
+ m_K8 | m_GENERIC64 | m_AMDFAM10 | m_BDVER1,
/* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
machines. */
- m_K8 | m_GENERIC64 | m_AMDFAM10,
+ m_K8 | m_GENERIC64 | m_AMDFAM10 | m_BDVER1,
/* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
than a MOV. */
/* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
with a subsequent conditional jump instruction into a single
compare-and-branch uop. */
- m_CORE2,
+ m_CORE2 | m_BDVER1,
/* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
will impact LEA instruction selection. */
\f
static struct machine_function * ix86_init_machine_status (void);
static rtx ix86_function_value (const_tree, const_tree, bool);
+static bool ix86_function_value_regno_p (const unsigned int);
static rtx ix86_static_chain (const_tree, bool);
static int ix86_function_regparm (const_tree, const_tree);
static void ix86_compute_frame_layout (struct ix86_frame *);
static enum calling_abi ix86_function_abi (const_tree);
\f
+#ifndef SUBTARGET32_DEFAULT_CPU
+#define SUBTARGET32_DEFAULT_CPU "i386"
+#endif
+
/* The svr4 ABI for the i386 says that records and unions are returned
in memory. */
#ifndef DEFAULT_PCC_STRUCT_RETURN
{&generic32_cost, 16, 7, 16, 7, 16},
{&generic64_cost, 16, 10, 16, 10, 16},
{&amdfam10_cost, 32, 24, 32, 7, 32},
+ {&bdver1_cost, 32, 24, 32, 7, 32},
{&atom_cost, 16, 7, 16, 7, 16}
};
"athlon",
"athlon-4",
"k8",
- "amdfam10"
+ "amdfam10",
+ "bdver1"
};
\f
/* Implement TARGET_HANDLE_OPTION. */
}
}
\f
-/* Return a string the documents the current -m options. The caller is
+/* Return a string that documents the current -m options. The caller is
responsible for freeing the string. */
static char *
{
{ "-m64", OPTION_MASK_ISA_64BIT },
{ "-mfma4", OPTION_MASK_ISA_FMA4 },
+ { "-mfma", OPTION_MASK_ISA_FMA },
{ "-mxop", OPTION_MASK_ISA_XOP },
{ "-mlwp", OPTION_MASK_ISA_LWP },
{ "-msse4a", OPTION_MASK_ISA_SSE4A },
if (isa && add_nl_p)
{
opts[num++][0] = isa_other;
- sprintf (isa_other, "(other isa: 0x%x)", isa);
+ sprintf (isa_other, "(other isa: %#x)", isa);
}
/* Add flag options. */
if (flags && add_nl_p)
{
opts[num++][0] = target_other;
- sprintf (target_other, "(other flags: 0x%x)", isa);
+ sprintf (target_other, "(other flags: %#x)", flags);
}
/* Add -fpmath= option. */
{
int i;
unsigned int ix86_arch_mask, ix86_tune_mask;
+ const bool ix86_tune_specified = (ix86_tune_string != NULL);
const char *prefix;
const char *suffix;
const char *sw;
{"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
| PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
+ {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
+ PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
+ | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM
+ | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES
+ | PTA_PCLMUL | PTA_AVX | PTA_FMA4 | PTA_XOP | PTA_LWP},
{"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
0 /* flags are only used for -march switch. */ },
{"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
|| !strcmp (ix86_tune_string, "generic64")))
;
else if (!strncmp (ix86_tune_string, "generic", 7))
- error ("bad value (%s) for %stune=%s %s",
+ error ("bad value (%s) for %stune=%s %s",
ix86_tune_string, prefix, suffix, sw);
+ else if (!strcmp (ix86_tune_string, "x86-64"))
+ warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated. Use "
+ "%stune=k8%s or %stune=generic%s instead as appropriate.",
+ prefix, suffix, prefix, suffix, prefix, suffix);
}
else
{
ix86_tune_string = "generic32";
}
}
+
if (ix86_stringop_string)
{
if (!strcmp (ix86_stringop_string, "rep_byte"))
error ("bad value (%s) for %sstringop-strategy=%s %s",
ix86_stringop_string, prefix, suffix, sw);
}
- if (!strcmp (ix86_tune_string, "x86-64"))
- warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated. Use "
- "%stune=k8%s or %stune=generic%s instead as appropriate.",
- prefix, suffix, prefix, suffix, prefix, suffix);
if (!ix86_arch_string)
- ix86_arch_string = TARGET_64BIT ? "x86-64" : "i386";
+ ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
else
ix86_arch_specified = 1;
- if (!strcmp (ix86_arch_string, "generic"))
- error ("generic CPU can be used only for %stune=%s %s",
- prefix, suffix, sw);
- if (!strncmp (ix86_arch_string, "generic", 7))
- error ("bad value (%s) for %sarch=%s %s",
- ix86_arch_string, prefix, suffix, sw);
-
/* Validate -mabi= value. */
if (ix86_abi_string)
{
break;
}
- if (i == pta_size)
+ if (!strcmp (ix86_arch_string, "generic"))
+ error ("generic CPU can be used only for %stune=%s %s",
+ prefix, suffix, sw);
+ else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
error ("bad value (%s) for %sarch=%s %s",
ix86_arch_string, prefix, suffix, sw);
x86_prefetch_sse = true;
break;
}
- if (i == pta_size)
+
+ if (ix86_tune_specified && i == pta_size)
error ("bad value (%s) for %stune=%s %s",
ix86_tune_string, prefix, suffix, sw);
ix86_tls_dialect = TLS_DIALECT_GNU;
else if (strcmp (ix86_tls_dialect_string, "gnu2") == 0)
ix86_tls_dialect = TLS_DIALECT_GNU2;
- else if (strcmp (ix86_tls_dialect_string, "sun") == 0)
- ix86_tls_dialect = TLS_DIALECT_SUN;
else
error ("bad value (%s) for %stls-dialect=%s %s",
ix86_tls_dialect_string, prefix, suffix, sw);
flag_schedule_insns = 0;
#endif
+  /* For -O2 and beyond, turn on -fzee for the x86_64 target.  */
+ if (level > 1 && TARGET_64BIT)
+ flag_zee = 1;
+
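ZEE pays off on x86_64 because 32-bit operations already zero the upper half of their destination register, leaving many explicit zero-extensions redundant. An illustration of the pattern it targets, not part of the patch:

    /* On x86_64 the 32-bit load (movl) implicitly clears bits 32-63 of the
       destination register, so widening the result to 64 bits needs no
       separate zero-extension instruction once ZEE has run.  */
    unsigned long
    widen (unsigned int *p)
    {
      return *p;
    }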
if (TARGET_MACHO)
/* The Darwin libraries never set errno, so we might as well
avoid calling them when that's the only reason we would. */
return true;
}
-/* Handle "cdecl", "stdcall", "fastcall", "regparm" and "sseregparm"
- calling convention attributes;
+/* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
+ and "sseregparm" calling convention attributes;
arguments as in struct attribute_spec.handler. */
static tree
error ("fastcall and regparm attributes are not compatible");
}
+ if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
+ {
+ error ("regparam and thiscall attributes are not compatible");
+ }
+
cst = TREE_VALUE (args);
if (TREE_CODE (cst) != INTEGER_CST)
{
if (TARGET_64BIT)
{
/* Do not warn when emulating the MS ABI. */
- if (TREE_CODE (*node) != FUNCTION_TYPE
+ if ((TREE_CODE (*node) != FUNCTION_TYPE
+ && TREE_CODE (*node) != METHOD_TYPE)
|| ix86_function_type_abi (*node) != MS_ABI)
warning (OPT_Wattributes, "%qE attribute ignored",
name);
{
error ("fastcall and regparm attributes are not compatible");
}
+ if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
+ {
+ error ("fastcall and thiscall attributes are not compatible");
+ }
}
/* Can combine stdcall with fastcall (redundant), regparm and
{
error ("stdcall and fastcall attributes are not compatible");
}
+ if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
+ {
+ error ("stdcall and thiscall attributes are not compatible");
+ }
}
/* Can combine cdecl with regparm and sseregparm. */
{
error ("fastcall and cdecl attributes are not compatible");
}
+ if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
+ {
+ error ("cdecl and thiscall attributes are not compatible");
+ }
+ }
+ else if (is_attribute_p ("thiscall", name))
+ {
+ if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
+	warning (OPT_Wattributes, "%qE attribute is used for non-class method",
+ name);
+ if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
+ {
+ error ("stdcall and thiscall attributes are not compatible");
+ }
+ if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
+ {
+ error ("fastcall and thiscall attributes are not compatible");
+ }
+ if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
+ {
+ error ("cdecl and thiscall attributes are not compatible");
+ }
}
/* Can combine sseregparm with all attributes. */
!= !lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type2)))
return 0;
+ /* Check for mismatched thiscall types. */
+ if (!lookup_attribute ("thiscall", TYPE_ATTRIBUTES (type1))
+ != !lookup_attribute ("thiscall", TYPE_ATTRIBUTES (type2)))
+ return 0;
+
/* Check for mismatched return types (cdecl vs stdcall). */
if (!lookup_attribute (rtdstr, TYPE_ATTRIBUTES (type1))
!= !lookup_attribute (rtdstr, TYPE_ATTRIBUTES (type2)))
if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type)))
return 2;
+ if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (type)))
+ return 1;
+
/* Use register calling convention for local functions when possible. */
if (decl
&& TREE_CODE (decl) == FUNCTION_DECL
/* Stdcall and fastcall functions will pop the stack if not
variable args. */
if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (funtype))
- || lookup_attribute ("fastcall", TYPE_ATTRIBUTES (funtype)))
+ || lookup_attribute ("fastcall", TYPE_ATTRIBUTES (funtype))
+ || lookup_attribute ("thiscall", TYPE_ATTRIBUTES (funtype)))
rtd = 1;
if (rtd && ! stdarg_p (funtype))
else look for regparm information. */
if (fntype)
{
- if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)))
+ if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)))
+ {
+ cum->nregs = 1;
+ cum->fastcall = 1; /* Same first register as in fastcall. */
+ }
+ else if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)))
{
cum->nregs = 2;
cum->fastcall = 1;
/* Return true if N is a possible register number of function value. */
-bool
-ix86_function_value_regno_p (int regno)
+static bool
+ix86_function_value_regno_p (const unsigned int regno)
{
switch (regno)
{
{
rtx save_area, mem;
rtx label;
- rtx label_ref;
rtx tmp_reg;
rtx nsse_reg;
alias_set_type set;
SSE saves. We need some preparation work to get this working. */
label = gen_label_rtx ();
- label_ref = gen_rtx_LABEL_REF (Pmode, label);
- /* Compute address to jump to :
- label - eax*4 + nnamed_sse_arguments*4 Or
- label - eax*5 + nnamed_sse_arguments*5 for AVX. */
- tmp_reg = gen_reg_rtx (Pmode);
nsse_reg = gen_reg_rtx (Pmode);
emit_insn (gen_zero_extendqidi2 (nsse_reg, gen_rtx_REG (QImode, AX_REG)));
- emit_insn (gen_rtx_SET (VOIDmode, tmp_reg,
- gen_rtx_MULT (Pmode, nsse_reg,
- GEN_INT (4))));
-
- /* vmovaps is one byte longer than movaps. */
- if (TARGET_AVX)
- emit_insn (gen_rtx_SET (VOIDmode, tmp_reg,
- gen_rtx_PLUS (Pmode, tmp_reg,
- nsse_reg)));
-
- if (cum->sse_regno)
- emit_move_insn
- (nsse_reg,
- gen_rtx_CONST (DImode,
- gen_rtx_PLUS (DImode,
- label_ref,
- GEN_INT (cum->sse_regno
- * (TARGET_AVX ? 5 : 4)))));
- else
- emit_move_insn (nsse_reg, label_ref);
- emit_insn (gen_subdi3 (nsse_reg, nsse_reg, tmp_reg));
/* Compute address of memory block we save into. We always use pointer
pointing 127 bytes after first byte to store - this is needed to keep
mem = gen_rtx_MEM (BLKmode, plus_constant (tmp_reg, -127));
MEM_NOTRAP_P (mem) = 1;
set_mem_alias_set (mem, set);
- set_mem_align (mem, BITS_PER_WORD);
+ set_mem_align (mem, 64);
/* And finally do the dirty job! */
emit_insn (gen_sse_prologue_save (mem, nsse_reg,
- GEN_INT (cum->sse_regno), label));
+ GEN_INT (cum->sse_regno), label,
+ gen_reg_rtx (Pmode)));
}
}
int indirect_p = 0;
tree ptrtype;
enum machine_mode nat_mode;
- int arg_boundary;
+ unsigned int arg_boundary;
/* Only 64bit target needs something special. */
if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
size_int (-align));
t = fold_convert (TREE_TYPE (ovf), t);
+ if (crtl->stack_alignment_needed < arg_boundary)
+ crtl->stack_alignment_needed = arg_boundary;
}
gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
gimplify_assign (addr, t, pre_p);
case MODE_V4SF:
return TARGET_AVX ? "vxorps\t%0, %0, %0" : "xorps\t%0, %0";
case MODE_V2DF:
- return TARGET_AVX ? "vxorpd\t%0, %0, %0" : "xorpd\t%0, %0";
+ if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
+ return TARGET_AVX ? "vxorps\t%0, %0, %0" : "xorps\t%0, %0";
+ else
+ return TARGET_AVX ? "vxorpd\t%0, %0, %0" : "xorpd\t%0, %0";
case MODE_TI:
- return TARGET_AVX ? "vpxor\t%0, %0, %0" : "pxor\t%0, %0";
+ if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
+ return TARGET_AVX ? "vxorps\t%0, %0, %0" : "xorps\t%0, %0";
+ else
+ return TARGET_AVX ? "vpxor\t%0, %0, %0" : "pxor\t%0, %0";
case MODE_V8SF:
return "vxorps\t%x0, %x0, %x0";
case MODE_V4DF:
- return "vxorpd\t%x0, %x0, %x0";
+ if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
+ return "vxorps\t%x0, %x0, %x0";
+ else
+ return "vxorpd\t%x0, %x0, %x0";
case MODE_OI:
- return "vpxor\t%x0, %x0, %x0";
+ if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
+ return "vxorps\t%x0, %x0, %x0";
+ else
+ return "vpxor\t%x0, %x0, %x0";
default:
break;
}
/* This function generates code for -fpic that loads %ebx with
the return address of the caller and then returns. */
-void
-ix86_file_end (void)
+static void
+ix86_code_end (void)
{
rtx xops[2];
int regno;
for (regno = 0; regno < 8; ++regno)
{
char name[32];
+ tree decl;
if (! ((pic_labels_used >> regno) & 1))
continue;
get_pc_thunk_name (name, regno);
+ decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
+ get_identifier (name),
+ build_function_type (void_type_node, void_list_node));
+ DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
+ NULL_TREE, void_type_node);
+ TREE_PUBLIC (decl) = 1;
+ TREE_STATIC (decl) = 1;
+
#if TARGET_MACHO
if (TARGET_MACHO)
{
assemble_name (asm_out_file, name);
fputs ("\n", asm_out_file);
ASM_OUTPUT_LABEL (asm_out_file, name);
+ DECL_WEAK (decl) = 1;
}
else
#endif
if (USE_HIDDEN_LINKONCE)
{
- tree decl;
-
- decl = build_decl (BUILTINS_LOCATION,
- FUNCTION_DECL, get_identifier (name),
- error_mark_node);
- TREE_PUBLIC (decl) = 1;
- TREE_STATIC (decl) = 1;
DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
(*targetm.asm_out.unique_section) (decl, 0);
ASM_OUTPUT_LABEL (asm_out_file, name);
}
+ DECL_INITIAL (decl) = make_node (BLOCK);
+ current_function_decl = decl;
+ init_function_start (decl);
+ first_function_block_is_cold = false;
+ /* Make sure unwind info is emitted for the thunk if needed. */
+ final_start_function (emit_barrier (), asm_out_file, 1);
+
xops[0] = gen_rtx_REG (Pmode, regno);
xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
output_asm_insn ("ret", xops);
+ final_end_function ();
+ init_insn_lengths ();
+ free_after_compilation (cfun);
+ set_cfun (NULL);
+ current_function_decl = NULL;
}
-
- if (NEED_INDICATE_EXEC_STACK)
- file_end_indicate_exec_stack ();
}
/* Emit code for the SET_GOT patterns. */
if (!flag_pic)
output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
else
- output_asm_insn ("call\t%a2", xops);
+ {
+ output_asm_insn ("call\t%a2", xops);
+#ifdef DWARF2_UNWIND_INFO
+	  /* The call to the next label acts as a push.  */
+ if (dwarf2out_do_frame ())
+ {
+ rtx insn;
+ start_sequence ();
+ insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+ gen_rtx_PLUS (Pmode,
+ stack_pointer_rtx,
+ GEN_INT (-4))));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ dwarf2out_frame_debug (insn, true);
+ end_sequence ();
+ }
+#endif
+ }
#if TARGET_MACHO
/* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
if (flag_pic)
- output_asm_insn ("pop%z0\t%0", xops);
+ {
+ output_asm_insn ("pop%z0\t%0", xops);
+#ifdef DWARF2_UNWIND_INFO
+	  /* The pop clobbers dest and adjusts the stack, but dest is not
+	     restored as far as the unwind info is concerned.  */
+ if (dwarf2out_do_frame ())
+ {
+ rtx insn;
+ start_sequence ();
+ insn = emit_insn (gen_rtx_SET (VOIDmode, dest, const0_rtx));
+ dwarf2out_frame_debug (insn, true);
+ insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+ gen_rtx_PLUS (Pmode,
+ stack_pointer_rtx,
+ GEN_INT (4))));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ dwarf2out_frame_debug (insn, true);
+ end_sequence ();
+ }
+#endif
+ }
}
else
{
get_pc_thunk_name (name, REGNO (dest));
pic_labels_used |= 1 << REGNO (dest);
+#ifdef DWARF2_UNWIND_INFO
+ /* Ensure all queued register saves are flushed before the
+ call. */
+ if (dwarf2out_do_frame ())
+ {
+ rtx insn;
+ start_sequence ();
+ insn = emit_barrier ();
+ end_sequence ();
+ dwarf2out_frame_debug (insn, false);
+ }
+#endif
xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
xops[2] = gen_rtx_MEM (QImode, xops[2]);
output_asm_insn ("call\t%X2", xops);
&& cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
{
int count = frame->nregs;
+ struct cgraph_node *node = cgraph_node (current_function_decl);
cfun->machine->use_fast_prologue_epilogue_nregs = count;
/* The fast prologue uses move instead of push to save registers. This
slow to use many of them. */
if (count)
count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
- if (cfun->function_frequency < FUNCTION_FREQUENCY_NORMAL
+ if (node->frequency < NODE_FREQUENCY_NORMAL
|| (flag_branch_probabilities
- && cfun->function_frequency < FUNCTION_FREQUENCY_HOT))
+ && node->frequency < NODE_FREQUENCY_HOT))
cfun->machine->use_fast_prologue_epilogue = false;
else
cfun->machine->use_fast_prologue_epilogue
passing. */
if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2
&& !lookup_attribute ("fastcall",
+ TYPE_ATTRIBUTES (TREE_TYPE (decl)))
+ && !lookup_attribute ("thiscall",
TYPE_ATTRIBUTES (TREE_TYPE (decl))))
return CX_REG;
else
end_sequence ();
insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
- RTX_FRAME_RELATED_P (insn) = 1;
+ if (!optimize)
+ {
+ add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
return drap_vreg;
}
else
ix86_cfa_state->reg == stack_pointer_rtx);
else
{
- /* Only valid for Win32. */
rtx eax = gen_rtx_REG (Pmode, AX_REG);
bool eax_live;
rtx t;
- gcc_assert (!TARGET_64BIT || cfun->machine->call_abi == MS_ABI);
-
if (cfun->machine->call_abi == MS_ABI)
eax_live = false;
else
rtx base_reg, index_reg;
HOST_WIDE_INT scale = 1;
rtx scale_rtx = NULL_RTX;
+ rtx tmp;
int retval = 1;
enum ix86_address_seg seg = SEG_DEFAULT;
scale_rtx = XEXP (op, 1);
break;
+ case ASHIFT:
+ if (index)
+ return 0;
+ index = XEXP (op, 0);
+ tmp = XEXP (op, 1);
+ if (!CONST_INT_P (tmp))
+ return 0;
+ scale = INTVAL (tmp);
+ if ((unsigned HOST_WIDE_INT) scale > 3)
+ return 0;
+ scale = 1 << scale;
+ break;
+
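The new ASHIFT case lets the address decomposition accept an index scale written as a shift rather than a multiply. A sketch of the two equivalent RTL shapes, not part of the patch:

    /* Both of these now decompose to base + index*4; the shift count is
       range-checked (0..3) and converted via scale = 1 << count:
	 (plus (mult   (reg) (const_int 4)) (reg))
	 (plus (ashift (reg) (const_int 2)) (reg))  */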
case UNSPEC:
if (XINT (op, 1) == UNSPEC_TP
&& TARGET_TLS_DIRECT_SEG_REFS
}
else if (GET_CODE (addr) == ASHIFT)
{
- rtx tmp;
-
/* We're called for lea too, which implements ashift on occasion. */
index = XEXP (addr, 0);
tmp = XEXP (addr, 1);
break;
case UNSPEC_GOTTPOFF:
/* FIXME: This might be @TPOFF in Sun ld too. */
- fputs ("@GOTTPOFF", file);
+ fputs ("@gottpoff", file);
break;
case UNSPEC_TPOFF:
- fputs ("@TPOFF", file);
+ fputs ("@tpoff", file);
break;
case UNSPEC_NTPOFF:
if (TARGET_64BIT)
- fputs ("@TPOFF", file);
+ fputs ("@tpoff", file);
else
- fputs ("@NTPOFF", file);
+ fputs ("@ntpoff", file);
break;
case UNSPEC_DTPOFF:
- fputs ("@DTPOFF", file);
+ fputs ("@dtpoff", file);
break;
case UNSPEC_GOTNTPOFF:
if (TARGET_64BIT)
fputs (ASSEMBLER_DIALECT == ASM_ATT ?
- "@GOTTPOFF(%rip)": "@GOTTPOFF[rip]", file);
+ "@gottpoff(%rip)": "@gottpoff[rip]", file);
else
- fputs ("@GOTNTPOFF", file);
+ fputs ("@gotntpoff", file);
break;
case UNSPEC_INDNTPOFF:
- fputs ("@INDNTPOFF", file);
+ fputs ("@indntpoff", file);
break;
#if TARGET_MACHO
case UNSPEC_MACHOPIC_OFFSET:
{
fputs (ASM_LONG, file);
output_addr_const (file, x);
- fputs ("@DTPOFF", file);
+ fputs ("@dtpoff", file);
switch (size)
{
case 4:
ix86_delegitimize_address (rtx x)
{
rtx orig_x = delegitimize_mem_from_attrs (x);
+ /* addend is NULL or some rtx if x is something+GOTOFF where
+ something doesn't include the PIC register. */
+ rtx addend = NULL_RTX;
/* reg_addend is NULL or a multiple of some register. */
rtx reg_addend = NULL_RTX;
/* const_addend is NULL or a const_int. */
|| XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
|| !MEM_P (orig_x))
return orig_x;
- return XVECEXP (XEXP (x, 0), 0, 0);
+ x = XVECEXP (XEXP (x, 0), 0, 0);
+ if (GET_MODE (orig_x) != Pmode)
+ return simplify_gen_subreg (GET_MODE (orig_x), x, Pmode, 0);
+ return x;
}
if (GET_CODE (x) != PLUS
else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
reg_addend = XEXP (reg_addend, 0);
else
- return orig_x;
- if (!REG_P (reg_addend)
- && GET_CODE (reg_addend) != MULT
- && GET_CODE (reg_addend) != ASHIFT)
- return orig_x;
+ {
+ reg_addend = NULL_RTX;
+ addend = XEXP (x, 0);
+ }
}
else
- return orig_x;
+ addend = XEXP (x, 0);
x = XEXP (XEXP (x, 1), 0);
if (GET_CODE (x) == PLUS
}
if (GET_CODE (x) == UNSPEC
- && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x))
+ && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
|| (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
result = XVECEXP (x, 0, 0);
result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
if (reg_addend)
result = gen_rtx_PLUS (Pmode, reg_addend, result);
+ if (addend)
+ {
+ /* If the rest of original X doesn't involve the PIC register, add
+ addend and subtract pic_offset_table_rtx. This can happen e.g.
+ for code like:
+ leal (%ebx, %ecx, 4), %ecx
+ ...
+ movl foo@GOTOFF(%ecx), %edx
+ in which case we return (%ecx - %ebx) + foo. */
+ if (pic_offset_table_rtx)
+ result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
+ pic_offset_table_rtx),
+ result);
+ else
+ return orig_x;
+ }
+ if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
+ return simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
return result;
}
return cfun->machine->some_ld_name;
for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
- if (INSN_P (insn)
+ if (NONDEBUG_INSN_P (insn)
&& for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
return cfun->machine->some_ld_name;
L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
C -- print opcode suffix for set/cmov insn.
c -- like C, but print reversed condition
- E,e -- likewise, but for compare-and-branch fused insn.
F,f -- likewise, but for floating-point.
O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
otherwise nothing
put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
return;
- case 'E':
- put_condition_code (GET_CODE (x), CCmode, 0, 0, file);
- return;
-
- case 'e':
- put_condition_code (GET_CODE (x), CCmode, 1, 0, file);
- return;
-
case 'H':
/* It doesn't actually matter what mode we use here, as we're
only going to use this for printing. */
return;
case ';':
-#if TARGET_MACHO
- fputs (" ; ", file);
-#else
- putc (' ', file);
+#if TARGET_MACHO || !HAVE_AS_IX86_REP_LOCK_PREFIX
+ fputs (";", file);
#endif
return;
case UNSPEC_GOTTPOFF:
output_addr_const (file, op);
/* FIXME: This might be @TPOFF in Sun ld. */
- fputs ("@GOTTPOFF", file);
+ fputs ("@gottpoff", file);
break;
case UNSPEC_TPOFF:
output_addr_const (file, op);
- fputs ("@TPOFF", file);
+ fputs ("@tpoff", file);
break;
case UNSPEC_NTPOFF:
output_addr_const (file, op);
if (TARGET_64BIT)
- fputs ("@TPOFF", file);
+ fputs ("@tpoff", file);
else
- fputs ("@NTPOFF", file);
+ fputs ("@ntpoff", file);
break;
case UNSPEC_DTPOFF:
output_addr_const (file, op);
- fputs ("@DTPOFF", file);
+ fputs ("@dtpoff", file);
break;
case UNSPEC_GOTNTPOFF:
output_addr_const (file, op);
if (TARGET_64BIT)
fputs (ASSEMBLER_DIALECT == ASM_ATT ?
- "@GOTTPOFF(%rip)" : "@GOTTPOFF[rip]", file);
+ "@gottpoff(%rip)" : "@gottpoff[rip]", file);
else
- fputs ("@GOTNTPOFF", file);
+ fputs ("@gotntpoff", file);
break;
case UNSPEC_INDNTPOFF:
output_addr_const (file, op);
- fputs ("@INDNTPOFF", file);
+ fputs ("@indntpoff", file);
break;
#if TARGET_MACHO
case UNSPEC_MACHOPIC_OFFSET:
switch (GET_MODE_SIZE (mode))
{
case 16:
+      /* Prefer movups when packed single insns are optimal on this target.  */
+ if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
+ {
+ op0 = gen_lowpart (V4SFmode, op0);
+ op1 = gen_lowpart (V4SFmode, op1);
+ emit_insn (gen_avx_movups (op0, op1));
+ return;
+ }
op0 = gen_lowpart (V16QImode, op0);
op1 = gen_lowpart (V16QImode, op1);
emit_insn (gen_avx_movdqu (op0, op1));
emit_insn (gen_avx_movups256 (op0, op1));
break;
case V2DFmode:
+ if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
+ {
+ op0 = gen_lowpart (V4SFmode, op0);
+ op1 = gen_lowpart (V4SFmode, op1);
+ emit_insn (gen_avx_movups (op0, op1));
+ return;
+ }
emit_insn (gen_avx_movupd (op0, op1));
break;
case V4DFmode:
if (MEM_P (op1))
{
/* If we're optimizing for size, movups is the smallest. */
- if (optimize_insn_for_size_p ())
+ if (optimize_insn_for_size_p ()
+ || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
{
op0 = gen_lowpart (V4SFmode, op0);
op1 = gen_lowpart (V4SFmode, op1);
{
rtx zero;
- if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL)
- {
- op0 = gen_lowpart (V2DFmode, op0);
- op1 = gen_lowpart (V2DFmode, op1);
- emit_insn (gen_sse2_movupd (op0, op1));
- return;
- }
+ if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
+ {
+ op0 = gen_lowpart (V2DFmode, op0);
+ op1 = gen_lowpart (V2DFmode, op1);
+ emit_insn (gen_sse2_movupd (op0, op1));
+ return;
+ }
/* When SSE registers are split into halves, we can avoid
writing to the top half twice. */
}
else
{
- if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL)
- {
- op0 = gen_lowpart (V4SFmode, op0);
- op1 = gen_lowpart (V4SFmode, op1);
- emit_insn (gen_sse_movups (op0, op1));
- return;
+ if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
+ {
+ op0 = gen_lowpart (V4SFmode, op0);
+ op1 = gen_lowpart (V4SFmode, op1);
+ emit_insn (gen_sse_movups (op0, op1));
+ return;
}
if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
else if (MEM_P (op0))
{
/* If we're optimizing for size, movups is the smallest. */
- if (optimize_insn_for_size_p ())
+ if (optimize_insn_for_size_p ()
+ || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
{
op0 = gen_lowpart (V4SFmode, op0);
op1 = gen_lowpart (V4SFmode, op1);
if (TARGET_SSE2 && mode == V2DFmode)
{
- m = adjust_address (op0, DFmode, 0);
- emit_insn (gen_sse2_storelpd (m, op1));
- m = adjust_address (op0, DFmode, 8);
- emit_insn (gen_sse2_storehpd (m, op1));
+ if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
+ {
+ op0 = gen_lowpart (V2DFmode, op0);
+ op1 = gen_lowpart (V2DFmode, op1);
+ emit_insn (gen_sse2_movupd (op0, op1));
+ }
+ else
+ {
+ m = adjust_address (op0, DFmode, 0);
+ emit_insn (gen_sse2_storelpd (m, op1));
+ m = adjust_address (op0, DFmode, 8);
+ emit_insn (gen_sse2_storehpd (m, op1));
+ }
}
else
{
if (mode != V4SFmode)
op1 = gen_lowpart (V4SFmode, op1);
- m = adjust_address (op0, V2SFmode, 0);
- emit_insn (gen_sse_storelps (m, op1));
- m = adjust_address (op0, V2SFmode, 8);
- emit_insn (gen_sse_storehps (m, op1));
+
+ if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
+ {
+ op0 = gen_lowpart (V4SFmode, op0);
+ emit_insn (gen_sse_movups (op0, op1));
+ }
+ else
+ {
+ m = adjust_address (op0, V2SFmode, 0);
+ emit_insn (gen_sse_storelps (m, op1));
+ m = adjust_address (op0, V2SFmode, 8);
+ emit_insn (gen_sse_storehps (m, op1));
+ }
}
}
else
rtx prev = PREV_INSN (insn);
while (prev && distance < LEA_SEARCH_THRESHOLD)
{
- if (INSN_P (prev))
+ if (NONDEBUG_INSN_P (prev))
{
distance++;
for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
&& prev != insn
&& distance < LEA_SEARCH_THRESHOLD)
{
- if (INSN_P (prev))
+ if (NONDEBUG_INSN_P (prev))
{
distance++;
for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
rtx next = NEXT_INSN (insn);
while (next && distance < LEA_SEARCH_THRESHOLD)
{
- if (INSN_P (next))
+ if (NONDEBUG_INSN_P (next))
{
distance++;
&& next != insn
&& distance < LEA_SEARCH_THRESHOLD)
{
- if (INSN_P (next))
+ if (NONDEBUG_INSN_P (next))
{
distance++;
enum rtx_code code = GET_CODE (operands[1]), compare_code;
rtx compare_seq, compare_op;
enum machine_mode mode = GET_MODE (operands[0]);
- bool sign_bit_compare_p = false;;
+ bool sign_bit_compare_p = false;
start_sequence ();
ix86_compare_op0 = XEXP (operands[1], 0);
if (!sign_bit_compare_p)
{
rtx flags;
- rtx (*insn)(rtx, rtx, rtx);
bool fpcmp = false;
compare_code = GET_CODE (compare_op);
tmp = gen_reg_rtx (mode);
if (mode == DImode)
- insn = gen_x86_movdicc_0_m1;
+ emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
else
- insn = gen_x86_movsicc_0_m1;
-
- emit_insn (insn (tmp, flags, compare_op));
+ emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
+ flags, compare_op));
}
else
{
: gen_x86_64_shld) (high[0], low[0], operands[2]));
}
- emit_insn ((mode == DImode ? gen_ashlsi3 : gen_ashldi3) (low[0], low[0], operands[2]));
+ emit_insn ((mode == DImode
+ ? gen_ashlsi3
+ : gen_ashldi3) (low[0], low[0], operands[2]));
if (TARGET_CMOVE && scratch)
{
ix86_expand_clear (scratch);
emit_insn ((mode == DImode
- ? gen_x86_shift_adj_1
- : gen_x86_64_shift_adj_1) (high[0], low[0], operands[2],
- scratch));
+ ? gen_x86_shiftsi_adj_1
+ : gen_x86_shiftdi_adj_1) (high[0], low[0], operands[2],
+ scratch));
}
else
emit_insn ((mode == DImode
- ? gen_x86_shift_adj_2
- : gen_x86_64_shift_adj_2) (high[0], low[0], operands[2]));
+ ? gen_x86_shiftsi_adj_2
+ : gen_x86_shiftdi_adj_2) (high[0], low[0], operands[2]));
}
void
: gen_ashrdi3) (scratch, scratch,
GEN_INT (single_width - 1)));
emit_insn ((mode == DImode
- ? gen_x86_shift_adj_1
- : gen_x86_64_shift_adj_1) (low[0], high[0], operands[2],
- scratch));
+ ? gen_x86_shiftsi_adj_1
+ : gen_x86_shiftdi_adj_1) (low[0], high[0], operands[2],
+ scratch));
}
else
emit_insn ((mode == DImode
- ? gen_x86_shift_adj_3
- : gen_x86_64_shift_adj_3) (low[0], high[0], operands[2]));
+ ? gen_x86_shiftsi_adj_3
+ : gen_x86_shiftdi_adj_3) (low[0], high[0], operands[2]));
}
}
{
ix86_expand_clear (scratch);
emit_insn ((mode == DImode
- ? gen_x86_shift_adj_1
- : gen_x86_64_shift_adj_1) (low[0], high[0], operands[2],
- scratch));
+ ? gen_x86_shiftsi_adj_1
+ : gen_x86_shiftdi_adj_1) (low[0], high[0], operands[2],
+ scratch));
}
else
emit_insn ((mode == DImode
- ? gen_x86_shift_adj_2
- : gen_x86_64_shift_adj_2) (low[0], high[0], operands[2]));
+ ? gen_x86_shiftsi_adj_2
+ : gen_x86_shiftdi_adj_2) (low[0], high[0], operands[2]));
}
}
case PROCESSOR_NOCONA:
case PROCESSOR_GENERIC32:
case PROCESSOR_GENERIC64:
+ case PROCESSOR_BDVER1:
return 3;
case PROCESSOR_CORE2:
case PROCESSOR_ATHLON:
case PROCESSOR_K8:
case PROCESSOR_AMDFAM10:
+ case PROCESSOR_BDVER1:
case PROCESSOR_ATOM:
case PROCESSOR_GENERIC32:
case PROCESSOR_GENERIC64:
}
/* x86-64 ABI requires arrays greater than 16 bytes to be aligned
- to 16byte boundary. */
- if (TARGET_64BIT)
+     to 16byte boundary.  The exact wording is:
+
+       An array uses the same alignment as its elements, except that a local
+       or global array variable of length at least 16 bytes or a C99
+       variable-length array variable always has alignment of at least 16
+       bytes.
+
+     This was added to allow use of aligned SSE instructions on arrays.  The
+     rule is meant for static storage (where the compiler cannot do the
+     analysis by itself).  We follow it for automatic variables only when
+     convenient: we fully control everything in the function being compiled,
+     and functions from other units cannot rely on the alignment.
+
+     Exclude the va_list type.  It is the common case of a local array where
+     we cannot benefit from the alignment.  */
+ if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
+ && TARGET_SSE)
{
if (AGGREGATE_TYPE_P (type)
+ && (TYPE_MAIN_VARIANT (type)
+ != TYPE_MAIN_VARIANT (va_list_type_node))
&& TYPE_SIZE (type)
&& TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
&& (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
us with EAX for the static chain. */
regno = AX_REG;
}
+ else if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)))
+ {
+ /* Thiscall functions use ecx for arguments, which leaves
+ us with EAX for the static chain. */
+ regno = AX_REG;
+ }
else if (ix86_function_regparm (fntype, fndecl) == 3)
{
/* For regparm 3, we have no free call-clobbered registers in
IX86_BUILTIN_VPERMILPS,
IX86_BUILTIN_VPERMILPD256,
IX86_BUILTIN_VPERMILPS256,
+ IX86_BUILTIN_VPERMIL2PD,
+ IX86_BUILTIN_VPERMIL2PS,
+ IX86_BUILTIN_VPERMIL2PD256,
+ IX86_BUILTIN_VPERMIL2PS256,
IX86_BUILTIN_VPERM2F128PD256,
IX86_BUILTIN_VPERM2F128PS256,
IX86_BUILTIN_VPERM2F128SI256,
};
/* FMA4 and XOP. */
+#define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
+#define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
+#define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
+#define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
#define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
#define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
#define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
{ OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
{ OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
+ { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
+ { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
+ { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
+ { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
+
};
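These builtins are normally reached through the XOP intrinsics; a usage sketch, not part of the patch, assuming the xopintrin.h spelling _mm_permute2_pd and compilation with -mxop:

    #include <x86intrin.h>

    /* Select lanes from A and B under control of SEL; the trailing 0 is
       the 2-bit immediate whose range is validated above.  */
    __m128d
    pick (__m128d a, __m128d b, __m128i sel)
    {
      return _mm_permute2_pd (a, b, sel, 0);
    }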
/* Set up all the MMX/SSE builtins, even builtins for instructions that are not
switch (m_type)
{
+ case MULTI_ARG_4_DF2_DI_I:
+ case MULTI_ARG_4_DF2_DI_I1:
+ case MULTI_ARG_4_SF2_SI_I:
+ case MULTI_ARG_4_SF2_SI_I1:
+ nargs = 4;
+ last_arg_constant = true;
+ break;
+
case MULTI_ARG_3_SF:
case MULTI_ARG_3_DF:
case MULTI_ARG_3_SF2:
pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
break;
+ case 4:
+ pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
+ break;
+
default:
gcc_unreachable ();
}
nargs = 3;
nargs_constant = 2;
break;
+ case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
+ case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
+ case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
+ case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
+ nargs = 4;
+ nargs_constant = 1;
+ break;
case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
nargs = 4;
nargs_constant = 2;
case CODE_FOR_sse4_1_blendpd:
case CODE_FOR_avx_vpermilv2df:
+ case CODE_FOR_xop_vpermil2v2df3:
+ case CODE_FOR_xop_vpermil2v4sf3:
+ case CODE_FOR_xop_vpermil2v4df3:
+ case CODE_FOR_xop_vpermil2v8sf3:
error ("the last argument must be a 2-bit immediate");
return const0_rtx;
if it is not available. */
static tree
-ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
+ix86_builtin_vectorized_function (tree fndecl, tree type_out,
tree type_in)
{
enum machine_mode in_mode, out_mode;
int in_n, out_n;
+ enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
if (TREE_CODE (type_out) != VECTOR_TYPE
- || TREE_CODE (type_in) != VECTOR_TYPE)
+ || TREE_CODE (type_in) != VECTOR_TYPE
+ || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
return NULL_TREE;
out_mode = TYPE_MODE (TREE_TYPE (type_out));
/* Returns a decl of a function that implements conversion of an integer vector
- into a floating-point vector, or vice-versa. TYPE is the type of the integer
- side of the conversion.
+ into a floating-point vector, or vice-versa. DEST_TYPE and SRC_TYPE
+ are the types involved when converting according to CODE.
Return NULL_TREE if it is not available. */
static tree
-ix86_vectorize_builtin_conversion (unsigned int code, tree type)
+ix86_vectorize_builtin_conversion (unsigned int code,
+ tree dest_type, tree src_type)
{
- if (! (TARGET_SSE2 && TREE_CODE (type) == VECTOR_TYPE))
+ if (! TARGET_SSE2)
return NULL_TREE;
switch (code)
{
case FLOAT_EXPR:
- switch (TYPE_MODE (type))
+ switch (TYPE_MODE (src_type))
{
case V4SImode:
- return TYPE_UNSIGNED (type)
- ? ix86_builtins[IX86_BUILTIN_CVTUDQ2PS]
- : ix86_builtins[IX86_BUILTIN_CVTDQ2PS];
+ switch (TYPE_MODE (dest_type))
+ {
+ case V4SFmode:
+ return (TYPE_UNSIGNED (src_type)
+ ? ix86_builtins[IX86_BUILTIN_CVTUDQ2PS]
+ : ix86_builtins[IX86_BUILTIN_CVTDQ2PS]);
+ case V4DFmode:
+ return (TYPE_UNSIGNED (src_type)
+ ? NULL_TREE
+ : ix86_builtins[IX86_BUILTIN_CVTDQ2PD256]);
+ default:
+ return NULL_TREE;
+ }
+ break;
+ case V8SImode:
+ switch (TYPE_MODE (dest_type))
+ {
+ case V8SFmode:
+ return (TYPE_UNSIGNED (src_type)
+ ? NULL_TREE
+ : ix86_builtins[IX86_BUILTIN_CVTDQ2PS]);
+ default:
+ return NULL_TREE;
+ }
+ break;
default:
return NULL_TREE;
}
case FIX_TRUNC_EXPR:
- switch (TYPE_MODE (type))
+ switch (TYPE_MODE (dest_type))
{
case V4SImode:
- return TYPE_UNSIGNED (type)
- ? NULL_TREE
- : ix86_builtins[IX86_BUILTIN_CVTTPS2DQ];
+ switch (TYPE_MODE (src_type))
+ {
+ case V4SFmode:
+ return (TYPE_UNSIGNED (dest_type)
+ ? NULL_TREE
+ : ix86_builtins[IX86_BUILTIN_CVTTPS2DQ]);
+ case V4DFmode:
+ return (TYPE_UNSIGNED (dest_type)
+ ? NULL_TREE
+ : ix86_builtins[IX86_BUILTIN_CVTTPD2DQ256]);
+ default:
+ return NULL_TREE;
+ }
+ break;
+
+ case V8SImode:
+ switch (TYPE_MODE (src_type))
+ {
+ case V8SFmode:
+ return (TYPE_UNSIGNED (dest_type)
+ ? NULL_TREE
+ : ix86_builtins[IX86_BUILTIN_CVTTPS2DQ256]);
+ default:
+ return NULL_TREE;
+ }
+ break;
+
default:
return NULL_TREE;
}
+
default:
return NULL_TREE;
-
}
+
+ return NULL_TREE;
}
/* Returns a code for a target-specific builtin that implements
if (!CONST_INT_P (er))
return 0;
ei = INTVAL (er);
- if (ei >= 2 * nelt)
+ if (ei >= nelt)
return 0;
ipar[i] = ei;
}
fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
fprintf (file, ASM_LONG "%s\n", binder_name);
}
-
-void
-darwin_x86_file_end (void)
-{
- darwin_file_end ();
- ix86_file_end ();
-}
#endif /* TARGET_MACHO */
/* Order the registers for register allocator. */
if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type)))
regno = aggr ? DX_REG : CX_REG;
+ else if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (type)))
+ {
+ regno = CX_REG;
+ if (aggr)
+ return gen_rtx_MEM (SImode,
+ plus_constant (stack_pointer_rtx, 4));
+ }
else
{
regno = AX_REG;
*(*this + vcall_offset) should be added to THIS. */
static void
-x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED,
+x86_output_mi_thunk (FILE *file,
tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
HOST_WIDE_INT vcall_offset, tree function)
{
rtx this_param = x86_this_parameter (function);
rtx this_reg, tmp;
+ /* Make sure unwind info is emitted for the thunk if needed. */
+ final_start_function (emit_barrier (), file, 1);
+
/* If VCALL_OFFSET, we'll need THIS in a register. Might as well
pull it in now and let DELTA benefit. */
if (REG_P (this_param))
/* Adjust the this parameter by a fixed constant. */
if (delta)
{
- /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
- Exceptions: -128 encodes smaller than 128, so swap sign and op. */
- bool sub = delta < 0 || delta == 128;
- xops[0] = GEN_INT (sub ? -delta : delta);
+ xops[0] = GEN_INT (delta);
xops[1] = this_reg ? this_reg : this_param;
if (TARGET_64BIT)
{
xops[0] = tmp;
xops[1] = this_param;
}
- if (sub)
+ if (x86_maybe_negate_const_int (&xops[0], DImode))
output_asm_insn ("sub{q}\t{%0, %1|%1, %0}", xops);
else
output_asm_insn ("add{q}\t{%0, %1|%1, %0}", xops);
}
- else if (sub)
+ else if (x86_maybe_negate_const_int (&xops[0], SImode))
output_asm_insn ("sub{l}\t{%0, %1|%1, %0}", xops);
else
output_asm_insn ("add{l}\t{%0, %1|%1, %0}", xops);
{
int tmp_regno = CX_REG;
if (lookup_attribute ("fastcall",
- TYPE_ATTRIBUTES (TREE_TYPE (function))))
+ TYPE_ATTRIBUTES (TREE_TYPE (function)))
+ || lookup_attribute ("thiscall",
+ TYPE_ATTRIBUTES (TREE_TYPE (function))))
tmp_regno = AX_REG;
tmp = gen_rtx_REG (SImode, tmp_regno);
}
output_asm_insn ("jmp\t{*}%1", xops);
}
}
+ final_end_function ();
}
static void
if (TARGET_64BIT)
{
#ifndef NO_PROFILE_COUNTERS
- fprintf (file, "\tleaq\t" LPREFIX "P%d@(%%rip),%%r11\n", labelno);
+ fprintf (file, "\tleaq\t" LPREFIX "P%d(%%rip),%%r11\n", labelno);
#endif
if (DEFAULT_ABI == SYSV_ABI && flag_pic)
replace = true;
/* Empty functions get branch mispredict even when the jump destination
is not visible to us. */
- if (!prev && cfun->function_frequency > FUNCTION_FREQUENCY_UNLIKELY_EXECUTED)
+ if (!prev && !optimize_function_for_size_p (cfun))
replace = true;
}
if (replace)
extended_reg_mentioned_1, NULL);
}
+/* If profitable, negate (without causing overflow) the integer constant
+   of mode MODE at location LOC.  Return true if the constant was negated.  */
+bool
+x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
+{
+ HOST_WIDE_INT val;
+
+ if (!CONST_INT_P (*loc))
+ return false;
+
+ switch (mode)
+ {
+ case DImode:
+ /* DImode x86_64 constants must fit in 32 bits. */
+ gcc_assert (x86_64_immediate_operand (*loc, mode));
+
+ mode = SImode;
+ break;
+
+ case SImode:
+ case HImode:
+ case QImode:
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ /* Avoid overflows. */
+ if (mode_signbit_p (mode, *loc))
+ return false;
+
+ val = INTVAL (*loc);
+
+ /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
+ Exceptions: -128 encodes smaller than 128, so swap sign and op. */
+ if ((val < 0 && val != -128)
+ || val == 128)
+ {
+ *loc = GEN_INT (-val);
+ return true;
+ }
+
+ return false;
+}
+
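A worked example of what the helper buys its callers (see the thunk code earlier in this patch):

    /* Sketch, not part of the patch:
	 addl $-4,  %eax  becomes  subl $4,    %eax  (prettier)
	 addl $128, %eax  becomes  subl $-128, %eax  (-128 fits in imm8,
						      128 does not)
       -128 itself is left alone, since negating it to 128 would lose the
       imm8 encoding.  */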
/* Generate an unsigned DImode/SImode to FP conversion. This is the same code
optabs would emit if we didn't have TFmode patterns. */
/* Fastcall attribute says callee is responsible for popping arguments
if they are not variable. */
{ "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute },
+ /* Thiscall attribute says callee is responsible for popping arguments
+ if they are not variable. */
+ { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute },
/* Cdecl attribute says the callee is a normal C declaration */
{ "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute },
/* Regparm attribute specifies how many integer arguments are to be
do_subreg:
vmode = V8HImode;
target = gen_lowpart (vmode, target);
- op0 = gen_lowpart (vmode, target);
- op1 = gen_lowpart (vmode, target);
+ op0 = gen_lowpart (vmode, op0);
+ op1 = gen_lowpart (vmode, op1);
break;
default:
}
/* This matches five different patterns with the different modes. */
- x = gen_rtx_VEC_MERGE (vmode, op0, op1, GEN_INT (mask));
+ x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
x = gen_rtx_SET (VOIDmode, target, x);
emit_insn (x);
input where SEL+CONCAT may not. */
if (d->op0 == d->op1)
{
- if (expand_vselect (d->target, d->op0, d->perm, nelt))
+ int mask = nelt - 1;
+
+ for (i = 0; i < nelt; i++)
+ perm2[i] = d->perm[i] & mask;
+
+ if (expand_vselect (d->target, d->op0, perm2, nelt))
return true;
/* There are plenty of patterns in sse.md that are written for
every other permutation operand. */
for (i = 0; i < nelt; i += 2)
{
- perm2[i] = d->perm[i];
- perm2[i+1] = d->perm[i+1] + nelt;
+ perm2[i] = d->perm[i] & mask;
+ perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
}
if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
return true;
/* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
if (nelt >= 4)
{
- memcpy (perm2, d->perm, nelt);
- for (i = 2; i < nelt; i += 4)
+ for (i = 0; i < nelt; i += 4)
{
- perm2[i+0] += nelt;
- perm2[i+1] += nelt;
+ perm2[i + 0] = d->perm[i + 0] & mask;
+ perm2[i + 1] = d->perm[i + 1] & mask;
+ perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
+ perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
}
if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE ix86_function_value
+#undef TARGET_FUNCTION_VALUE_REGNO_P
+#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p
+
#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD ix86_secondary_reload
#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE ix86_can_eliminate
+#undef TARGET_ASM_CODE_END
+#define TARGET_ASM_CODE_END ix86_code_end
+
struct gcc_target targetm = TARGET_INITIALIZER;
\f
#include "gt-i386.h"