/* Subroutines used for code generation on IA-32.
Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
- 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
Free Software Foundation, Inc.
This file is part of GCC.
<- HARD_FRAME_POINTER
[saved regs]
+ [padding0]
+
+ [saved SSE regs]
+
[padding1] \
)
[va_arg registers] (
*/
struct ix86_frame
{
+ int padding0;
+ int nsseregs;
int nregs;
int padding1;
int va_arg_size;
{ "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
};
- const char *opts[ (sizeof (isa_opts) / sizeof (isa_opts[0])
- + sizeof (flag_opts) / sizeof (flag_opts[0])
- + 6)][2];
+ const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
char isa_other[40];
char target_other[40];
}
/* Pick out the options in isa options. */
- for (i = 0; i < sizeof (isa_opts) / sizeof (isa_opts[0]); i++)
+ for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
{
if ((isa & isa_opts[i].mask) != 0)
{
}
/* Add flag options. */
- for (i = 0; i < sizeof (flag_opts) / sizeof (flag_opts[0]); i++)
+ for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
{
if ((flags & flag_opts[i].mask) != 0)
{
if (num == 0)
return NULL;
- gcc_assert (num < sizeof (opts) / sizeof (opts[0]));
+ gcc_assert (num < ARRAY_SIZE (opts));
/* Size the string. */
len = 0;
{"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
| PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
- {"barcelona", PROCESSOR_AMDFAM10, PROCESSOR_AMDFAM10,
+ {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
| PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
{"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
IX86_ATTR_ISA ("abm", OPT_mabm),
IX86_ATTR_ISA ("aes", OPT_maes),
+ IX86_ATTR_ISA ("avx", OPT_mavx),
IX86_ATTR_ISA ("mmx", OPT_mmmx),
IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
/* Find the option. */
ch = *p;
opt = N_OPTS;
- for (i = 0; i < sizeof (attrs) / sizeof (attrs[0]); i++)
+ for (i = 0; i < ARRAY_SIZE (attrs); i++)
{
type = attrs[i].type;
opt_len = attrs[i].len;
&& ix86_function_regparm (TREE_TYPE (decl), NULL) >= 3)
return false;
+ /* If we need to align the outgoing stack, then sibcalling would
+ unalign the stack, which may break the called function. */
+ if (ix86_incoming_stack_boundary < PREFERRED_STACK_BOUNDARY)
+ return false;
+
/* Otherwise okay. That also includes certain types of indirect calls. */
return true;
}
int
ix86_reg_parm_stack_space (const_tree fndecl)
{
- int call_abi = 0;
- /* For libcalls it is possible that there is no fndecl at hand.
- Therefore assume for this case the default abi of the target. */
- if (!fndecl)
- call_abi = DEFAULT_ABI;
- else
+ int call_abi = SYSV_ABI;
+ if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
call_abi = ix86_function_abi (fndecl);
- if (call_abi == 1)
+ else
+ call_abi = ix86_function_type_abi (fndecl);
+ if (call_abi == MS_ABI)
return 32;
return 0;
}
/* Implementation of call abi switching target hook. Specific to FNDECL
the specific call register sets are set. See also CONDITIONAL_REGISTER_USAGE
- for more details.
- To prevent redudant calls of costy function init_regs (), it checks not to
- reset register usage for default abi. */
+ for more details. */
void
ix86_call_abi_override (const_tree fndecl)
{
cfun->machine->call_abi = DEFAULT_ABI;
else
cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
- if (TARGET_64BIT && cfun->machine->call_abi == MS_ABI)
- {
- if (call_used_regs[4 /*RSI*/] != 0 || call_used_regs[5 /*RDI*/] != 0)
- {
- call_used_regs[4 /*RSI*/] = 0;
- call_used_regs[5 /*RDI*/] = 0;
- init_regs ();
- }
- }
- else if (TARGET_64BIT)
- {
- if (call_used_regs[4 /*RSI*/] != 1 || call_used_regs[5 /*RDI*/] != 1)
- {
- call_used_regs[4 /*RSI*/] = 1;
- call_used_regs[5 /*RDI*/] = 1;
- init_regs ();
- }
- }
+}
+
+/* MS and SYSV ABI have different set of call used registers. Avoid expensive
+ re-initialization of init_regs each time we switch function context since
+ this is needed only during RTL expansion. */
+static void
+ix86_maybe_switch_abi (void)
+{
+ /* RSI is call-clobbered under SYSV but call-saved under MS ABI, so its
+ call_used_regs entry reveals which ABI the register tables were last
+ initialized for; re-run init_regs only on a mismatch. */
+ if (TARGET_64BIT &&
+ call_used_regs[4 /*RSI*/] == (cfun->machine->call_abi == MS_ABI))
+ init_regs ();
+}
/* Initialize a variable CUM of type CUMULATIVE_ARGS
struct cgraph_local_info *i = fndecl ? cgraph_local_info (fndecl) : NULL;
memset (cum, 0, sizeof (*cum));
- cum->call_abi = ix86_function_type_abi (fntype);
+ if (fndecl)
+ cum->call_abi = ix86_function_abi (fndecl);
+ else
+ cum->call_abi = ix86_function_type_abi (fntype);
/* Set up the number of registers to use for passing arguments. */
+
+ if (cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
+ sorry ("ms_abi attribute require -maccumulate-outgoing-args or subtarget optimization implying it");
cum->nregs = ix86_regparm;
if (TARGET_64BIT)
{
{
unsigned int regno;
- /* Avoid the AL settings for the Unix64 ABI. */
+ /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
+ We use the value of -2 to specify that the current function call is MS ABI. */
if (mode == VOIDmode)
- return constm1_rtx;
+ return GEN_INT (-2);
/* If we've run out of registers, it goes on the stack. */
if (cum->nregs == 0)
&& (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
}
-/* Return number of registers to be saved on the stack. */
+/* Return number of saved general purpose registers. */
static int
ix86_nsaved_regs (void)
int nregs = 0;
int regno;
- for (regno = FIRST_PSEUDO_REGISTER - 1; regno >= 0; regno--)
- if (ix86_save_reg (regno, true))
- nregs++;
+ for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
+ nregs ++;
+ return nregs;
+}
+
+/* Return number of saved SSE registers. Only the MS ABI has call-saved
+ SSE registers; for SYSV this is always zero. */
+
+static int
+ix86_nsaved_sseregs (void)
+{
+ int nregs = 0;
+ int regno;
+
+ if (ix86_cfun_abi () != MS_ABI)
+ return 0;
+ for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
+ nregs ++;
 return nregs;
}
HOST_WIDE_INT size = get_frame_size ();
frame->nregs = ix86_nsaved_regs ();
+ frame->nsseregs = ix86_nsaved_sseregs ();
total_size = size;
stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
+ /* The MS ABI seems to require stack alignment to always be 16 except in
+ function prologues. */
+ if (ix86_cfun_abi () == MS_ABI && preferred_alignment < 16)
+ {
+ preferred_alignment = 16;
+ stack_alignment_needed = 16;
+ crtl->preferred_stack_boundary = 128;
+ crtl->stack_alignment_needed = 128;
+ }
+
gcc_assert (!size || stack_alignment_needed);
gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
gcc_assert (preferred_alignment <= stack_alignment_needed);
/* Register save area */
offset += frame->nregs * UNITS_PER_WORD;
+ /* Align SSE reg save area. */
+ if (frame->nsseregs)
+ frame->padding0 = ((offset + 16 - 1) & -16) - offset;
+ else
+ frame->padding0 = 0;
+
+ /* SSE register save area. */
+ offset += frame->padding0 + frame->nsseregs * 16;
+
/* Va-arg area */
frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
offset += frame->va_arg_size;
frame->stack_pointer_offset -= frame->red_zone_size;
#if 0
fprintf (stderr, "\n");
- fprintf (stderr, "nregs: %ld\n", (long)frame->nregs);
fprintf (stderr, "size: %ld\n", (long)size);
+ fprintf (stderr, "nregs: %ld\n", (long)frame->nregs);
+ fprintf (stderr, "nsseregs: %ld\n", (long)frame->nsseregs);
+ fprintf (stderr, "padding0: %ld\n", (long)frame->padding0);
fprintf (stderr, "alignment1: %ld\n", (long)stack_alignment_needed);
fprintf (stderr, "padding1: %ld\n", (long)frame->padding1);
fprintf (stderr, "va_arg: %ld\n", (long)frame->va_arg_size);
unsigned int regno;
rtx insn;
- for (regno = FIRST_PSEUDO_REGISTER; regno-- > 0; )
- if (ix86_save_reg (regno, true))
+ for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
+ if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
{
insn = emit_insn (gen_push (gen_rtx_REG (Pmode, regno)));
RTX_FRAME_RELATED_P (insn) = 1;
rtx insn;
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
- if (ix86_save_reg (regno, true))
+ if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
{
insn = emit_move_insn (adjust_address (gen_rtx_MEM (Pmode, pointer),
Pmode, offset),
}
}
+/* Emit code to save SSE registers using MOV insns. First register
+ is stored at POINTER + OFFSET; each subsequent one 16 bytes higher. */
+static void
+ix86_emit_save_sse_regs_using_mov (rtx pointer, HOST_WIDE_INT offset)
+{
+ unsigned int regno;
+ rtx insn;
+ rtx mem;
+
+ for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
+ {
+ mem = adjust_address (gen_rtx_MEM (TImode, pointer), TImode, offset);
+ set_mem_align (mem, 128);
+ insn = emit_move_insn (mem, gen_rtx_REG (TImode, regno));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ offset += 16;
+ }
+}
+
/* Expand prologue or epilogue stack adjustment.
The pattern exist to put a dependency on all ebp-based memory accesses.
STYLE should be negative if instructions should be marked as frame related,
TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
ix86_incoming_stack_boundary = MIN_STACK_BOUNDARY;
+ /* The incoming stack frame has to be aligned at least at
+ parm_stack_boundary. */
+ if (ix86_incoming_stack_boundary < crtl->parm_stack_boundary)
+ ix86_incoming_stack_boundary = crtl->parm_stack_boundary;
+
/* Stack at entrance of main is aligned by runtime. We use the
smallest incoming stack boundary. */
if (ix86_incoming_stack_boundary > MAIN_STACK_BOUNDARY
RTX_FRAME_RELATED_P (insn) = 1;
}
- allocate = frame.to_allocate;
+ allocate = frame.to_allocate + frame.nsseregs * 16 + frame.padding0;
if (!frame.save_regs_using_mov)
ix86_emit_save_regs ();
|| !frame.to_allocate
|| crtl->stack_realign_needed)
ix86_emit_save_regs_using_mov (stack_pointer_rtx,
- frame.to_allocate);
+ frame.to_allocate
+ + frame.nsseregs * 16 + frame.padding0);
else
ix86_emit_save_regs_using_mov (hard_frame_pointer_rtx,
-frame.nregs * UNITS_PER_WORD);
}
+ if (!frame_pointer_needed
+ || !frame.to_allocate
+ || crtl->stack_realign_needed)
+ ix86_emit_save_sse_regs_using_mov (stack_pointer_rtx,
+ frame.to_allocate);
+ else
+ ix86_emit_save_sse_regs_using_mov (hard_frame_pointer_rtx,
+ - frame.nregs * UNITS_PER_WORD
+ - frame.nsseregs * 16
+ - frame.padding0);
pic_reg_used = false;
if (pic_offset_table_rtx
rtx base_address = gen_rtx_MEM (Pmode, pointer);
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
- if (ix86_save_reg (regno, maybe_eh_return))
+ if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
{
/* Ensure that adjust_address won't be forced to produce pointer
out of range allowed by x86-64 instruction set. */
offset = 0;
}
emit_move_insn (gen_rtx_REG (Pmode, regno),
- adjust_address (base_address, Pmode, offset));
+ adjust_address (base_address, Pmode, offset));
offset += UNITS_PER_WORD;
}
}
+/* Emit code to restore saved SSE registers using MOV insns. First register
+ is restored from POINTER + OFFSET; each subsequent one 16 bytes higher. */
+static void
+ix86_emit_restore_sse_regs_using_mov (rtx pointer, HOST_WIDE_INT offset,
+ int maybe_eh_return)
+{
+ int regno;
+ rtx base_address = gen_rtx_MEM (TImode, pointer);
+ rtx mem;
+
+ for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
+ {
+ /* Ensure that adjust_address won't be forced to produce pointer
+ out of range allowed by x86-64 instruction set. */
+ if (TARGET_64BIT && offset != trunc_int_for_mode (offset, SImode))
+ {
+ rtx r11;
+
+ r11 = gen_rtx_REG (DImode, R11_REG);
+ emit_move_insn (r11, GEN_INT (offset));
+ emit_insn (gen_adddi3 (r11, r11, pointer));
+ base_address = gen_rtx_MEM (TImode, r11);
+ offset = 0;
+ }
+ mem = adjust_address (base_address, TImode, offset);
+ set_mem_align (mem, 128);
+ emit_move_insn (gen_rtx_REG (TImode, regno), mem);
+ offset += 16;
+ }
+}
+
/* Restore function stack, frame, and registers. */
void
if (crtl->calls_eh_return && style != 2)
offset -= 2;
offset *= -UNITS_PER_WORD;
+ offset -= frame.nsseregs * 16 + frame.padding0;
/* If we're only restoring one register and sp is not valid then
using a move instruction to restore the register since it's
if (!frame_pointer_needed
|| (sp_valid && !frame.to_allocate)
|| stack_realign_fp)
- ix86_emit_restore_regs_using_mov (stack_pointer_rtx,
- frame.to_allocate, style == 2);
+ {
+ ix86_emit_restore_sse_regs_using_mov (stack_pointer_rtx,
+ frame.to_allocate, style == 2);
+ ix86_emit_restore_regs_using_mov (stack_pointer_rtx,
+ frame.to_allocate
+ + frame.nsseregs * 16
+ + frame.padding0, style == 2);
+ }
else
- ix86_emit_restore_regs_using_mov (hard_frame_pointer_rtx,
- offset, style == 2);
+ {
+ ix86_emit_restore_sse_regs_using_mov (hard_frame_pointer_rtx,
+ offset, style == 2);
+ ix86_emit_restore_regs_using_mov (hard_frame_pointer_rtx,
+ offset
+ + frame.nsseregs * 16
+ + frame.padding0, style == 2);
+ }
/* eh_return epilogues need %ecx added to the stack pointer. */
if (style == 2)
{
tmp = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
tmp = plus_constant (tmp, (frame.to_allocate
- + frame.nregs * UNITS_PER_WORD));
+ + frame.nregs * UNITS_PER_WORD
+ + frame.nsseregs * 16
+ + frame.padding0));
emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, tmp));
}
}
else if (!frame_pointer_needed)
pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
GEN_INT (frame.to_allocate
- + frame.nregs * UNITS_PER_WORD),
+ + frame.nregs * UNITS_PER_WORD
+ + frame.nsseregs * 16
+ + frame.padding0),
style);
/* If not an i386, mov & pop is faster than "leave". */
else if (TARGET_USE_LEAVE || optimize_function_for_size_p (cfun)
pro_epilogue_adjust_stack (stack_pointer_rtx,
hard_frame_pointer_rtx,
GEN_INT (offset), style);
+ ix86_emit_restore_sse_regs_using_mov (stack_pointer_rtx,
+ frame.to_allocate, style == 2);
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (frame.nsseregs * 16), style);
+ }
+ else if (frame.to_allocate || frame.nsseregs)
+ {
+ ix86_emit_restore_sse_regs_using_mov (stack_pointer_rtx,
+ frame.to_allocate,
+ style == 2);
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (frame.to_allocate
+ + frame.nsseregs * 16
+ + frame.padding0), style);
}
- else if (frame.to_allocate)
- pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
- GEN_INT (frame.to_allocate), style);
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
- if (ix86_save_reg (regno, false))
+ if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
emit_insn ((*ix86_gen_pop1) (gen_rtx_REG (Pmode, regno)));
if (frame_pointer_needed)
{
destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
}
+ if (CONST_INT_P (count))
+ {
+ count = GEN_INT (INTVAL (count)
+ & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
+ destmem = shallow_copy_rtx (destmem);
+ srcmem = shallow_copy_rtx (srcmem);
+ set_mem_size (destmem, count);
+ set_mem_size (srcmem, count);
+ }
+ else
+ {
+ if (MEM_SIZE (destmem))
+ set_mem_size (destmem, NULL_RTX);
+ if (MEM_SIZE (srcmem))
+ set_mem_size (srcmem, NULL_RTX);
+ }
emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
destexp, srcexp));
}
Arguments have same meaning as for previous function */
static void
expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
- rtx count,
- enum machine_mode mode)
+ rtx count, enum machine_mode mode,
+ rtx orig_value)
{
rtx destexp;
rtx countreg;
}
else
destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
+ if (orig_value == const0_rtx && CONST_INT_P (count))
+ {
+ count = GEN_INT (INTVAL (count)
+ & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
+ destmem = shallow_copy_rtx (destmem);
+ set_mem_size (destmem, count);
+ }
+ else if (MEM_SIZE (destmem))
+ set_mem_size (destmem, NULL_RTX);
emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
}
gcc_assert (desired_alignment <= 8);
}
+/* Copy enough from SRC to DST to align DST known to DESIRED_ALIGN.
+ ALIGN_BYTES is how many bytes need to be copied. */
+static rtx
+expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
+ int desired_align, int align_bytes)
+{
+ rtx src = *srcp;
+ rtx src_size, dst_size;
+ int off = 0;
+ int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
+ if (src_align_bytes >= 0)
+ src_align_bytes = desired_align - src_align_bytes;
+ src_size = MEM_SIZE (src);
+ dst_size = MEM_SIZE (dst);
+ if (align_bytes & 1)
+ {
+ dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
+ src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
+ off = 1;
+ emit_insn (gen_strmov (destreg, dst, srcreg, src));
+ }
+ if (align_bytes & 2)
+ {
+ dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
+ src = adjust_automodify_address_nv (src, HImode, srcreg, off);
+ if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
+ set_mem_align (dst, 2 * BITS_PER_UNIT);
+ if (src_align_bytes >= 0
+ && (src_align_bytes & 1) == (align_bytes & 1)
+ && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
+ set_mem_align (src, 2 * BITS_PER_UNIT);
+ off = 2;
+ emit_insn (gen_strmov (destreg, dst, srcreg, src));
+ }
+ if (align_bytes & 4)
+ {
+ dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
+ src = adjust_automodify_address_nv (src, SImode, srcreg, off);
+ if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
+ set_mem_align (dst, 4 * BITS_PER_UNIT);
+ if (src_align_bytes >= 0)
+ {
+ unsigned int src_align = 0;
+ if ((src_align_bytes & 3) == (align_bytes & 3))
+ src_align = 4;
+ else if ((src_align_bytes & 1) == (align_bytes & 1))
+ src_align = 2;
+ if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
+ set_mem_align (src, src_align * BITS_PER_UNIT);
+ }
+ off = 4;
+ emit_insn (gen_strmov (destreg, dst, srcreg, src));
+ }
+ dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
+ src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
+ if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
+ set_mem_align (dst, desired_align * BITS_PER_UNIT);
+ if (src_align_bytes >= 0)
+ {
+ unsigned int src_align = 0;
+ if ((src_align_bytes & 7) == (align_bytes & 7))
+ src_align = 8;
+ else if ((src_align_bytes & 3) == (align_bytes & 3))
+ src_align = 4;
+ else if ((src_align_bytes & 1) == (align_bytes & 1))
+ src_align = 2;
+ if (src_align > (unsigned int) desired_align)
+ src_align = desired_align;
+ if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
+ set_mem_align (src, src_align * BITS_PER_UNIT);
+ }
+ /* Shrink the remembered sizes of both DST and SRC by the prologue
+ bytes already copied. */
+ if (dst_size)
+ set_mem_size (dst, GEN_INT (INTVAL (dst_size) - align_bytes));
+ if (src_size)
+ set_mem_size (src, GEN_INT (INTVAL (src_size) - align_bytes));
+ *srcp = src;
+ return dst;
+}
+
/* Set enough from DEST to align DEST known to by aligned by ALIGN to
DESIRED_ALIGNMENT. */
static void
gcc_assert (desired_alignment <= 8);
}
+/* Set enough of DST to align DST known to be aligned by ALIGN to
+ DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
+static rtx
+expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
+ int desired_align, int align_bytes)
+{
+ int off = 0;
+ rtx dst_size = MEM_SIZE (dst);
+ if (align_bytes & 1)
+ {
+ dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
+ off = 1;
+ emit_insn (gen_strset (destreg, dst,
+ gen_lowpart (QImode, value)));
+ }
+ if (align_bytes & 2)
+ {
+ dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
+ if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
+ set_mem_align (dst, 2 * BITS_PER_UNIT);
+ off = 2;
+ emit_insn (gen_strset (destreg, dst,
+ gen_lowpart (HImode, value)));
+ }
+ if (align_bytes & 4)
+ {
+ dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
+ if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
+ set_mem_align (dst, 4 * BITS_PER_UNIT);
+ off = 4;
+ emit_insn (gen_strset (destreg, dst,
+ gen_lowpart (SImode, value)));
+ }
+ dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
+ if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
+ set_mem_align (dst, desired_align * BITS_PER_UNIT);
+ if (dst_size)
+ set_mem_size (dst, GEN_INT (INTVAL (dst_size) - align_bytes));
+ return dst;
+}
+
/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
static enum stringop_alg
decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
unsigned HOST_WIDE_INT count = 0;
HOST_WIDE_INT expected_size = -1;
int size_needed = 0, epilogue_size_needed;
- int desired_align = 0;
+ int desired_align = 0, align_bytes = 0;
enum stringop_alg alg;
int dynamic_check;
bool need_zero_guard = false;
if (CONST_INT_P (expected_align_exp)
&& INTVAL (expected_align_exp) > align)
align = INTVAL (expected_align_exp);
+ /* ALIGN is the minimum of destination and source alignment, but we care here
+ just about destination alignment. */
+ else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
+ align = MEM_ALIGN (dst) / BITS_PER_UNIT;
+
if (CONST_INT_P (count_exp))
count = expected_size = INTVAL (count_exp);
if (CONST_INT_P (expected_size_exp) && count == 0)
/* Alignment code needs count to be in register. */
if (CONST_INT_P (count_exp) && desired_align > align)
- count_exp = force_reg (counter_mode (count_exp), count_exp);
+ {
+ if (INTVAL (count_exp) > desired_align
+ && INTVAL (count_exp) > size_needed)
+ {
+ align_bytes
+ = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
+ if (align_bytes <= 0)
+ align_bytes = 0;
+ else
+ align_bytes = desired_align - align_bytes;
+ }
+ if (align_bytes == 0)
+ count_exp = force_reg (counter_mode (count_exp), count_exp);
+ }
gcc_assert (desired_align >= 1 && align >= 1);
/* Ensure that alignment prologue won't copy past end of block. */
if (desired_align > align)
{
- /* Except for the first move in epilogue, we no longer know
- constant offset in aliasing info. It don't seems to worth
- the pain to maintain it for the first move, so throw away
- the info early. */
- src = change_address (src, BLKmode, srcreg);
- dst = change_address (dst, BLKmode, destreg);
- expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
- desired_align);
+ if (align_bytes == 0)
+ {
+ /* Except for the first move in epilogue, we no longer know
+ constant offset in aliasing info. It don't seems to worth
+ the pain to maintain it for the first move, so throw away
+ the info early. */
+ src = change_address (src, BLKmode, srcreg);
+ dst = change_address (dst, BLKmode, destreg);
+ expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
+ desired_align);
+ }
+ else
+ {
+ /* If we know how many bytes need to be stored before dst is
+ sufficiently aligned, maintain aliasing info accurately. */
+ dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
+ desired_align, align_bytes);
+ count_exp = plus_constant (count_exp, -align_bytes);
+ count -= align_bytes;
+ }
if (need_zero_guard && !count)
{
/* It is possible that we copied enough so the main loop will not
unsigned HOST_WIDE_INT count = 0;
HOST_WIDE_INT expected_size = -1;
int size_needed = 0, epilogue_size_needed;
- int desired_align = 0;
+ int desired_align = 0, align_bytes = 0;
enum stringop_alg alg;
rtx promoted_val = NULL;
bool force_loopy_epilogue = false;
/* Alignment code needs count to be in register. */
if (CONST_INT_P (count_exp) && desired_align > align)
{
- enum machine_mode mode = SImode;
- if (TARGET_64BIT && (count & ~0xffffffff))
- mode = DImode;
- count_exp = force_reg (mode, count_exp);
+ if (INTVAL (count_exp) > desired_align
+ && INTVAL (count_exp) > size_needed)
+ {
+ align_bytes
+ = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
+ if (align_bytes <= 0)
+ align_bytes = 0;
+ else
+ align_bytes = desired_align - align_bytes;
+ }
+ if (align_bytes == 0)
+ {
+ enum machine_mode mode = SImode;
+ if (TARGET_64BIT && (count & ~0xffffffff))
+ mode = DImode;
+ count_exp = force_reg (mode, count_exp);
+ }
}
/* Do the cheap promotion to allow better CSE across the
main loop and epilogue (ie one load of the big constant in the
if (size_needed > 1 || (desired_align > 1 && desired_align > align))
{
epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
- /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
+ /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
Make sure it is power of 2. */
epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
if (desired_align > align)
{
- /* Except for the first move in epilogue, we no longer know
- constant offset in aliasing info. It don't seems to worth
- the pain to maintain it for the first move, so throw away
- the info early. */
- dst = change_address (dst, BLKmode, destreg);
- expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
- desired_align);
+ if (align_bytes == 0)
+ {
+ /* Except for the first move in epilogue, we no longer know
+ constant offset in aliasing info. It don't seems to worth
+ the pain to maintain it for the first move, so throw away
+ the info early. */
+ dst = change_address (dst, BLKmode, destreg);
+ expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
+ desired_align);
+ }
+ else
+ {
+ /* If we know how many bytes need to be stored before dst is
+ sufficiently aligned, maintain aliasing info accurately. */
+ dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
+ desired_align, align_bytes);
+ count_exp = plus_constant (count_exp, -align_bytes);
+ count -= align_bytes;
+ }
if (need_zero_guard && !count)
{
/* It is possible that we copied enough so the main loop will not
break;
case rep_prefix_8_byte:
expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
- DImode);
+ DImode, val_exp);
break;
case rep_prefix_4_byte:
expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
- SImode);
+ SImode, val_exp);
break;
case rep_prefix_1_byte:
expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
- QImode);
+ QImode, val_exp);
break;
}
/* Adjust properly the offset of src and dest memory for aliasing. */
void
ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
- rtx callarg2 ATTRIBUTE_UNUSED,
+ rtx callarg2,
rtx pop, int sibcall)
{
rtx use = NULL, call;
+ enum calling_abi function_call_abi;
+ if (callarg2 && INTVAL (callarg2) == -2)
+ function_call_abi = MS_ABI;
+ else
+ function_call_abi = SYSV_ABI;
if (pop == const0_rtx)
pop = NULL;
gcc_assert (!TARGET_64BIT || !pop);
pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, call, pop));
+ gcc_assert (ix86_cfun_abi () != MS_ABI || function_call_abi != SYSV_ABI);
+ }
+ /* We need to represent that SI and DI registers are clobbered
+ by SYSV calls. */
+ if (ix86_cfun_abi () == MS_ABI && function_call_abi == SYSV_ABI)
+ {
+ static int clobbered_registers[] = {27, 28, 45, 46, 47, 48, 49, 50, 51,
+ 52, SI_REG, DI_REG};
+ unsigned int i;
+ rtx vec[ARRAY_SIZE (clobbered_registers) + 2];
+ rtx unspec = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
+ UNSPEC_MS_TO_SYSV_CALL);
+
+ vec[0] = call;
+ vec[1] = unspec;
+ for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
+ vec[i + 2] = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
+ ? TImode : DImode,
+ gen_rtx_REG
+ (SSE_REGNO_P (clobbered_registers[i])
+ ? TImode : DImode,
+ clobbered_registers[i]));
+
+ call = gen_rtx_PARALLEL (VOIDmode,
+ gen_rtvec_v (ARRAY_SIZE (clobbered_registers)
+ + 2, vec));
}
call = emit_call_insn (call);
IX86_BUILTIN_VPERMILPS,
IX86_BUILTIN_VPERMILPD256,
IX86_BUILTIN_VPERMILPS256,
- IX86_BUILTIN_VPERMIL2PD,
- IX86_BUILTIN_VPERMIL2PS,
- IX86_BUILTIN_VPERMIL2PD256,
- IX86_BUILTIN_VPERMIL2PS256,
IX86_BUILTIN_VPERM2F128PD256,
IX86_BUILTIN_VPERM2F128PS256,
IX86_BUILTIN_VPERM2F128SI256,
IX86_BUILTIN_STOREUPD256,
IX86_BUILTIN_STOREUPS256,
IX86_BUILTIN_LDDQU256,
+ IX86_BUILTIN_MOVNTDQ256,
+ IX86_BUILTIN_MOVNTPD256,
+ IX86_BUILTIN_MOVNTPS256,
IX86_BUILTIN_LOADDQU256,
IX86_BUILTIN_STOREDQU256,
IX86_BUILTIN_MASKLOADPD,
V2DF_FTYPE_PCV2DF_V2DF,
V2DI_FTYPE_PV2DI,
VOID_FTYPE_PV2SF_V4SF,
+ VOID_FTYPE_PV4DI_V4DI,
VOID_FTYPE_PV2DI_V2DI,
VOID_FTYPE_PCHAR_V32QI,
VOID_FTYPE_PCHAR_V16QI,
V2DI2TI_FTYPE_V2DI_V2DI_INT,
V1DI2DI_FTYPE_V1DI_V1DI_INT,
V2DF_FTYPE_V2DF_V2DF_INT,
- V8SF_FTYPE_V8SF_V8SF_V8SI_INT,
- V4DF_FTYPE_V4DF_V4DF_V4DI_INT,
- V4SF_FTYPE_V4SF_V4SF_V4SI_INT,
- V2DF_FTYPE_V2DF_V2DF_V2DI_INT,
V2DI_FTYPE_V2DI_UINT_UINT,
V2DI_FTYPE_V2DI_V2DI_UINT_UINT
};
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
+
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DF },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SF },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DF },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
- { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DI_INT },
- { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SI_INT },
- { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DI_INT },
- { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SI_INT },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
V4DF_type_node, V4DF_type_node,
integer_type_node,
NULL_TREE);
- tree v8sf_ftype_v8sf_v8sf_v8si_int
- = build_function_type_list (V8SF_type_node,
- V8SF_type_node, V8SF_type_node,
- V8SI_type_node, integer_type_node,
- NULL_TREE);
- tree v4df_ftype_v4df_v4df_v4di_int
- = build_function_type_list (V4DF_type_node,
- V4DF_type_node, V4DF_type_node,
- V4DI_type_node, integer_type_node,
- NULL_TREE);
- tree v4sf_ftype_v4sf_v4sf_v4si_int
- = build_function_type_list (V4SF_type_node,
- V4SF_type_node, V4SF_type_node,
- V4SI_type_node, integer_type_node,
- NULL_TREE);
- tree v2df_ftype_v2df_v2df_v2di_int
- = build_function_type_list (V2DF_type_node,
- V2DF_type_node, V2DF_type_node,
- V2DI_type_node, integer_type_node,
- NULL_TREE);
tree v8sf_ftype_pcfloat
= build_function_type_list (V8SF_type_node,
pcfloat_type_node,
V8SI_type_node, V4SI_type_node,
integer_type_node,
NULL_TREE);
+ tree pv4di_type_node = build_pointer_type (V4DI_type_node);
+ tree void_ftype_pv4di_v4di
+ = build_function_type_list (void_type_node,
+ pv4di_type_node, V4DI_type_node,
+ NULL_TREE);
tree v8sf_ftype_v8sf_v4sf_int
= build_function_type_list (V8SF_type_node,
V8SF_type_node, V4SF_type_node,
case VOID_FTYPE_PV2SF_V4SF:
type = void_ftype_pv2sf_v4sf;
break;
+ case VOID_FTYPE_PV4DI_V4DI:
+ type = void_ftype_pv4di_v4di;
+ break;
case VOID_FTYPE_PV2DI_V2DI:
type = void_ftype_pv2di_v2di;
break;
case V1DI2DI_FTYPE_V1DI_V1DI_INT:
type = v1di_ftype_v1di_v1di_int;
break;
- case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
- type = v8sf_ftype_v8sf_v8sf_v8si_int;
- break;
- case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
- type = v4df_ftype_v4df_v4df_v4di_int;
- break;
- case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
- type = v4sf_ftype_v4sf_v4sf_v4si_int;
- break;
- case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
- type = v2df_ftype_v2df_v2df_v2di_int;
- break;
default:
gcc_unreachable ();
}
nargs = 3;
nargs_constant = 2;
break;
- case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
- case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
- case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
- case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
- nargs = 4;
- nargs_constant = 1;
- break;
case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
nargs = 4;
nargs_constant = 2;
case CODE_FOR_sse4_1_blendpd:
case CODE_FOR_avx_vpermilv2df:
- case CODE_FOR_avx_vpermil2v2df3:
- case CODE_FOR_avx_vpermil2v4sf3:
- case CODE_FOR_avx_vpermil2v4df3:
- case CODE_FOR_avx_vpermil2v8sf3:
error ("the last argument must be a 2-bit immediate");
return const0_rtx;
memory = 0;
break;
case VOID_FTYPE_PV2SF_V4SF:
+ case VOID_FTYPE_PV4DI_V4DI:
case VOID_FTYPE_PV2DI_V2DI:
case VOID_FTYPE_PCHAR_V32QI:
case VOID_FTYPE_PCHAR_V16QI:
#undef TARGET_OPTION_CAN_INLINE_P
#define TARGET_OPTION_CAN_INLINE_P ix86_can_inline_p
+#undef TARGET_EXPAND_TO_RTL_HOOK
+#define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi
+
struct gcc_target targetm = TARGET_INITIALIZER;
\f
#include "gt-i386.h"