/* Subroutines used for code generation on IA-32.
- Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
- 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+ Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
+ 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
Free Software Foundation, Inc.
This file is part of GCC.
#include "tm-constrs.h"
#include "params.h"
#include "cselib.h"
+#include "debug.h"
+#include "dwarf2out.h"
-static int x86_builtin_vectorization_cost (bool);
static rtx legitimize_dllimport_symbol (rtx, bool);
#ifndef CHECK_STACK_LIMIT
m_AMD_MULTIPLE,
/* X86_TUNE_INTER_UNIT_MOVES */
- ~(m_AMD_MULTIPLE | m_ATOM | m_GENERIC),
+ ~(m_AMD_MULTIPLE | m_GENERIC),
/* X86_TUNE_INTER_UNIT_CONVERSIONS */
~(m_AMDFAM10),
static rtx (*ix86_gen_pop1) (rtx);
static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
-static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx);
+static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
rtx, rtx, int);
static void ix86_add_new_builtins (int);
+static rtx ix86_expand_vec_perm_builtin (tree);
enum ix86_function_specific_strings
{
static enum calling_abi ix86_function_abi (const_tree);
\f
+#ifndef SUBTARGET32_DEFAULT_CPU
+#define SUBTARGET32_DEFAULT_CPU "i386"
+#endif
+
/* The svr4 ABI for the i386 says that records and unions are returned
in memory. */
#ifndef DEFAULT_PCC_STRUCT_RETURN
}
}
\f
-/* Return a string the documents the current -m options. The caller is
+/* Return a string that documents the current -m options. The caller is
responsible for freeing the string. */
static char *
{
{ "-m64", OPTION_MASK_ISA_64BIT },
{ "-mfma4", OPTION_MASK_ISA_FMA4 },
+ { "-mfma", OPTION_MASK_ISA_FMA },
{ "-mxop", OPTION_MASK_ISA_XOP },
{ "-mlwp", OPTION_MASK_ISA_LWP },
{ "-msse4a", OPTION_MASK_ISA_SSE4A },
{
int i;
unsigned int ix86_arch_mask, ix86_tune_mask;
+ const bool ix86_tune_specified = (ix86_tune_string != NULL);
const char *prefix;
const char *suffix;
const char *sw;
|| !strcmp (ix86_tune_string, "generic64")))
;
else if (!strncmp (ix86_tune_string, "generic", 7))
- error ("bad value (%s) for %stune=%s %s",
+ error ("bad value (%s) for %stune=%s %s",
ix86_tune_string, prefix, suffix, sw);
+ else if (!strcmp (ix86_tune_string, "x86-64"))
+ warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated. Use "
+ "%stune=k8%s or %stune=generic%s instead as appropriate.",
+ prefix, suffix, prefix, suffix, prefix, suffix);
}
else
{
ix86_tune_string = "generic32";
}
}
+
if (ix86_stringop_string)
{
if (!strcmp (ix86_stringop_string, "rep_byte"))
error ("bad value (%s) for %sstringop-strategy=%s %s",
ix86_stringop_string, prefix, suffix, sw);
}
- if (!strcmp (ix86_tune_string, "x86-64"))
- warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated. Use "
- "%stune=k8%s or %stune=generic%s instead as appropriate.",
- prefix, suffix, prefix, suffix, prefix, suffix);
if (!ix86_arch_string)
- ix86_arch_string = TARGET_64BIT ? "x86-64" : "i386";
+ ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
else
ix86_arch_specified = 1;
- if (!strcmp (ix86_arch_string, "generic"))
- error ("generic CPU can be used only for %stune=%s %s",
- prefix, suffix, sw);
- if (!strncmp (ix86_arch_string, "generic", 7))
- error ("bad value (%s) for %sarch=%s %s",
- ix86_arch_string, prefix, suffix, sw);
-
/* Validate -mabi= value. */
if (ix86_abi_string)
{
break;
}
- if (i == pta_size)
+ if (!strcmp (ix86_arch_string, "generic"))
+ error ("generic CPU can be used only for %stune=%s %s",
+ prefix, suffix, sw);
+ else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
error ("bad value (%s) for %sarch=%s %s",
ix86_arch_string, prefix, suffix, sw);
x86_prefetch_sse = true;
break;
}
- if (i == pta_size)
+
+ if (ix86_tune_specified && i == pta_size)
error ("bad value (%s) for %stune=%s %s",
ix86_tune_string, prefix, suffix, sw);
ix86_tls_dialect = TLS_DIALECT_GNU;
else if (strcmp (ix86_tls_dialect_string, "gnu2") == 0)
ix86_tls_dialect = TLS_DIALECT_GNU2;
- else if (strcmp (ix86_tls_dialect_string, "sun") == 0)
- ix86_tls_dialect = TLS_DIALECT_SUN;
else
error ("bad value (%s) for %stls-dialect=%s %s",
ix86_tls_dialect_string, prefix, suffix, sw);
}
/* for V1xx modes, just use the base mode */
- if (VECTOR_MODE_P (mode) && mode != V1DImode
+ if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
&& GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
mode = GET_MODE_INNER (mode);
classes[0] = X86_64_SSE_CLASS;
classes[1] = X86_64_SSEUP_CLASS;
return 2;
+ case V1TImode:
case V1DImode:
case V2SFmode:
case V2SImode:
case V4HImode:
case V2SImode:
case V2SFmode:
+ case V1TImode:
case V1DImode:
if (!type || !AGGREGATE_TYPE_P (type))
{
case V4HImode:
case V2SImode:
case V2SFmode:
+ case V1TImode:
case V1DImode:
if (!type || !AGGREGATE_TYPE_P (type))
{
/* This function generates code for -fpic that loads %ebx with
the return address of the caller and then returns. */
-void
-ix86_file_end (void)
+static void
+ix86_code_end (void)
{
rtx xops[2];
int regno;
for (regno = 0; regno < 8; ++regno)
{
char name[32];
+ tree decl;
if (! ((pic_labels_used >> regno) & 1))
continue;
get_pc_thunk_name (name, regno);
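+      /* Build a FUNCTION_DECL for the thunk so that it can be run through
+	 the normal final_start_function/final_end_function path below,
+	 giving it unwind info and, where supported, its own comdat
+	 section.  */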
+ decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
+ get_identifier (name),
+ build_function_type (void_type_node, void_list_node));
+ DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
+ NULL_TREE, void_type_node);
+ TREE_PUBLIC (decl) = 1;
+ TREE_STATIC (decl) = 1;
+
#if TARGET_MACHO
if (TARGET_MACHO)
{
assemble_name (asm_out_file, name);
fputs ("\n", asm_out_file);
ASM_OUTPUT_LABEL (asm_out_file, name);
+ DECL_WEAK (decl) = 1;
}
else
#endif
if (USE_HIDDEN_LINKONCE)
{
- tree decl;
-
- decl = build_decl (BUILTINS_LOCATION,
- FUNCTION_DECL, get_identifier (name),
- error_mark_node);
- TREE_PUBLIC (decl) = 1;
- TREE_STATIC (decl) = 1;
DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
(*targetm.asm_out.unique_section) (decl, 0);
ASM_OUTPUT_LABEL (asm_out_file, name);
}
+ DECL_INITIAL (decl) = make_node (BLOCK);
+ current_function_decl = decl;
+ init_function_start (decl);
+ first_function_block_is_cold = false;
+ /* Make sure unwind info is emitted for the thunk if needed. */
+ final_start_function (emit_barrier (), asm_out_file, 1);
+
xops[0] = gen_rtx_REG (Pmode, regno);
xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
output_asm_insn ("ret", xops);
+ final_end_function ();
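+      /* Tear down the temporary function context set up above.  */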
+ init_insn_lengths ();
+ free_after_compilation (cfun);
+ set_cfun (NULL);
+ current_function_decl = NULL;
}
-
- if (NEED_INDICATE_EXEC_STACK)
- file_end_indicate_exec_stack ();
}
/* Emit code for the SET_GOT patterns. */
if (!flag_pic)
output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
else
- output_asm_insn ("call\t%a2", xops);
+ {
+ output_asm_insn ("call\t%a2", xops);
+#ifdef DWARF2_UNWIND_INFO
+	  /* The call to the next label acts as a push of the return
+	     address; record the 4-byte stack adjustment for unwind info.  */
+ if (dwarf2out_do_frame ())
+ {
+ rtx insn;
+ start_sequence ();
+ insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+ gen_rtx_PLUS (Pmode,
+ stack_pointer_rtx,
+ GEN_INT (-4))));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ dwarf2out_frame_debug (insn, true);
+ end_sequence ();
+ }
+#endif
+ }
#if TARGET_MACHO
/* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
if (flag_pic)
- output_asm_insn ("pop%z0\t%0", xops);
+ {
+ output_asm_insn ("pop%z0\t%0", xops);
+#ifdef DWARF2_UNWIND_INFO
+	  /* The pop adjusts the stack and clobbers dest, but does not
+	     restore dest for unwind info purposes.  */
+ if (dwarf2out_do_frame ())
+ {
+ rtx insn;
+ start_sequence ();
+ insn = emit_insn (gen_rtx_SET (VOIDmode, dest, const0_rtx));
+ dwarf2out_frame_debug (insn, true);
+ insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+ gen_rtx_PLUS (Pmode,
+ stack_pointer_rtx,
+ GEN_INT (4))));
+ RTX_FRAME_RELATED_P (insn) = 1;
+ dwarf2out_frame_debug (insn, true);
+ end_sequence ();
+ }
+#endif
+ }
}
else
{
get_pc_thunk_name (name, REGNO (dest));
pic_labels_used |= 1 << REGNO (dest);
+#ifdef DWARF2_UNWIND_INFO
+ /* Ensure all queued register saves are flushed before the
+ call. */
+ if (dwarf2out_do_frame ())
+ {
+ rtx insn;
+ start_sequence ();
+ insn = emit_barrier ();
+ end_sequence ();
+ dwarf2out_frame_debug (insn, false);
+ }
+#endif
xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
xops[2] = gen_rtx_MEM (QImode, xops[2]);
output_asm_insn ("call\t%X2", xops);
end_sequence ();
insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
- RTX_FRAME_RELATED_P (insn) = 1;
+ if (!optimize)
+ {
+ add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
+ RTX_FRAME_RELATED_P (insn) = 1;
+ }
return drap_vreg;
}
else
ix86_cfa_state->reg == stack_pointer_rtx);
else
{
- /* Only valid for Win32. */
rtx eax = gen_rtx_REG (Pmode, AX_REG);
bool eax_live;
rtx t;
- gcc_assert (!TARGET_64BIT || cfun->machine->call_abi == MS_ABI);
-
if (cfun->machine->call_abi == MS_ABI)
eax_live = false;
else
{
int regno;
rtx base_address = gen_rtx_MEM (TImode, pointer);
- rtx mem, insn;
+ rtx mem;
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
}
mem = adjust_address (base_address, TImode, offset);
set_mem_align (mem, 128);
- insn = emit_move_insn (reg, mem);
+ emit_move_insn (reg, mem);
offset += 16;
ix86_add_cfa_restore_note (NULL_RTX, reg, red_offset);
break;
case UNSPEC_GOTTPOFF:
/* FIXME: This might be @TPOFF in Sun ld too. */
- fputs ("@GOTTPOFF", file);
+ fputs ("@gottpoff", file);
break;
case UNSPEC_TPOFF:
- fputs ("@TPOFF", file);
+ fputs ("@tpoff", file);
break;
case UNSPEC_NTPOFF:
if (TARGET_64BIT)
- fputs ("@TPOFF", file);
+ fputs ("@tpoff", file);
else
- fputs ("@NTPOFF", file);
+ fputs ("@ntpoff", file);
break;
case UNSPEC_DTPOFF:
- fputs ("@DTPOFF", file);
+ fputs ("@dtpoff", file);
break;
case UNSPEC_GOTNTPOFF:
if (TARGET_64BIT)
fputs (ASSEMBLER_DIALECT == ASM_ATT ?
- "@GOTTPOFF(%rip)": "@GOTTPOFF[rip]", file);
+ "@gottpoff(%rip)": "@gottpoff[rip]", file);
else
- fputs ("@GOTNTPOFF", file);
+ fputs ("@gotntpoff", file);
break;
case UNSPEC_INDNTPOFF:
- fputs ("@INDNTPOFF", file);
+ fputs ("@indntpoff", file);
break;
#if TARGET_MACHO
case UNSPEC_MACHOPIC_OFFSET:
{
fputs (ASM_LONG, file);
output_addr_const (file, x);
- fputs ("@DTPOFF", file);
+ fputs ("@dtpoff", file);
switch (size)
{
case 4:
ix86_delegitimize_address (rtx x)
{
rtx orig_x = delegitimize_mem_from_attrs (x);
+  /* addend is NULL_RTX, or some rtx when x has the form something+GOTOFF
+     and something does not include the PIC register.  */
+ rtx addend = NULL_RTX;
/* reg_addend is NULL or a multiple of some register. */
rtx reg_addend = NULL_RTX;
/* const_addend is NULL or a const_int. */
else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
reg_addend = XEXP (reg_addend, 0);
else
- return orig_x;
- if (!REG_P (reg_addend)
- && GET_CODE (reg_addend) != MULT
- && GET_CODE (reg_addend) != ASHIFT)
- return orig_x;
+ {
+ reg_addend = NULL_RTX;
+ addend = XEXP (x, 0);
+ }
}
else
- return orig_x;
+ addend = XEXP (x, 0);
x = XEXP (XEXP (x, 1), 0);
if (GET_CODE (x) == PLUS
}
if (GET_CODE (x) == UNSPEC
- && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x))
+ && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
|| (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
result = XVECEXP (x, 0, 0);
result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
if (reg_addend)
result = gen_rtx_PLUS (Pmode, reg_addend, result);
+ if (addend)
+ {
+ /* If the rest of original X doesn't involve the PIC register, add
+ addend and subtract pic_offset_table_rtx. This can happen e.g.
+ for code like:
+ leal (%ebx, %ecx, 4), %ecx
+ ...
+ movl foo@GOTOFF(%ecx), %edx
+ in which case we return (%ecx - %ebx) + foo. */
+ if (pic_offset_table_rtx)
+ result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
+ pic_offset_table_rtx),
+ result);
+ else
+ return orig_x;
+ }
return result;
}
L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
C -- print opcode suffix for set/cmov insn.
c -- like C, but print reversed condition
- E,e -- likewise, but for compare-and-branch fused insn.
F,f -- likewise, but for floating-point.
O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
otherwise nothing
put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
return;
- case 'E':
- put_condition_code (GET_CODE (x), CCmode, 0, 0, file);
- return;
-
- case 'e':
- put_condition_code (GET_CODE (x), CCmode, 1, 0, file);
- return;
-
case 'H':
/* It doesn't actually matter what mode we use here, as we're
only going to use this for printing. */
case 2: size = "WORD"; break;
case 4: size = "DWORD"; break;
case 8: size = "QWORD"; break;
- case 12: size = "XWORD"; break;
+ case 12: size = "TBYTE"; break;
case 16:
if (GET_MODE (x) == XFmode)
- size = "XWORD";
+ size = "TBYTE";
else
size = "XMMWORD";
break;
+ case 32: size = "YMMWORD"; break;
default:
gcc_unreachable ();
}
case UNSPEC_GOTTPOFF:
output_addr_const (file, op);
/* FIXME: This might be @TPOFF in Sun ld. */
- fputs ("@GOTTPOFF", file);
+ fputs ("@gottpoff", file);
break;
case UNSPEC_TPOFF:
output_addr_const (file, op);
- fputs ("@TPOFF", file);
+ fputs ("@tpoff", file);
break;
case UNSPEC_NTPOFF:
output_addr_const (file, op);
if (TARGET_64BIT)
- fputs ("@TPOFF", file);
+ fputs ("@tpoff", file);
else
- fputs ("@NTPOFF", file);
+ fputs ("@ntpoff", file);
break;
case UNSPEC_DTPOFF:
output_addr_const (file, op);
- fputs ("@DTPOFF", file);
+ fputs ("@dtpoff", file);
break;
case UNSPEC_GOTNTPOFF:
output_addr_const (file, op);
if (TARGET_64BIT)
fputs (ASSEMBLER_DIALECT == ASM_ATT ?
- "@GOTTPOFF(%rip)" : "@GOTTPOFF[rip]", file);
+ "@gottpoff(%rip)" : "@gottpoff[rip]", file);
else
- fputs ("@GOTNTPOFF", file);
+ fputs ("@gotntpoff", file);
break;
case UNSPEC_INDNTPOFF:
output_addr_const (file, op);
- fputs ("@INDNTPOFF", file);
+ fputs ("@indntpoff", file);
break;
#if TARGET_MACHO
case UNSPEC_MACHOPIC_OFFSET:
return TRUE;
}
+/* Return TRUE if the operands to a vec_interleave_{high,low}v2df
+   are ok, keeping in mind the possible movddup alternative: two memory
+   inputs are usable only when they reference the same location and
+   SSE3's movddup is available.  */
+
+bool
+ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
+{
+ if (MEM_P (operands[0]))
+ return rtx_equal_p (operands[0], operands[1 + high]);
+ if (MEM_P (operands[1]) && MEM_P (operands[2]))
+ return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
+ return true;
+}
+
/* Post-reload splitter for converting an SF or DFmode value in an
SSE register into an unsigned SImode. */
exponents = validize_mem (force_const_mem (V4SImode, x));
/* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
- emit_insn (gen_sse2_punpckldq (int_xmm, int_xmm, exponents));
+ emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
/* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
else
{
x = copy_to_mode_reg (V2DFmode, fp_xmm);
- emit_insn (gen_sse2_unpckhpd (fp_xmm, fp_xmm, fp_xmm));
+ emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
}
enum rtx_code code = GET_CODE (operands[1]), compare_code;
rtx compare_seq, compare_op;
enum machine_mode mode = GET_MODE (operands[0]);
- bool sign_bit_compare_p = false;;
+ bool sign_bit_compare_p = false;
start_sequence ();
ix86_compare_op0 = XEXP (operands[1], 0);
if (!sign_bit_compare_p)
{
+ rtx flags;
bool fpcmp = false;
compare_code = GET_CODE (compare_op);
- if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
- || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
+ flags = XEXP (compare_op, 0);
+
+ if (GET_MODE (flags) == CCFPmode
+ || GET_MODE (flags) == CCFPUmode)
{
fpcmp = true;
- compare_code = ix86_fp_compare_code_to_integer (compare_code);
+ compare_code
+ = ix86_fp_compare_code_to_integer (compare_code);
}
/* To simplify rest of code, restrict to the GEU case. */
reverse_condition_maybe_unordered
(GET_CODE (compare_op)));
else
- PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
+ PUT_CODE (compare_op,
+ reverse_condition (GET_CODE (compare_op)));
}
diff = ct - cf;
tmp = gen_reg_rtx (mode);
if (mode == DImode)
- emit_insn (gen_x86_movdicc_0_m1 (tmp, compare_op));
+ emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
else
- emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
- compare_op));
+ emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
+ flags, compare_op));
}
else
{
/* XOP supports all of the comparisons on all vector int types. */
if (!TARGET_XOP)
{
- /* Canonicalize the comparison to EQ, GT, GTU. */
- switch (code)
- {
- case EQ:
- case GT:
- case GTU:
- break;
-
- case NE:
- case LE:
- case LEU:
- code = reverse_condition (code);
- negate = true;
- break;
-
- case GE:
- case GEU:
- code = reverse_condition (code);
- negate = true;
- /* FALLTHRU */
-
- case LT:
- case LTU:
- code = swap_condition (code);
- x = cop0, cop0 = cop1, cop1 = x;
- break;
-
- default:
- gcc_unreachable ();
- }
-
- /* Only SSE4.1/SSE4.2 supports V2DImode. */
- if (mode == V2DImode)
- {
+ /* Canonicalize the comparison to EQ, GT, GTU. */
switch (code)
{
case EQ:
- /* SSE4.1 supports EQ. */
- if (!TARGET_SSE4_1)
- return false;
- break;
-
case GT:
case GTU:
- /* SSE4.2 supports GT/GTU. */
- if (!TARGET_SSE4_2)
- return false;
+ break;
+
+ case NE:
+ case LE:
+ case LEU:
+ code = reverse_condition (code);
+ negate = true;
+ break;
+
+ case GE:
+ case GEU:
+ code = reverse_condition (code);
+ negate = true;
+ /* FALLTHRU */
+
+ case LT:
+ case LTU:
+ code = swap_condition (code);
+ x = cop0, cop0 = cop1, cop1 = x;
break;
default:
gcc_unreachable ();
}
- }
- /* Unsigned parallel compare is not supported by the hardware. Play some
- tricks to turn this into a signed comparison against 0. */
- if (code == GTU)
- {
- cop0 = force_reg (mode, cop0);
+ /* Only SSE4.1/SSE4.2 supports V2DImode. */
+ if (mode == V2DImode)
+ {
+ switch (code)
+ {
+ case EQ:
+ /* SSE4.1 supports EQ. */
+ if (!TARGET_SSE4_1)
+ return false;
+ break;
- switch (mode)
+ case GT:
+ case GTU:
+ /* SSE4.2 supports GT/GTU. */
+ if (!TARGET_SSE4_2)
+ return false;
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+ }
+
+ /* Unsigned parallel compare is not supported by the hardware.
+	     Play some tricks to turn this into either a signed comparison
+	     or an equality test against zero.  */
+ if (code == GTU)
{
- case V4SImode:
- case V2DImode:
- {
- rtx t1, t2, mask;
-
- /* Perform a parallel modulo subtraction. */
- t1 = gen_reg_rtx (mode);
- emit_insn ((mode == V4SImode
- ? gen_subv4si3
- : gen_subv2di3) (t1, cop0, cop1));
-
- /* Extract the original sign bit of op0. */
- mask = ix86_build_signbit_mask (GET_MODE_INNER (mode),
- true, false);
- t2 = gen_reg_rtx (mode);
- emit_insn ((mode == V4SImode
- ? gen_andv4si3
- : gen_andv2di3) (t2, cop0, mask));
-
- /* XOR it back into the result of the subtraction. This results
- in the sign bit set iff we saw unsigned underflow. */
- x = gen_reg_rtx (mode);
- emit_insn ((mode == V4SImode
- ? gen_xorv4si3
- : gen_xorv2di3) (x, t1, t2));
-
- code = GT;
- }
- break;
+ cop0 = force_reg (mode, cop0);
+
+ switch (mode)
+ {
+ case V4SImode:
+ case V2DImode:
+ {
+ rtx t1, t2, mask;
+ rtx (*gen_sub3) (rtx, rtx, rtx);
+
+ /* Subtract (-(INT MAX) - 1) from both operands to make
+ them signed. */
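+		/* (Subtracting the sign-bit constant flips each element's
+		   sign bit, so unsigned GTU on the original operands is
+		   signed GT on the biased ones.)  */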
+ mask = ix86_build_signbit_mask (GET_MODE_INNER (mode),
+ true, false);
+ gen_sub3 = (mode == V4SImode
+ ? gen_subv4si3 : gen_subv2di3);
+ t1 = gen_reg_rtx (mode);
+ emit_insn (gen_sub3 (t1, cop0, mask));
+
+ t2 = gen_reg_rtx (mode);
+ emit_insn (gen_sub3 (t2, cop1, mask));
+
+ cop0 = t1;
+ cop1 = t2;
+ code = GT;
+ }
+ break;
- case V16QImode:
- case V8HImode:
- /* Perform a parallel unsigned saturating subtraction. */
- x = gen_reg_rtx (mode);
- emit_insn (gen_rtx_SET (VOIDmode, x,
- gen_rtx_US_MINUS (mode, cop0, cop1)));
+ case V16QImode:
+ case V8HImode:
+ /* Perform a parallel unsigned saturating subtraction. */
+ x = gen_reg_rtx (mode);
+ emit_insn (gen_rtx_SET (VOIDmode, x,
+ gen_rtx_US_MINUS (mode, cop0, cop1)));
- code = EQ;
- negate = !negate;
- break;
+ cop0 = x;
+ cop1 = CONST0_RTX (mode);
+ code = EQ;
+ negate = !negate;
+ break;
- default:
- gcc_unreachable ();
+ default:
+ gcc_unreachable ();
+ }
}
-
- cop0 = x;
- cop1 = CONST0_RTX (mode);
- }
}
x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
{
/* Shift higher 8 bytes to lower 8 bytes. */
src = gen_reg_rtx (imode);
- emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, src),
- gen_lowpart (TImode, operands[1]),
- GEN_INT (64)));
+ emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, src),
+ gen_lowpart (V1TImode, operands[1]),
+ GEN_INT (64)));
}
else
src = operands[1];
ix86_expand_int_addcc (rtx operands[])
{
enum rtx_code code = GET_CODE (operands[1]);
- rtx (*insn)(rtx, rtx, rtx, rtx);
+ rtx flags;
+ rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
rtx compare_op;
rtx val = const0_rtx;
bool fpcmp = false;
- enum machine_mode mode = GET_MODE (operands[0]);
+ enum machine_mode mode;
ix86_compare_op0 = XEXP (operands[1], 0);
ix86_compare_op1 = XEXP (operands[1], 1);
return 0;
code = GET_CODE (compare_op);
- if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
- || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
+ flags = XEXP (compare_op, 0);
+
+ if (GET_MODE (flags) == CCFPmode
+ || GET_MODE (flags) == CCFPUmode)
{
fpcmp = true;
code = ix86_fp_compare_code_to_integer (code);
else
PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
}
- PUT_MODE (compare_op, mode);
+
+ mode = GET_MODE (operands[0]);
/* Construct either adc or sbb insn. */
if ((code == LTU) == (operands[3] == constm1_rtx))
{
- switch (GET_MODE (operands[0]))
+ switch (mode)
{
case QImode:
insn = gen_subqi3_carry;
}
else
{
- switch (GET_MODE (operands[0]))
+ switch (mode)
{
case QImode:
insn = gen_addqi3_carry;
gcc_unreachable ();
}
}
- emit_insn (insn (operands[0], operands[2], val, compare_op));
+ emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
return 1; /* DONE */
}
gen_rtx_IF_THEN_ELSE (Pmode, tmp,
reg2,
out)));
-
}
else
{
/* Avoid branch in fixing the byte. */
tmpreg = gen_lowpart (QImode, tmpreg);
emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
- cmp = gen_rtx_LTU (Pmode, gen_rtx_REG (CCmode, FLAGS_REG), const0_rtx);
- emit_insn ((*ix86_gen_sub3_carry) (out, out, GEN_INT (3), cmp));
+ tmp = gen_rtx_REG (CCmode, FLAGS_REG);
+ cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
+ emit_insn ((*ix86_gen_sub3_carry) (out, out, GEN_INT (3), tmp, cmp));
emit_label (end_0_label);
}
IX86_BUILTIN_EXTRACTF128SI256,
IX86_BUILTIN_VZEROALL,
IX86_BUILTIN_VZEROUPPER,
- IX86_BUILTIN_VZEROUPPER_REX64,
IX86_BUILTIN_VPERMILVARPD,
IX86_BUILTIN_VPERMILVARPS,
IX86_BUILTIN_VPERMILVARPD256,
IX86_BUILTIN_VPERMILPS,
IX86_BUILTIN_VPERMILPD256,
IX86_BUILTIN_VPERMILPS256,
+ IX86_BUILTIN_VPERMIL2PD,
+ IX86_BUILTIN_VPERMIL2PS,
+ IX86_BUILTIN_VPERMIL2PD256,
+ IX86_BUILTIN_VPERMIL2PS256,
IX86_BUILTIN_VPERM2F128PD256,
IX86_BUILTIN_VPERM2F128PS256,
IX86_BUILTIN_VPERM2F128SI256,
IX86_BUILTIN_CVTUDQ2PS,
+ IX86_BUILTIN_VEC_PERM_V2DF,
+ IX86_BUILTIN_VEC_PERM_V4SF,
+ IX86_BUILTIN_VEC_PERM_V2DI,
+ IX86_BUILTIN_VEC_PERM_V4SI,
+ IX86_BUILTIN_VEC_PERM_V8HI,
+ IX86_BUILTIN_VEC_PERM_V16QI,
+ IX86_BUILTIN_VEC_PERM_V2DI_U,
+ IX86_BUILTIN_VEC_PERM_V4SI_U,
+ IX86_BUILTIN_VEC_PERM_V8HI_U,
+ IX86_BUILTIN_VEC_PERM_V16QI_U,
+ IX86_BUILTIN_VEC_PERM_V4DF,
+ IX86_BUILTIN_VEC_PERM_V8SF,
+
/* FMA4 and XOP instructions. */
IX86_BUILTIN_VFMADDSS,
IX86_BUILTIN_VFMADDSD,
IX86_BUILTIN_VPCOMTRUEQ,
/* LWP instructions. */
- IX86_BUILTIN_LLWPCB16,
- IX86_BUILTIN_LLWPCB32,
- IX86_BUILTIN_LLWPCB64,
- IX86_BUILTIN_SLWPCB16,
- IX86_BUILTIN_SLWPCB32,
- IX86_BUILTIN_SLWPCB64,
- IX86_BUILTIN_LWPVAL16,
+ IX86_BUILTIN_LLWPCB,
+ IX86_BUILTIN_SLWPCB,
IX86_BUILTIN_LWPVAL32,
IX86_BUILTIN_LWPVAL64,
- IX86_BUILTIN_LWPINS16,
IX86_BUILTIN_LWPINS32,
IX86_BUILTIN_LWPINS64,
+ IX86_BUILTIN_CLZS,
+
IX86_BUILTIN_MAX
};
/* AVX */
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
- { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, 0, IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
- { OPTION_MASK_ISA_AVX | OPTION_MASK_ISA_64BIT, CODE_FOR_avx_vzeroupper_rex64, 0, IX86_BUILTIN_VZEROUPPER_REX64, UNKNOWN, (int) VOID_FTYPE_VOID },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
- { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastss, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
- { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastsd256, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
- { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastss256, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
- { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_pd256, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
- { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_ps256, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DF_V4DF },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SF_V8SF },
- { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcbhi1, "__builtin_ia32_llwpcb16", IX86_BUILTIN_LLWPCB16, UNKNOWN, (int) VOID_FTYPE_VOID },
- { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcbsi1, "__builtin_ia32_llwpcb32", IX86_BUILTIN_LLWPCB32, UNKNOWN, (int) VOID_FTYPE_VOID },
- { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcbdi1, "__builtin_ia32_llwpcb64", IX86_BUILTIN_LLWPCB64, UNKNOWN, (int) VOID_FTYPE_VOID },
-
- { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcbhi1, "__builtin_ia32_slwpcb16", IX86_BUILTIN_SLWPCB16, UNKNOWN, (int) VOID_FTYPE_VOID },
- { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcbsi1, "__builtin_ia32_slwpcb32", IX86_BUILTIN_SLWPCB32, UNKNOWN, (int) VOID_FTYPE_VOID },
- { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcbdi1, "__builtin_ia32_slwpcb64", IX86_BUILTIN_SLWPCB64, UNKNOWN, (int) VOID_FTYPE_VOID },
-
- { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalhi3, "__builtin_ia32_lwpval16", IX86_BUILTIN_LWPVAL16, UNKNOWN, (int) VOID_FTYPE_USHORT_UINT_USHORT },
- { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
- { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
- { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinshi3, "__builtin_ia32_lwpins16", IX86_BUILTIN_LWPINS16, UNKNOWN, (int) UCHAR_FTYPE_USHORT_UINT_USHORT },
- { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
- { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
+ { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
+ { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
+ { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
+ { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
+ { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
+ { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
};
{ OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
{ OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
{ OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
- { OPTION_MASK_ISA_SSE, CODE_FOR_sse_unpckhps, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
- { OPTION_MASK_ISA_SSE, CODE_FOR_sse_unpcklps, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+ { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+ { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
{ OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
{ OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
/* SSE2 */
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v2df", IX86_BUILTIN_VEC_PERM_V2DF, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DI },
+ { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4sf", IX86_BUILTIN_VEC_PERM_V4SF, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SI },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v2di", IX86_BUILTIN_VEC_PERM_V2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_V2DI },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4si", IX86_BUILTIN_VEC_PERM_V4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v8hi", IX86_BUILTIN_VEC_PERM_V8HI, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v16qi", IX86_BUILTIN_VEC_PERM_V16QI, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v2di_u", IX86_BUILTIN_VEC_PERM_V2DI_U, UNKNOWN, (int) V2UDI_FTYPE_V2UDI_V2UDI_V2UDI },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4si_u", IX86_BUILTIN_VEC_PERM_V4SI_U, UNKNOWN, (int) V4USI_FTYPE_V4USI_V4USI_V4USI },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v8hi_u", IX86_BUILTIN_VEC_PERM_V8HI_U, UNKNOWN, (int) V8UHI_FTYPE_V8UHI_V8UHI_V8UHI },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v16qi_u", IX86_BUILTIN_VEC_PERM_V16QI_U, UNKNOWN, (int) V16UQI_FTYPE_V16UQI_V16UQI_V16UQI },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4df", IX86_BUILTIN_VEC_PERM_V4DF, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DI },
+ { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v8sf", IX86_BUILTIN_VEC_PERM_V8SF, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SI },
+
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_unpckhpd_exp, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_unpcklpd_exp, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_punpckhbw, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_punpckhwd, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_punpckhdq, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_punpckhqdq, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_punpcklbw, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_punpcklwd, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_punpckldq, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_punpcklqdq, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
- { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
{ OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
+
+ { OPTION_MASK_ISA_ABM, CODE_FOR_clzhi2_abm, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
};
/* FMA4 and XOP. */
+#define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
+#define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
+#define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
+#define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
#define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
#define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
#define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
{ OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
{ OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
+ { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
+ { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
+ { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
+ { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
+
};
/* Set up all the MMX/SSE builtins, even builtins for instructions that are not
def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
- /* AVX */
- def_builtin (OPTION_MASK_ISA_AVX, "__builtin_ia32_vzeroupper",
- VOID_FTYPE_VOID,
- (TARGET_64BIT ? IX86_BUILTIN_VZEROUPPER_REX64
- : IX86_BUILTIN_VZEROUPPER));
-
/* MMX access to the vec_init patterns. */
def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
switch (m_type)
{
+ case MULTI_ARG_4_DF2_DI_I:
+ case MULTI_ARG_4_DF2_DI_I1:
+ case MULTI_ARG_4_SF2_SI_I:
+ case MULTI_ARG_4_SF2_SI_I1:
+ nargs = 4;
+ last_arg_constant = true;
+ break;
+
case MULTI_ARG_3_SF:
case MULTI_ARG_3_DF:
case MULTI_ARG_3_SF2:
pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
break;
+ case 4:
+ pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
+ break;
+
default:
gcc_unreachable ();
}
case FLOAT_FTYPE_FLOAT:
case INT_FTYPE_INT:
case UINT64_FTYPE_INT:
+ case UINT16_FTYPE_UINT16:
case INT64_FTYPE_INT64:
case INT64_FTYPE_V4SF:
case INT64_FTYPE_V2DF:
break;
case V2DI_FTYPE_V2DI_INT_CONVERT:
nargs = 2;
- rmode = V2DImode;
+ rmode = V1TImode;
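+      /* The shift pattern itself now operates on V1TImode; the builtin
+	 still takes and returns V2DImode.  */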
nargs_constant = 1;
break;
case V8HI_FTYPE_V8HI_INT:
nargs = 3;
nargs_constant = 2;
break;
+ case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
+ case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
+ case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
+ case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
+ nargs = 4;
+ nargs_constant = 1;
+ break;
case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
nargs = 4;
nargs_constant = 2;
case CODE_FOR_sse4_1_blendpd:
case CODE_FOR_avx_vpermilv2df:
+ case CODE_FOR_xop_vpermil2v2df3:
+ case CODE_FOR_xop_vpermil2v4sf3:
+ case CODE_FOR_xop_vpermil2v4df3:
+ case CODE_FOR_xop_vpermil2v8sf3:
error ("the last argument must be a 2-bit immediate");
return const0_rtx;
{
rtx op;
enum machine_mode mode;
- } args[2];
+ } args[3];
enum insn_code icode = d->icode;
bool last_arg_constant = false;
const struct insn_data *insn_p = &insn_data[icode];
case V4DF_FTYPE_PCV2DF:
case V4DF_FTYPE_PCDOUBLE:
case V2DF_FTYPE_PCDOUBLE:
+ case VOID_FTYPE_PVOID:
nargs = 1;
klass = load;
memory = 0;
/* Reserve memory operand for target. */
memory = ARRAY_SIZE (args);
break;
- case VOID_FTYPE_USHORT_UINT_USHORT:
case VOID_FTYPE_UINT_UINT_UINT:
case VOID_FTYPE_UINT64_UINT_UINT:
- case UCHAR_FTYPE_USHORT_UINT_USHORT:
case UCHAR_FTYPE_UINT_UINT_UINT:
case UCHAR_FTYPE_UINT64_UINT_UINT:
nargs = 3;
- klass = store;
- memory = 0;
+ klass = load;
+ memory = ARRAY_SIZE (args);
+ last_arg_constant = true;
break;
default:
gcc_unreachable ();
if (last_arg_constant && (i + 1) == nargs)
{
if (!match)
- switch (icode)
- {
- default:
+ {
+ if (icode == CODE_FOR_lwp_lwpvalsi3
+ || icode == CODE_FOR_lwp_lwpinssi3
+ || icode == CODE_FOR_lwp_lwpvaldi3
+ || icode == CODE_FOR_lwp_lwpinsdi3)
+ error ("the last argument must be a 32-bit immediate");
+ else
error ("the last argument must be an 8-bit immediate");
- return const0_rtx;
- }
+ return const0_rtx;
+ }
}
else
{
case 2:
pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
break;
+ case 3:
+ pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
+ break;
default:
gcc_unreachable ();
}
case IX86_BUILTIN_VEC_SET_V16QI:
return ix86_expand_vec_set_builtin (exp);
+ case IX86_BUILTIN_VEC_PERM_V2DF:
+ case IX86_BUILTIN_VEC_PERM_V4SF:
+ case IX86_BUILTIN_VEC_PERM_V2DI:
+ case IX86_BUILTIN_VEC_PERM_V4SI:
+ case IX86_BUILTIN_VEC_PERM_V8HI:
+ case IX86_BUILTIN_VEC_PERM_V16QI:
+ case IX86_BUILTIN_VEC_PERM_V2DI_U:
+ case IX86_BUILTIN_VEC_PERM_V4SI_U:
+ case IX86_BUILTIN_VEC_PERM_V8HI_U:
+ case IX86_BUILTIN_VEC_PERM_V16QI_U:
+ case IX86_BUILTIN_VEC_PERM_V4DF:
+ case IX86_BUILTIN_VEC_PERM_V8SF:
+ return ix86_expand_vec_perm_builtin (exp);
+
case IX86_BUILTIN_INFQ:
case IX86_BUILTIN_HUGE_VALQ:
{
return target;
}
+ case IX86_BUILTIN_LLWPCB:
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ op0 = expand_normal (arg0);
+ icode = CODE_FOR_lwp_llwpcb;
+ if (! (*insn_data[icode].operand[0].predicate) (op0, Pmode))
+ op0 = copy_to_mode_reg (Pmode, op0);
+ emit_insn (gen_lwp_llwpcb (op0));
+ return 0;
+
+ case IX86_BUILTIN_SLWPCB:
+ icode = CODE_FOR_lwp_slwpcb;
+ if (!target
+ || ! (*insn_data[icode].operand[0].predicate) (target, Pmode))
+ target = gen_reg_rtx (Pmode);
+ emit_insn (gen_lwp_slwpcb (target));
+ return target;
+
default:
break;
}
if it is not available. */
static tree
-ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
+ix86_builtin_vectorized_function (tree fndecl, tree type_out,
tree type_in)
{
enum machine_mode in_mode, out_mode;
int in_n, out_n;
+ enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
if (TREE_CODE (type_out) != VECTOR_TYPE
- || TREE_CODE (type_in) != VECTOR_TYPE)
+ || TREE_CODE (type_in) != VECTOR_TYPE
+ || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
return NULL_TREE;
out_mode = TYPE_MODE (TREE_TYPE (type_out));
/* Make sure success has a non-zero value by adding one. */
return mask + 1;
}
+
+/* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
+ the expansion functions to turn the parallel back into a mask.
+   The return value is 0 for no match and imm8 + 1 for a match.
+   E.g. for V8SFmode the parallel [4 5 6 7 8 9 10 11] selects the high
+   half of operand 0 and the low half of operand 1, i.e. imm8 0x21,
+   so the return value is 0x22.  */
+
+int
+avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
+{
+ unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
+ unsigned mask = 0;
+ unsigned char ipar[8];
+
+ if (XVECLEN (par, 0) != (int) nelt)
+ return 0;
+
+ /* Validate that all of the elements are constants, and not totally
+ out of range. Copy the data into an integral array to make the
+ subsequent checks easier. */
+ for (i = 0; i < nelt; ++i)
+ {
+ rtx er = XVECEXP (par, 0, i);
+ unsigned HOST_WIDE_INT ei;
+
+ if (!CONST_INT_P (er))
+ return 0;
+ ei = INTVAL (er);
+ if (ei >= 2 * nelt)
+ return 0;
+ ipar[i] = ei;
+ }
+
+  /* Validate that each half of the permute selects a run of
+     consecutive elements.  */
+ for (i = 0; i < nelt2 - 1; ++i)
+ if (ipar[i] + 1 != ipar[i + 1])
+ return 0;
+ for (i = nelt2; i < nelt - 1; ++i)
+ if (ipar[i] + 1 != ipar[i + 1])
+ return 0;
+
+  /* Reconstruct the mask: each imm8 nibble selects which half of the
+     two-operand concatenation feeds the corresponding half of the
+     result.  */
+ for (i = 0; i < 2; ++i)
+ {
+ unsigned e = ipar[i * nelt2];
+ if (e % nelt2)
+ return 0;
+ e /= nelt2;
+ mask |= e << (i * 4);
+ }
+
+ /* Make sure success has a non-zero value by adding one. */
+ return mask + 1;
+}
\f
/* Store OPERAND to the memory after reload is completed. This means
*total = 0;
return false;
+ case VEC_SELECT:
+ case VEC_CONCAT:
+ case VEC_MERGE:
+ case VEC_DUPLICATE:
+ /* ??? Assume all of these vector manipulation patterns are
+	 recognizable, in which case they all pretty much have the
+ same cost. */
+ *total = COSTS_N_INSNS (1);
+ return true;
+
default:
return false;
}
fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
fprintf (file, ASM_LONG "%s\n", binder_name);
}
-
-void
-darwin_x86_file_end (void)
-{
- darwin_file_end ();
- ix86_file_end ();
-}
#endif /* TARGET_MACHO */
/* Order the registers for register allocator. */
*(*this + vcall_offset) should be added to THIS. */
static void
-x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED,
+x86_output_mi_thunk (FILE *file,
tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
HOST_WIDE_INT vcall_offset, tree function)
{
rtx this_param = x86_this_parameter (function);
rtx this_reg, tmp;
+ /* Make sure unwind info is emitted for the thunk if needed. */
+ final_start_function (emit_barrier (), file, 1);
+
/* If VCALL_OFFSET, we'll need THIS in a register. Might as well
pull it in now and let DELTA benefit. */
if (REG_P (this_param))
xops[0] = tmp;
xops[1] = this_param;
}
- output_asm_insn ("add{q}\t{%0, %1|%1, %0}", xops);
+ if (x86_maybe_negate_const_int (&xops[0], DImode))
+ output_asm_insn ("sub{q}\t{%0, %1|%1, %0}", xops);
+ else
+ output_asm_insn ("add{q}\t{%0, %1|%1, %0}", xops);
}
+ else if (x86_maybe_negate_const_int (&xops[0], SImode))
+ output_asm_insn ("sub{l}\t{%0, %1|%1, %0}", xops);
else
output_asm_insn ("add{l}\t{%0, %1|%1, %0}", xops);
}
output_asm_insn ("jmp\t{*}%1", xops);
}
}
+ final_end_function ();
}
static void
if (TARGET_64BIT)
{
#ifndef NO_PROFILE_COUNTERS
- fprintf (file, "\tleaq\t" LPREFIX "P%d@(%%rip),%%r11\n", labelno);
+ fprintf (file, "\tleaq\t" LPREFIX "P%d(%%rip),%%r11\n", labelno);
#endif
if (DEFAULT_ABI == SYSV_ABI && flag_pic)
extended_reg_mentioned_1, NULL);
}
-/* Generate an unsigned DImode/SImode to FP conversion. This is the same code
- optabs would emit if we didn't have TFmode patterns. */
-
-void
-x86_emit_floatuns (rtx operands[2])
+/* If profitable, negate (without causing overflow) the integer constant
+ of mode MODE at location LOC. Return true in this case. */
+bool
+x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
{
- rtx neglab, donelab, i0, i1, f0, in, out;
- enum machine_mode mode, inmode;
-
- inmode = GET_MODE (operands[1]);
- gcc_assert (inmode == SImode || inmode == DImode);
+ HOST_WIDE_INT val;
- out = operands[0];
- in = force_reg (inmode, operands[1]);
- mode = GET_MODE (out);
- neglab = gen_label_rtx ();
- donelab = gen_label_rtx ();
- f0 = gen_reg_rtx (mode);
+ if (!CONST_INT_P (*loc))
+ return false;
- emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
+ switch (mode)
+ {
+ case DImode:
+ /* DImode x86_64 constants must fit in 32 bits. */
+ gcc_assert (x86_64_immediate_operand (*loc, mode));
+
+ mode = SImode;
+ break;
+
+ case SImode:
+ case HImode:
+ case QImode:
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ /* Avoid overflows. */
+ if (mode_signbit_p (mode, *loc))
+ return false;
+
+ val = INTVAL (*loc);
+
+ /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
+     Exceptions: -128 encodes in fewer bytes than 128, so keep add $-128
+     as-is and turn add $128 into sub $-128.  */
+ if ((val < 0 && val != -128)
+ || val == 128)
+ {
+ *loc = GEN_INT (-val);
+ return true;
+ }
+
+ return false;
+}
+
+/* Generate an unsigned DImode/SImode to FP conversion. This is the same code
+ optabs would emit if we didn't have TFmode patterns. */
+
+void
+x86_emit_floatuns (rtx operands[2])
+{
+ rtx neglab, donelab, i0, i1, f0, in, out;
+ enum machine_mode mode, inmode;
+
+ inmode = GET_MODE (operands[1]);
+ gcc_assert (inmode == SImode || inmode == DImode);
+
+ out = operands[0];
+ in = force_reg (inmode, operands[1]);
+ mode = GET_MODE (out);
+ neglab = gen_label_rtx ();
+ donelab = gen_label_rtx ();
+ f0 = gen_reg_rtx (mode);
+
+ emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
expand_float (out, in, 0);
emit_label (donelab);
}
\f
+/* AVX does not support 32-byte integer vector operations,
+ thus the longest vector we are faced with is V16QImode. */
+#define MAX_VECT_LEN 16
+
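+/* Everything we need to know to expand one constant permutation:
+   the destination, the source operands (which may be equal), the
+   per-element selector, and whether we are only testing that the
+   permutation is expandable (testing_p) rather than emitting it.  */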
+struct expand_vec_perm_d
+{
+ rtx target, op0, op1;
+ unsigned char perm[MAX_VECT_LEN];
+ enum machine_mode vmode;
+ unsigned char nelt;
+ bool testing_p;
+};
+
+static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
+static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
+
+/* Get a vector mode of the same size as the original but with elements
+   twice as wide (e.g. V16QImode -> V8HImode).  This is only guaranteed
+   to apply to integral vectors.  */
+
+static inline enum machine_mode
+get_mode_wider_vector (enum machine_mode o)
+{
+ /* ??? Rely on the ordering that genmodes.c gives to vectors. */
+ enum machine_mode n = GET_MODE_WIDER_MODE (o);
+ gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
+ gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
+ return n;
+}
+
/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
with all elements equal to VAR. Return true if successful. */
ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
rtx target, rtx val)
{
- enum machine_mode hmode, smode, wsmode, wvmode;
- rtx x;
+ bool ok;
switch (mode)
{
return false;
/* FALLTHRU */
+ case V4DFmode:
+ case V4DImode:
+ case V8SFmode:
+ case V8SImode:
case V2DFmode:
case V2DImode:
case V4SFmode:
case V4SImode:
- val = force_reg (GET_MODE_INNER (mode), val);
- x = gen_rtx_VEC_DUPLICATE (mode, val);
- emit_insn (gen_rtx_SET (VOIDmode, target, x));
+ {
+ rtx insn, dup;
+
+ /* First attempt to recognize VAL as-is. */
+ dup = gen_rtx_VEC_DUPLICATE (mode, val);
+ insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
+ if (recog_memoized (insn) < 0)
+ {
+ rtx seq;
+ /* If that fails, force VAL into a register. */
+
+ start_sequence ();
+ XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
+ seq = get_insns ();
+ end_sequence ();
+ if (seq)
+ emit_insn_before (seq, insn);
+
+ ok = recog_memoized (insn) >= 0;
+ gcc_assert (ok);
+ }
+ }
return true;
case V4HImode:
return false;
if (TARGET_SSE || TARGET_3DNOW_A)
{
+ rtx x;
+
val = gen_lowpart (SImode, val);
x = gen_rtx_TRUNCATE (HImode, val);
x = gen_rtx_VEC_DUPLICATE (mode, x);
emit_insn (gen_rtx_SET (VOIDmode, target, x));
return true;
}
- else
- {
- smode = HImode;
- wsmode = SImode;
- wvmode = V2SImode;
- goto widen;
- }
+ goto widen;
case V8QImode:
if (!mmx_ok)
return false;
- smode = QImode;
- wsmode = HImode;
- wvmode = V4HImode;
goto widen;
+
case V8HImode:
if (TARGET_SSE2)
{
+ struct expand_vec_perm_d dperm;
rtx tmp1, tmp2;
- /* Extend HImode to SImode using a paradoxical SUBREG. */
+
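+	  /* Broadcast element 0 via a constant permutation: the memset
+	     below leaves dperm.perm all zeros, so every output element
+	     selects input element 0.  V16QImode jumps here as well.  */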
+ permute:
+ memset (&dperm, 0, sizeof (dperm));
+ dperm.target = target;
+ dperm.vmode = mode;
+ dperm.nelt = GET_MODE_NUNITS (mode);
+ dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
+
+ /* Extend to SImode using a paradoxical SUBREG. */
tmp1 = gen_reg_rtx (SImode);
emit_move_insn (tmp1, gen_lowpart (SImode, val));
- /* Insert the SImode value as low element of V4SImode vector. */
- tmp2 = gen_reg_rtx (V4SImode);
- tmp1 = gen_rtx_VEC_MERGE (V4SImode,
- gen_rtx_VEC_DUPLICATE (V4SImode, tmp1),
- CONST0_RTX (V4SImode),
- const1_rtx);
- emit_insn (gen_rtx_SET (VOIDmode, tmp2, tmp1));
- /* Cast the V4SImode vector back to a V8HImode vector. */
- tmp1 = gen_reg_rtx (V8HImode);
- emit_move_insn (tmp1, gen_lowpart (V8HImode, tmp2));
- /* Duplicate the low short through the whole low SImode word. */
- emit_insn (gen_sse2_punpcklwd (tmp1, tmp1, tmp1));
- /* Cast the V8HImode vector back to a V4SImode vector. */
- tmp2 = gen_reg_rtx (V4SImode);
- emit_move_insn (tmp2, gen_lowpart (V4SImode, tmp1));
- /* Replicate the low element of the V4SImode vector. */
- emit_insn (gen_sse2_pshufd (tmp2, tmp2, const0_rtx));
- /* Cast the V2SImode back to V8HImode, and store in target. */
- emit_move_insn (target, gen_lowpart (V8HImode, tmp2));
- return true;
+
+ /* Insert the SImode value as low element of a V4SImode vector. */
+ tmp2 = gen_lowpart (V4SImode, dperm.op0);
+ emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
+
+ ok = (expand_vec_perm_1 (&dperm)
+ || expand_vec_perm_broadcast_1 (&dperm));
+ gcc_assert (ok);
+ return ok;
}
- smode = HImode;
- wsmode = SImode;
- wvmode = V4SImode;
goto widen;
+
case V16QImode:
if (TARGET_SSE2)
- {
- rtx tmp1, tmp2;
- /* Extend QImode to SImode using a paradoxical SUBREG. */
- tmp1 = gen_reg_rtx (SImode);
- emit_move_insn (tmp1, gen_lowpart (SImode, val));
- /* Insert the SImode value as low element of V4SImode vector. */
- tmp2 = gen_reg_rtx (V4SImode);
- tmp1 = gen_rtx_VEC_MERGE (V4SImode,
- gen_rtx_VEC_DUPLICATE (V4SImode, tmp1),
- CONST0_RTX (V4SImode),
- const1_rtx);
- emit_insn (gen_rtx_SET (VOIDmode, tmp2, tmp1));
- /* Cast the V4SImode vector back to a V16QImode vector. */
- tmp1 = gen_reg_rtx (V16QImode);
- emit_move_insn (tmp1, gen_lowpart (V16QImode, tmp2));
- /* Duplicate the low byte through the whole low SImode word. */
- emit_insn (gen_sse2_punpcklbw (tmp1, tmp1, tmp1));
- emit_insn (gen_sse2_punpcklbw (tmp1, tmp1, tmp1));
- /* Cast the V16QImode vector back to a V4SImode vector. */
- tmp2 = gen_reg_rtx (V4SImode);
- emit_move_insn (tmp2, gen_lowpart (V4SImode, tmp1));
- /* Replicate the low element of the V4SImode vector. */
- emit_insn (gen_sse2_pshufd (tmp2, tmp2, const0_rtx));
- /* Cast the V2SImode back to V16QImode, and store in target. */
- emit_move_insn (target, gen_lowpart (V16QImode, tmp2));
- return true;
- }
- smode = QImode;
- wsmode = HImode;
- wvmode = V8HImode;
+ goto permute;
goto widen;
+
widen:
/* Replicate the value once into the next wider mode and recurse. */
- val = convert_modes (wsmode, smode, val, true);
- x = expand_simple_binop (wsmode, ASHIFT, val,
- GEN_INT (GET_MODE_BITSIZE (smode)),
- NULL_RTX, 1, OPTAB_LIB_WIDEN);
- val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
-
- x = gen_reg_rtx (wvmode);
- if (!ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val))
- gcc_unreachable ();
- emit_move_insn (target, gen_lowpart (mode, x));
- return true;
+ {
+ enum machine_mode smode, wsmode, wvmode;
+ rtx x;
+
+ smode = GET_MODE_INNER (mode);
+ wvmode = get_mode_wider_vector (mode);
+ wsmode = GET_MODE_INNER (wvmode);
+
+ val = convert_modes (wsmode, smode, val, true);
+ x = expand_simple_binop (wsmode, ASHIFT, val,
+ GEN_INT (GET_MODE_BITSIZE (smode)),
+ NULL_RTX, 1, OPTAB_LIB_WIDEN);
+ val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
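+
+ /* e.g. for V4HImode this zero-extends the HImode VAL to SImode and
+ doubles it up (VAL | VAL << 16); the recursive call below then
+ duplicates that SImode value across V2SImode.  */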
+
+ x = gen_lowpart (wvmode, target);
+ ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
+ gcc_assert (ok);
+ return ok;
+ }
- case V4DFmode:
- hmode = V2DFmode;
- goto half;
- case V4DImode:
- hmode = V2DImode;
- goto half;
- case V8SFmode:
- hmode = V4SFmode;
- goto half;
- case V8SImode:
- hmode = V4SImode;
- goto half;
case V16HImode:
- hmode = V8HImode;
- goto half;
case V32QImode:
- hmode = V16QImode;
- goto half;
-half:
{
- rtx tmp = gen_reg_rtx (hmode);
- ix86_expand_vector_init_duplicate (mmx_ok, hmode, tmp, val);
- emit_insn (gen_rtx_SET (VOIDmode, target,
- gen_rtx_VEC_CONCAT (mode, tmp, tmp)));
+ enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
+ rtx x = gen_reg_rtx (hvmode);
+
+ ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
+ gcc_assert (ok);
+
+ x = gen_rtx_VEC_CONCAT (mode, x, x);
+ emit_insn (gen_rtx_SET (VOIDmode, target, x));
}
return true;
/* tmp = target = A B C D */
tmp = copy_to_reg (target);
/* target = A A B B */
- emit_insn (gen_sse_unpcklps (target, target, target));
+ emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
/* target = X A B B */
ix86_expand_vector_set (false, target, val, 0);
/* target = A X C D */
case 2:
tmp = gen_reg_rtx (mode);
- emit_insn (gen_sse_unpckhps (tmp, vec, vec));
+ emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
break;
default:
case 2:
tmp = gen_reg_rtx (mode);
- emit_insn (gen_sse2_punpckhdq (tmp, vec, vec));
+ emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
break;
default:
emit_move_insn (operand0, res);
}
\f
-/* Validate whether a FMA4 instruction is valid or not.
- OPERANDS is the array of operands.
- NUM is the number of operands.
- USES_OC0 is true if the instruction uses OC0 and provides 4 variants.
- NUM_MEMORY is the maximum number of memory operands to accept.
- NUM_MEMORY less than zero is a special case to allow an operand
- of an instruction to be memory operation.
- when COMMUTATIVE is set, operand 1 and 2 can be swapped. */
-
-bool
-ix86_fma4_valid_op_p (rtx operands[], rtx insn ATTRIBUTE_UNUSED, int num,
- bool uses_oc0, int num_memory, bool commutative)
-{
- int mem_mask;
- int mem_count;
- int i;
-
- /* Count the number of memory arguments */
- mem_mask = 0;
- mem_count = 0;
- for (i = 0; i < num; i++)
- {
- enum machine_mode mode = GET_MODE (operands[i]);
- if (register_operand (operands[i], mode))
- ;
-
- else if (memory_operand (operands[i], mode))
- {
- mem_mask |= (1 << i);
- mem_count++;
- }
-
- else
- {
- rtx pattern = PATTERN (insn);
-
- /* allow 0 for pcmov */
- if (GET_CODE (pattern) != SET
- || GET_CODE (SET_SRC (pattern)) != IF_THEN_ELSE
- || i < 2
- || operands[i] != CONST0_RTX (mode))
- return false;
- }
- }
-
- /* Special case pmacsdq{l,h} where we allow the 3rd argument to be
- a memory operation. */
- if (num_memory < 0)
- {
- num_memory = -num_memory;
- if ((mem_mask & (1 << (num-1))) != 0)
- {
- mem_mask &= ~(1 << (num-1));
- mem_count--;
- }
- }
-
- /* If there were no memory operations, allow the insn */
- if (mem_mask == 0)
- return true;
-
- /* Do not allow the destination register to be a memory operand. */
- else if (mem_mask & (1 << 0))
- return false;
-
- /* If there are too many memory operations, disallow the instruction. While
- the hardware only allows 1 memory reference, before register allocation
- for some insns, we allow two memory operations sometimes in order to allow
- code like the following to be optimized:
-
- float fmadd (float *a, float *b, float *c) { return (*a * *b) + *c; }
-
- or similar cases that are vectorized into using the vfmaddss
- instruction. */
- else if (mem_count > num_memory)
- return false;
-
- /* Don't allow more than one memory operation if not optimizing. */
- else if (mem_count > 1 && !optimize)
- return false;
-
- else if (num == 4 && mem_count == 1)
- {
- /* formats (destination is the first argument), example vfmaddss:
- xmm1, xmm1, xmm2, xmm3/mem
- xmm1, xmm1, xmm2/mem, xmm3
- xmm1, xmm2, xmm3/mem, xmm1
- xmm1, xmm2/mem, xmm3, xmm1 */
- if (uses_oc0)
- return ((mem_mask == (1 << 1))
- || (mem_mask == (1 << 2))
- || (mem_mask == (1 << 3)));
-
- /* format, example vpmacsdd:
- xmm1, xmm2, xmm3/mem, xmm1 */
- if (commutative)
- return (mem_mask == (1 << 2) || mem_mask == (1 << 1));
- else
- return (mem_mask == (1 << 2));
- }
-
- else if (num == 4 && num_memory == 2)
- {
- /* If there are two memory operations, we can load one of the memory ops
- into the destination register. This is for optimizing the
- multiply/add ops, which the combiner has optimized both the multiply
- and the add insns to have a memory operation. We have to be careful
- that the destination doesn't overlap with the inputs. */
- rtx op0 = operands[0];
-
- if (reg_mentioned_p (op0, operands[1])
- || reg_mentioned_p (op0, operands[2])
- || reg_mentioned_p (op0, operands[3]))
- return false;
-
- /* formats (destination is the first argument), example vfmaddss:
- xmm1, xmm1, xmm2, xmm3/mem
- xmm1, xmm1, xmm2/mem, xmm3
- xmm1, xmm2, xmm3/mem, xmm1
- xmm1, xmm2/mem, xmm3, xmm1
-
- For the oc0 case, we will load either operands[1] or operands[3] into
- operands[0], so any combination of 2 memory operands is ok. */
- if (uses_oc0)
- return true;
-
- /* format, example vpmacsdd:
- xmm1, xmm2, xmm3/mem, xmm1
-
- For the integer multiply/add instructions be more restrictive and
- require operands[2] and operands[3] to be the memory operands. */
- if (commutative)
- return (mem_mask == ((1 << 1) | (1 << 3)) || ((1 << 2) | (1 << 3)));
- else
- return (mem_mask == ((1 << 2) | (1 << 3)));
- }
-
- else if (num == 3 && num_memory == 1)
- {
- /* formats, example vprotb:
- xmm1, xmm2, xmm3/mem
- xmm1, xmm2/mem, xmm3 */
- if (uses_oc0)
- return ((mem_mask == (1 << 1)) || (mem_mask == (1 << 2)));
-
- /* format, example vpcomeq:
- xmm1, xmm2, xmm3/mem */
- else
- return (mem_mask == (1 << 2));
- }
-
- else
- gcc_unreachable ();
-
- return false;
-}
-
-
-/* Fixup an FMA4 instruction that has 2 memory input references into a form the
- hardware will allow by using the destination register to load one of the
- memory operations. Presently this is used by the multiply/add routines to
- allow 2 memory references. */
-
-void
-ix86_expand_fma4_multiple_memory (rtx operands[],
- int num,
- enum machine_mode mode)
-{
- rtx op0 = operands[0];
- if (num != 4
- || memory_operand (op0, mode)
- || reg_mentioned_p (op0, operands[1])
- || reg_mentioned_p (op0, operands[2])
- || reg_mentioned_p (op0, operands[3]))
- gcc_unreachable ();
-
- /* For 2 memory operands, pick either operands[1] or operands[3] to move into
- the destination register. */
- if (memory_operand (operands[1], mode))
- {
- emit_move_insn (op0, operands[1]);
- operands[1] = op0;
- }
- else if (memory_operand (operands[3], mode))
- {
- emit_move_insn (op0, operands[3]);
- operands[3] = op0;
- }
- else
- gcc_unreachable ();
-
- return;
-}
/* Table of valid machine attributes. */
static const struct attribute_spec ix86_attribute_table[] =
/* Implement targetm.vectorize.builtin_vectorization_cost. */
static int
-x86_builtin_vectorization_cost (bool runtime_test)
+ix86_builtin_vectorization_cost (bool runtime_test)
{
/* If the branch of the runtime test is taken - i.e. - the vectorized
version is skipped - this incurs a misprediction cost (because the
return 0;
}
-/* This function returns the calling abi specific va_list type node.
- It returns the FNDECL specific va_list type. */
+/* Implement targetm.vectorize.builtin_vec_perm. */
-tree
-ix86_fn_abi_va_list (tree fndecl)
+static tree
+ix86_vectorize_builtin_vec_perm (tree vec_type, tree *mask_type)
{
- if (!TARGET_64BIT)
- return va_list_type_node;
- gcc_assert (fndecl != NULL_TREE);
+ tree itype = TREE_TYPE (vec_type);
+ bool u = TYPE_UNSIGNED (itype);
+ enum machine_mode vmode = TYPE_MODE (vec_type);
+ enum ix86_builtins fcode = fcode; /* Silence bogus warning. */
+ bool ok = TARGET_SSE2;
- if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
- return ms_va_list_type_node;
- else
- return sysv_va_list_type_node;
+ switch (vmode)
+ {
+ case V4DFmode:
+ ok = TARGET_AVX;
+ fcode = IX86_BUILTIN_VEC_PERM_V4DF;
+ goto get_di;
+ case V2DFmode:
+ fcode = IX86_BUILTIN_VEC_PERM_V2DF;
+ get_di:
+ itype = ix86_get_builtin_type (IX86_BT_DI);
+ break;
+
+ case V8SFmode:
+ ok = TARGET_AVX;
+ fcode = IX86_BUILTIN_VEC_PERM_V8SF;
+ goto get_si;
+ case V4SFmode:
+ ok = TARGET_SSE;
+ fcode = IX86_BUILTIN_VEC_PERM_V4SF;
+ get_si:
+ itype = ix86_get_builtin_type (IX86_BT_SI);
+ break;
+
+ case V2DImode:
+ fcode = u ? IX86_BUILTIN_VEC_PERM_V2DI_U : IX86_BUILTIN_VEC_PERM_V2DI;
+ break;
+ case V4SImode:
+ fcode = u ? IX86_BUILTIN_VEC_PERM_V4SI_U : IX86_BUILTIN_VEC_PERM_V4SI;
+ break;
+ case V8HImode:
+ fcode = u ? IX86_BUILTIN_VEC_PERM_V8HI_U : IX86_BUILTIN_VEC_PERM_V8HI;
+ break;
+ case V16QImode:
+ fcode = u ? IX86_BUILTIN_VEC_PERM_V16QI_U : IX86_BUILTIN_VEC_PERM_V16QI;
+ break;
+ default:
+ ok = false;
+ break;
+ }
+
+ if (!ok)
+ return NULL_TREE;
+
+ *mask_type = itype;
+ return ix86_builtins[(int) fcode];
}
-/* Returns the canonical va_list type specified by TYPE. If there
- is no valid TYPE provided, it return NULL_TREE. */
+/* Return a vector mode with twice as many elements as VMODE. */
+/* ??? Consider moving this to a table generated by genmodes.c. */
-tree
-ix86_canonical_va_list_type (tree type)
+static enum machine_mode
+doublesize_vector_mode (enum machine_mode vmode)
+{
+ switch (vmode)
+ {
+ case V2SFmode: return V4SFmode;
+ case V1DImode: return V2DImode;
+ case V2SImode: return V4SImode;
+ case V4HImode: return V8HImode;
+ case V8QImode: return V16QImode;
+
+ case V2DFmode: return V4DFmode;
+ case V4SFmode: return V8SFmode;
+ case V2DImode: return V4DImode;
+ case V4SImode: return V8SImode;
+ case V8HImode: return V16HImode;
+ case V16QImode: return V32QImode;
+
+ case V4DFmode: return V8DFmode;
+ case V8SFmode: return V16SFmode;
+ case V4DImode: return V8DImode;
+ case V8SImode: return V16SImode;
+ case V16HImode: return V32HImode;
+ case V32QImode: return V64QImode;
+
+ default:
+ gcc_unreachable ();
+ }
+}
+
+/* Construct (set target (vec_select op0 (parallel perm))) and
+ return true if that's a valid instruction in the active ISA. */
+
+static bool
+expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
{
- tree wtype, htype;
+ rtx rperm[MAX_VECT_LEN], x;
+ unsigned i;
- /* Resolve references and pointers to va_list type. */
- if (INDIRECT_REF_P (type))
- type = TREE_TYPE (type);
- else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
- type = TREE_TYPE (type);
+ for (i = 0; i < nelt; ++i)
+ rperm[i] = GEN_INT (perm[i]);
- if (TARGET_64BIT)
+ x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
+ x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
+ x = gen_rtx_SET (VOIDmode, target, x);
+
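+ /* Emit the insn tentatively; if it does not match a pattern in the
+ active ISA, remove it again and report failure.  */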
+ x = emit_insn (x);
+ if (recog_memoized (x) < 0)
{
- wtype = va_list_type_node;
- gcc_assert (wtype != NULL_TREE);
- htype = type;
- if (TREE_CODE (wtype) == ARRAY_TYPE)
- {
- /* If va_list is an array type, the argument may have decayed
- to a pointer type, e.g. by being passed to another function.
- In that case, unwrap both types so that we can compare the
- underlying records. */
- if (TREE_CODE (htype) == ARRAY_TYPE
- || POINTER_TYPE_P (htype))
- {
- wtype = TREE_TYPE (wtype);
- htype = TREE_TYPE (htype);
- }
- }
- if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
- return va_list_type_node;
- wtype = sysv_va_list_type_node;
- gcc_assert (wtype != NULL_TREE);
- htype = type;
- if (TREE_CODE (wtype) == ARRAY_TYPE)
- {
+ remove_insn (x);
+ return false;
+ }
+ return true;
+}
+
+/* Similar, but generate a vec_concat from op0 and op1 as well. */
+
+static bool
+expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
+ const unsigned char *perm, unsigned nelt)
+{
+ enum machine_mode v2mode;
+ rtx x;
+
+ v2mode = doublesize_vector_mode (GET_MODE (op0));
+ x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
+ return expand_vselect (target, x, perm, nelt);
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+ in terms of blendp[sd] / pblendw / pblendvb. */
+
+static bool
+expand_vec_perm_blend (struct expand_vec_perm_d *d)
+{
+ enum machine_mode vmode = d->vmode;
+ unsigned i, mask, nelt = d->nelt;
+ rtx target, op0, op1, x;
+
+ if (!TARGET_SSE4_1 || d->op0 == d->op1)
+ return false;
+ if (!(GET_MODE_SIZE (vmode) == 16 || vmode == V4DFmode || vmode == V8SFmode))
+ return false;
+
+ /* This is a blend, not a permute. Elements must stay in their
+ respective lanes. */
+ for (i = 0; i < nelt; ++i)
+ {
+ unsigned e = d->perm[i];
+ if (!(e == i || e == i + nelt))
+ return false;
+ }
+
+ if (d->testing_p)
+ return true;
+
+ /* ??? Without SSE4.1, we could implement this with and/andn/or. This
+ decision should be extracted elsewhere, so that we only try that
+ sequence once all budget==3 options have been tried. */
+
+ /* For bytes, see if bytes move in pairs so we can use pblendw with
+ an immediate argument, rather than pblendvb with a vector argument. */
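+ /* e.g. the byte permutation { 0,1, 18,19, 4,5, 22,23, 8,9, 26,27,
+ 12,13, 30,31 } moves whole words and is equivalent to a pblendw
+ with immediate 0xaa.  */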
+ if (vmode == V16QImode)
+ {
+ bool pblendw_ok = true;
+ for (i = 0; i < 16 && pblendw_ok; i += 2)
+ pblendw_ok = (d->perm[i] + 1 == d->perm[i + 1]);
+
+ if (!pblendw_ok)
+ {
+ rtx rperm[16], vperm;
+
+ for (i = 0; i < nelt; ++i)
+ rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
+
+ vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm));
+ vperm = force_reg (V16QImode, vperm);
+
+ emit_insn (gen_sse4_1_pblendvb (d->target, d->op0, d->op1, vperm));
+ return true;
+ }
+ }
+
+ target = d->target;
+ op0 = d->op0;
+ op1 = d->op1;
+ mask = 0;
+
+ switch (vmode)
+ {
+ case V4DFmode:
+ case V8SFmode:
+ case V2DFmode:
+ case V4SFmode:
+ case V8HImode:
+ for (i = 0; i < nelt; ++i)
+ mask |= (d->perm[i] >= nelt) << i;
+ break;
+
+ case V2DImode:
+ for (i = 0; i < 2; ++i)
+ mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
+ goto do_subreg;
+
+ case V4SImode:
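+ /* Scale the mask up from SImode elements to the HImode words used
+ by the V8HImode blend below; e.g. { 0, 5, 2, 7 } yields 0xcc.  */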
+ for (i = 0; i < 4; ++i)
+ mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
+ goto do_subreg;
+
+ case V16QImode:
+ for (i = 0; i < 8; ++i)
+ mask |= (d->perm[i * 2] >= 16) << i;
+
+ do_subreg:
+ vmode = V8HImode;
+ target = gen_lowpart (vmode, target);
+ op0 = gen_lowpart (vmode, op0);
+ op1 = gen_lowpart (vmode, op1);
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ /* This matches five different patterns, depending on the mode. */
+ x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
+ x = gen_rtx_SET (VOIDmode, target, x);
+ emit_insn (x);
+
+ return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+ in terms of the variable form of vpermilps.
+
+ Note that we will have already failed the immediate form of vpermilps,
+ which requires that the high and low part shuffle be identical; the
+ variable form doesn't require that. */
+
+static bool
+expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
+{
+ rtx rperm[8], vperm;
+ unsigned i;
+
+ if (!TARGET_AVX || d->vmode != V8SFmode || d->op0 != d->op1)
+ return false;
+
+ /* We can only permute within the 128-bit lane. */
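+ /* e.g. { 1, 0, 3, 2, 5, 4, 7, 6 } is acceptable, but
+ { 4, 5, 6, 7, 0, 1, 2, 3 } would move elements across the 128-bit
+ lane boundary and is rejected.  */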
+ for (i = 0; i < 8; ++i)
+ {
+ unsigned e = d->perm[i];
+ if (i < 4 ? e >= 4 : e < 4)
+ return false;
+ }
+
+ if (d->testing_p)
+ return true;
+
+ for (i = 0; i < 8; ++i)
+ {
+ unsigned e = d->perm[i];
+
+ /* Within each 128-bit lane, the elements of op0 are numbered
+ from 0 and the elements of op1 are numbered from 4. */
+ if (e >= 8 + 4)
+ e -= 8;
+ else if (e >= 4)
+ e -= 4;
+
+ rperm[i] = GEN_INT (e);
+ }
+
+ vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
+ vperm = force_reg (V8SImode, vperm);
+ emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
+
+ return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+ in terms of pshufb or vpperm. */
+
+static bool
+expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
+{
+ unsigned i, nelt, eltsz;
+ rtx rperm[16], vperm, target, op0, op1;
+
+ if (!(d->op0 == d->op1 ? TARGET_SSSE3 : TARGET_XOP))
+ return false;
+ if (GET_MODE_SIZE (d->vmode) != 16)
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ nelt = d->nelt;
+ eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+
+ for (i = 0; i < nelt; ++i)
+ {
+ unsigned j, e = d->perm[i];
+ for (j = 0; j < eltsz; ++j)
+ rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
+ }
+
+ vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm));
+ vperm = force_reg (V16QImode, vperm);
+
+ target = gen_lowpart (V16QImode, d->target);
+ op0 = gen_lowpart (V16QImode, d->op0);
+ if (d->op0 == d->op1)
+ emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
+ else
+ {
+ op1 = gen_lowpart (V16QImode, d->op1);
+ emit_insn (gen_xop_pperm (target, op0, op1, vperm));
+ }
+
+ return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
+ in a single instruction. */
+
+static bool
+expand_vec_perm_1 (struct expand_vec_perm_d *d)
+{
+ unsigned i, nelt = d->nelt;
+ unsigned char perm2[MAX_VECT_LEN];
+
+ /* Check plain VEC_SELECT first, because AVX has instructions that could
+ match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
+ input where SEL+CONCAT may not. */
+ if (d->op0 == d->op1)
+ {
+ int mask = nelt - 1;
+
+ for (i = 0; i < nelt; i++)
+ perm2[i] = d->perm[i] & mask;
+
+ if (expand_vselect (d->target, d->op0, perm2, nelt))
+ return true;
+
+ /* There are plenty of patterns in sse.md that are written for
+ SEL+CONCAT and are not replicated for a single op. Perhaps
+ that should be changed, to avoid the nastiness here. */
+
+ /* Recognize interleave style patterns, which means incrementing
+ every other permutation operand. */
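+ /* e.g. the V4SI permutation { 0, 0, 1, 1 } becomes { 0, 4, 1, 5 },
+ which matches the interleave-low (punpckldq) pattern applied to
+ the operand and itself.  */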
+ for (i = 0; i < nelt; i += 2)
+ {
+ perm2[i] = d->perm[i] & mask;
+ perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
+ }
+ if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
+ return true;
+
+ /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
+ if (nelt >= 4)
+ {
+ for (i = 0; i < nelt; i += 4)
+ {
+ perm2[i + 0] = d->perm[i + 0] & mask;
+ perm2[i + 1] = d->perm[i + 1] & mask;
+ perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
+ perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
+ }
+
+ if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
+ return true;
+ }
+ }
+
+ /* Finally, try the fully general two operand permute. */
+ if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt))
+ return true;
+
+ /* Recognize interleave style patterns with reversed operands. */
+ if (d->op0 != d->op1)
+ {
+ for (i = 0; i < nelt; ++i)
+ {
+ unsigned e = d->perm[i];
+ if (e >= nelt)
+ e -= nelt;
+ else
+ e += nelt;
+ perm2[i] = e;
+ }
+
+ if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt))
+ return true;
+ }
+
+ /* Try the SSE4.1 blend variable merge instructions. */
+ if (expand_vec_perm_blend (d))
+ return true;
+
+ /* Try one of the AVX vpermil variable permutations. */
+ if (expand_vec_perm_vpermil (d))
+ return true;
+
+ /* Try the SSSE3 pshufb or XOP vpperm variable permutation. */
+ if (expand_vec_perm_pshufb (d))
+ return true;
+
+ return false;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+ in terms of a pair of pshuflw + pshufhw instructions. */
+
+static bool
+expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
+{
+ unsigned char perm2[MAX_VECT_LEN];
+ unsigned i;
+ bool ok;
+
+ if (d->vmode != V8HImode || d->op0 != d->op1)
+ return false;
+
+ /* The two permutations only operate in 64-bit lanes. */
+ for (i = 0; i < 4; ++i)
+ if (d->perm[i] >= 4)
+ return false;
+ for (i = 4; i < 8; ++i)
+ if (d->perm[i] < 4)
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ /* Emit the pshuflw. */
+ memcpy (perm2, d->perm, 4);
+ for (i = 4; i < 8; ++i)
+ perm2[i] = i;
+ ok = expand_vselect (d->target, d->op0, perm2, 8);
+ gcc_assert (ok);
+
+ /* Emit the pshufhw. */
+ memcpy (perm2 + 4, d->perm + 4, 4);
+ for (i = 0; i < 4; ++i)
+ perm2[i] = i;
+ ok = expand_vselect (d->target, d->target, perm2, 8);
+ gcc_assert (ok);
+
+ return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
+ the permutation using the SSSE3 palignr instruction. This succeeds
+ when all of the elements in PERM fit within one vector and we merely
+ need to shift them down so that a single vector permutation has a
+ chance to succeed. */
+
+static bool
+expand_vec_perm_palignr (struct expand_vec_perm_d *d)
+{
+ unsigned i, nelt = d->nelt;
+ unsigned min, max;
+ bool in_order, ok;
+ rtx shift;
+
+ /* Even with AVX, palignr only operates on 128-bit vectors. */
+ if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
+ return false;
+
+ min = nelt, max = 0;
+ for (i = 0; i < nelt; ++i)
+ {
+ unsigned e = d->perm[i];
+ if (e < min)
+ min = e;
+ if (e > max)
+ max = e;
+ }
+ if (min == 0 || max - min >= nelt)
+ return false;
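+ /* e.g. the V8HI permutation { 3, 4, 5, 6, 7, 8, 9, 10 } has min == 3;
+ shifting the concatenated operands down by 3 elements (6 bytes)
+ leaves the result in order already.  */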
+
+ /* Given that we have SSSE3, we know we'll be able to implement the
+ single operand permutation after the palignr with pshufb. */
+ if (d->testing_p)
+ return true;
+
+ shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
+ emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
+ gen_lowpart (TImode, d->op1),
+ gen_lowpart (TImode, d->op0), shift));
+
+ d->op0 = d->op1 = d->target;
+
+ in_order = true;
+ for (i = 0; i < nelt; ++i)
+ {
+ unsigned e = d->perm[i] - min;
+ if (e != i)
+ in_order = false;
+ d->perm[i] = e;
+ }
+
+ /* Test for the degenerate case where the alignment by itself
+ produces the desired permutation. */
+ if (in_order)
+ return true;
+
+ ok = expand_vec_perm_1 (d);
+ gcc_assert (ok);
+
+ return ok;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
+ a two vector permutation into a single vector permutation by using
+ an interleave operation to merge the vectors. */
+
+static bool
+expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
+{
+ struct expand_vec_perm_d dremap, dfinal;
+ unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
+ unsigned contents, h1, h2, h3, h4;
+ unsigned char remap[2 * MAX_VECT_LEN];
+ rtx seq;
+ bool ok;
+
+ if (d->op0 == d->op1)
+ return false;
+
+ /* The 256-bit unpck[lh]p[sd] instructions only operate within the 128-bit
+ lanes. We can use similar techniques with the vperm2f128 instruction,
+ but it requires slightly different logic. */
+ if (GET_MODE_SIZE (d->vmode) != 16)
+ return false;
+
+ /* Examine whence the elements come. */
+ contents = 0;
+ for (i = 0; i < nelt; ++i)
+ contents |= 1u << d->perm[i];
+
+ /* Split the two input vectors into 4 halves. */
+ h1 = (1u << nelt2) - 1;
+ h2 = h1 << nelt2;
+ h3 = h2 << nelt2;
+ h4 = h3 << nelt2;
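+ /* e.g. for nelt == 8 these are 0x000f, 0x00f0, 0x0f00 and 0xf000.  */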
+
+ memset (remap, 0xff, sizeof (remap));
+ dremap = *d;
+
+ /* If the elements come from both low halves, use interleave low; and
+ similarly for interleave high. If the elements are from mismatched
+ halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
+ if ((contents & (h1 | h3)) == contents)
+ {
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i] = i * 2;
+ remap[i + nelt] = i * 2 + 1;
+ dremap.perm[i * 2] = i;
+ dremap.perm[i * 2 + 1] = i + nelt;
+ }
+ }
+ else if ((contents & (h2 | h4)) == contents)
+ {
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i + nelt2] = i * 2;
+ remap[i + nelt + nelt2] = i * 2 + 1;
+ dremap.perm[i * 2] = i + nelt2;
+ dremap.perm[i * 2 + 1] = i + nelt + nelt2;
+ }
+ }
+ else if ((contents & (h1 | h4)) == contents)
+ {
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i] = i;
+ remap[i + nelt + nelt2] = i + nelt2;
+ dremap.perm[i] = i;
+ dremap.perm[i + nelt2] = i + nelt + nelt2;
+ }
+ if (nelt != 4)
+ {
+ dremap.vmode = V2DImode;
+ dremap.nelt = 2;
+ dremap.perm[0] = 0;
+ dremap.perm[1] = 3;
+ }
+ }
+ else if ((contents & (h2 | h3)) == contents)
+ {
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i + nelt2] = i;
+ remap[i + nelt] = i + nelt2;
+ dremap.perm[i] = i + nelt2;
+ dremap.perm[i + nelt2] = i + nelt;
+ }
+ if (nelt != 4)
+ {
+ dremap.vmode = V2DImode;
+ dremap.nelt = 2;
+ dremap.perm[0] = 1;
+ dremap.perm[1] = 2;
+ }
+ }
+ else
+ return false;
+
+ /* Use the remapping array set up above to move the elements from their
+ swizzled locations into their final destinations. */
+ dfinal = *d;
+ for (i = 0; i < nelt; ++i)
+ {
+ unsigned e = remap[d->perm[i]];
+ gcc_assert (e < nelt);
+ dfinal.perm[i] = e;
+ }
+ dfinal.op0 = gen_reg_rtx (dfinal.vmode);
+ dfinal.op1 = dfinal.op0;
+ dremap.target = dfinal.op0;
+
+ /* Test if the final remap can be done with a single insn. For V4SFmode or
+ V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
+ start_sequence ();
+ ok = expand_vec_perm_1 (&dfinal);
+ seq = get_insns ();
+ end_sequence ();
+
+ if (!ok)
+ return false;
+
+ if (dremap.vmode != dfinal.vmode)
+ {
+ dremap.target = gen_lowpart (dremap.vmode, dremap.target);
+ dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
+ dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
+ }
+
+ ok = expand_vec_perm_1 (&dremap);
+ gcc_assert (ok);
+
+ emit_insn (seq);
+ return true;
+}
+
+/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
+ permutation with two pshufb insns and an ior. We should have already
+ failed all two-instruction sequences. */
+
+static bool
+expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
+{
+ rtx rperm[2][16], vperm, l, h, op, m128;
+ unsigned int i, nelt, eltsz;
+
+ if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
+ return false;
+ gcc_assert (d->op0 != d->op1);
+
+ nelt = d->nelt;
+ eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+
+ /* Generate two permutation masks. If the required element is within
+ the given vector it is shuffled into the proper lane. If the required
+ element is in the other vector, force a zero into the lane by setting
+ bit 7 in the permutation mask. */
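+ /* e.g. for the V8HI even extract { 0, 2, 4, 6, 8, 10, 12, 14 }, the
+ first mask picks bytes 0,1 4,5 8,9 12,13 of op0 into the low half
+ and zeros the rest, the second does the same with op1 into the high
+ half, and the ior below merges the two.  */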
+ m128 = GEN_INT (-128);
+ for (i = 0; i < nelt; ++i)
+ {
+ unsigned j, e = d->perm[i];
+ unsigned which = (e >= nelt);
+ if (e >= nelt)
+ e -= nelt;
+
+ for (j = 0; j < eltsz; ++j)
+ {
+ rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
+ rperm[1-which][i*eltsz + j] = m128;
+ }
+ }
+
+ vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
+ vperm = force_reg (V16QImode, vperm);
+
+ l = gen_reg_rtx (V16QImode);
+ op = gen_lowpart (V16QImode, d->op0);
+ emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
+
+ vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
+ vperm = force_reg (V16QImode, vperm);
+
+ h = gen_reg_rtx (V16QImode);
+ op = gen_lowpart (V16QImode, d->op1);
+ emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
+
+ op = gen_lowpart (V16QImode, d->target);
+ emit_insn (gen_iorv16qi3 (op, l, h));
+
+ return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
+ and extract-odd permutations. */
+
+static bool
+expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
+{
+ rtx t1, t2, t3, t4;
+
+ switch (d->vmode)
+ {
+ case V4DFmode:
+ t1 = gen_reg_rtx (V4DFmode);
+ t2 = gen_reg_rtx (V4DFmode);
+
+ /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
+ emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
+ emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
+
+ /* Now an unpck[lh]pd will produce the result required. */
+ if (odd)
+ t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
+ else
+ t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
+ emit_insn (t3);
+ break;
+
+ case V8SFmode:
+ {
+ static const unsigned char perm1[8] = { 0, 2, 1, 3, 4, 6, 5, 7 };
+ static const unsigned char perme[8] = { 0, 1, 8, 9, 4, 5, 12, 13 };
+ static const unsigned char permo[8] = { 2, 3, 10, 11, 6, 7, 14, 15 };
+
+ t1 = gen_reg_rtx (V8SFmode);
+ t2 = gen_reg_rtx (V8SFmode);
+ t3 = gen_reg_rtx (V8SFmode);
+ t4 = gen_reg_rtx (V8SFmode);
+
+ /* Shuffle within the 128-bit lanes to produce:
+ { 0 2 1 3 4 6 5 7 } and { 8 a 9 b c e d f }. */
+ expand_vselect (t1, d->op0, perm1, 8);
+ expand_vselect (t2, d->op1, perm1, 8);
+
+ /* Shuffle the lanes around to produce:
+ { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
+ emit_insn (gen_avx_vperm2f128v8sf3 (t3, t1, t2, GEN_INT (0x20)));
+ emit_insn (gen_avx_vperm2f128v8sf3 (t4, t1, t2, GEN_INT (0x31)));
+
+ /* Now a vpermil2p will produce the result required. */
+ /* ??? The vpermil2p requires a vector constant. Another option
+ is a unpck[lh]ps to merge the two vectors to produce
+ { 0 4 2 6 8 c a e } or { 1 5 3 7 9 d b f }. Then use another
+ vpermilps to get the elements into the final order. */
+ d->op0 = t3;
+ d->op1 = t4;
+ memcpy (d->perm, odd ? permo : perme, 8);
+ expand_vec_perm_vpermil (d);
+ }
+ break;
+
+ case V2DFmode:
+ case V4SFmode:
+ case V2DImode:
+ case V4SImode:
+ /* These are always directly implementable by expand_vec_perm_1. */
+ gcc_unreachable ();
+
+ case V8HImode:
+ if (TARGET_SSSE3)
+ return expand_vec_perm_pshufb2 (d);
+ else
+ {
+ /* We need 2*log2(N)-1 operations to achieve odd/even
+ with interleave. */
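+ /* e.g. with op0 = a0 ... a7 and op1 = b0 ... b7:
+ t1     = a4 b4 a5 b5 a6 b6 a7 b7
+ target = a0 b0 a1 b1 a2 b2 a3 b3
+ t2     = a2 a6 b2 b6 a3 a7 b3 b7
+ target = a0 a4 b0 b4 a1 a5 b1 b5
+ after which the final interleave low (resp. high) produces
+ a0 a2 a4 a6 b0 b2 b4 b6 (resp. a1 a3 a5 a7 b1 b3 b5 b7).  */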
+ t1 = gen_reg_rtx (V8HImode);
+ t2 = gen_reg_rtx (V8HImode);
+ emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
+ emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
+ emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
+ emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
+ if (odd)
+ t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
+ else
+ t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
+ emit_insn (t3);
+ }
+ break;
+
+ case V16QImode:
+ if (TARGET_SSSE3)
+ return expand_vec_perm_pshufb2 (d);
+ else
+ {
+ t1 = gen_reg_rtx (V16QImode);
+ t2 = gen_reg_rtx (V16QImode);
+ t3 = gen_reg_rtx (V16QImode);
+ emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
+ emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
+ emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
+ emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
+ emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
+ emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
+ if (odd)
+ t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
+ else
+ t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
+ emit_insn (t3);
+ }
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
+ extract-even and extract-odd permutations. */
+
+static bool
+expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
+{
+ unsigned i, odd, nelt = d->nelt;
+
+ odd = d->perm[0];
+ if (odd != 0 && odd != 1)
+ return false;
+
+ for (i = 1; i < nelt; ++i)
+ if (d->perm[i] != 2 * i + odd)
+ return false;
+
+ return expand_vec_perm_even_odd_1 (d, odd);
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
+ permutations. We assume that expand_vec_perm_1 has already failed. */
+
+static bool
+expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
+{
+ unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
+ enum machine_mode vmode = d->vmode;
+ unsigned char perm2[4];
+ rtx op0 = d->op0;
+ bool ok;
+
+ switch (vmode)
+ {
+ case V4DFmode:
+ case V8SFmode:
+ /* These are special-cased in sse.md so that we can optionally
+ use the vbroadcast instruction. They expand to two insns
+ if the input happens to be in a register. */
+ gcc_unreachable ();
+
+ case V2DFmode:
+ case V2DImode:
+ case V4SFmode:
+ case V4SImode:
+ /* These are always implementable using standard shuffle patterns. */
+ gcc_unreachable ();
+
+ case V8HImode:
+ case V16QImode:
+ /* These can be implemented via interleave. We save one insn by
+ stopping once we have promoted to V4SImode and then use pshufd. */
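+ /* e.g. to broadcast element 5 of a V8HImode vector: one interleave
+ high pairs a5 with itself, the result viewed as V4SImode has that
+ pair in element 1, and pshufd { 1, 1, 1, 1 } replicates it.  */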
+ do
+ {
+ optab otab = vec_interleave_low_optab;
+
+ if (elt >= nelt2)
+ {
+ otab = vec_interleave_high_optab;
+ elt -= nelt2;
+ }
+ nelt2 /= 2;
+
+ op0 = expand_binop (vmode, otab, op0, op0, NULL, 0, OPTAB_DIRECT);
+ vmode = get_mode_wider_vector (vmode);
+ op0 = gen_lowpart (vmode, op0);
+ }
+ while (vmode != V4SImode);
+
+ memset (perm2, elt, 4);
+ ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4);
+ gcc_assert (ok);
+ return true;
+
+ default:
+ gcc_unreachable ();
+ }
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
+ broadcast permutations. */
+
+static bool
+expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
+{
+ unsigned i, elt, nelt = d->nelt;
+
+ if (d->op0 != d->op1)
+ return false;
+
+ elt = d->perm[0];
+ for (i = 1; i < nelt; ++i)
+ if (d->perm[i] != elt)
+ return false;
+
+ return expand_vec_perm_broadcast_1 (d);
+}
+
+/* The guts of ix86_expand_vec_perm_builtin, also used by the ok hook.
+ With all of the interface bits taken care of, perform the expansion
+ in D and return true on success. */
+
+static bool
+ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
+{
+ /* Try a single instruction expansion. */
+ if (expand_vec_perm_1 (d))
+ return true;
+
+ /* Try sequences of two instructions. */
+
+ if (expand_vec_perm_pshuflw_pshufhw (d))
+ return true;
+
+ if (expand_vec_perm_palignr (d))
+ return true;
+
+ if (expand_vec_perm_interleave2 (d))
+ return true;
+
+ if (expand_vec_perm_broadcast (d))
+ return true;
+
+ /* Try sequences of three instructions. */
+
+ if (expand_vec_perm_pshufb2 (d))
+ return true;
+
+ /* ??? Look for narrow permutations whose element orderings would
+ allow the promotion to a wider mode. */
+
+ /* ??? Look for sequences of interleave or a wider permute that place
+ the data into the correct lanes for a half-vector shuffle like
+ pshuf[lh]w or vpermilps. */
+
+ /* ??? Look for sequences of interleave that produce the desired results.
+ The combinatorics of punpck[lh] get pretty ugly... */
+
+ if (expand_vec_perm_even_odd (d))
+ return true;
+
+ return false;
+}
+
+/* Extract the values from the vector CST into the permutation array in D.
+ Return 0 on error, 1 if all values from the permutation come from the
+ first vector, 2 if all values from the second vector, and 3 otherwise. */
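+/* e.g. with nelt == 4, { 0, 1, 2, 3 } returns 1, { 4, 5, 6, 7 } returns 2
+ (and D's permutation is folded down to { 0, 1, 2, 3 }), while
+ { 0, 5, 2, 7 } returns 3.  */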
+
+static int
+extract_vec_perm_cst (struct expand_vec_perm_d *d, tree cst)
+{
+ tree list = TREE_VECTOR_CST_ELTS (cst);
+ unsigned i, nelt = d->nelt;
+ int ret = 0;
+
+ for (i = 0; i < nelt; ++i, list = TREE_CHAIN (list))
+ {
+ unsigned HOST_WIDE_INT e;
+
+ if (!host_integerp (TREE_VALUE (list), 1))
+ return 0;
+ e = tree_low_cst (TREE_VALUE (list), 1);
+ if (e >= 2 * nelt)
+ return 0;
+
+ ret |= (e < nelt ? 1 : 2);
+ d->perm[i] = e;
+ }
+ gcc_assert (list == NULL);
+
+ /* For all elements from second vector, fold the elements to first. */
+ if (ret == 2)
+ for (i = 0; i < nelt; ++i)
+ d->perm[i] -= nelt;
+
+ return ret;
+}
+
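+/* Expand the vector permutation builtin described by EXP, returning
+ the permutation result or a zero vector on error.  */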
+static rtx
+ix86_expand_vec_perm_builtin (tree exp)
+{
+ struct expand_vec_perm_d d;
+ tree arg0, arg1, arg2;
+
+ arg0 = CALL_EXPR_ARG (exp, 0);
+ arg1 = CALL_EXPR_ARG (exp, 1);
+ arg2 = CALL_EXPR_ARG (exp, 2);
+
+ d.vmode = TYPE_MODE (TREE_TYPE (arg0));
+ d.nelt = GET_MODE_NUNITS (d.vmode);
+ d.testing_p = false;
+ gcc_assert (VECTOR_MODE_P (d.vmode));
+
+ if (TREE_CODE (arg2) != VECTOR_CST)
+ {
+ error_at (EXPR_LOCATION (exp),
+ "vector permutation requires vector constant");
+ goto exit_error;
+ }
+
+ switch (extract_vec_perm_cst (&d, arg2))
+ {
+ default:
+ gcc_unreachable ();
+
+ case 0:
+ error_at (EXPR_LOCATION (exp), "invalid vector permutation constant");
+ goto exit_error;
+
+ case 3:
+ if (!operand_equal_p (arg0, arg1, 0))
+ {
+ d.op0 = expand_expr (arg0, NULL_RTX, d.vmode, EXPAND_NORMAL);
+ d.op0 = force_reg (d.vmode, d.op0);
+ d.op1 = expand_expr (arg1, NULL_RTX, d.vmode, EXPAND_NORMAL);
+ d.op1 = force_reg (d.vmode, d.op1);
+ break;
+ }
+
+ /* The elements of PERM reference both operands even though the
+ operands are in fact identical. Allow easier matching of the
+ permutation by folding the permutation into the single input
+ vector. */
+ {
+ unsigned i, nelt = d.nelt;
+ for (i = 0; i < nelt; ++i)
+ if (d.perm[i] >= nelt)
+ d.perm[i] -= nelt;
+ }
+ /* FALLTHRU */
+
+ case 1:
+ d.op0 = expand_expr (arg0, NULL_RTX, d.vmode, EXPAND_NORMAL);
+ d.op0 = force_reg (d.vmode, d.op0);
+ d.op1 = d.op0;
+ break;
+
+ case 2:
+ d.op0 = expand_expr (arg1, NULL_RTX, d.vmode, EXPAND_NORMAL);
+ d.op0 = force_reg (d.vmode, d.op0);
+ d.op1 = d.op0;
+ break;
+ }
+
+ d.target = gen_reg_rtx (d.vmode);
+ if (ix86_expand_vec_perm_builtin_1 (&d))
+ return d.target;
+
+ /* For compiler generated permutations, we should never get here, because
+ the compiler should also be checking the ok hook. But since this is a
+ builtin that the user has access to, don't abort. */
+ switch (d.nelt)
+ {
+ case 2:
+ sorry ("vector permutation (%d %d)", d.perm[0], d.perm[1]);
+ break;
+ case 4:
+ sorry ("vector permutation (%d %d %d %d)",
+ d.perm[0], d.perm[1], d.perm[2], d.perm[3]);
+ break;
+ case 8:
+ sorry ("vector permutation (%d %d %d %d %d %d %d %d)",
+ d.perm[0], d.perm[1], d.perm[2], d.perm[3],
+ d.perm[4], d.perm[5], d.perm[6], d.perm[7]);
+ break;
+ case 16:
+ sorry ("vector permutation "
+ "(%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d)",
+ d.perm[0], d.perm[1], d.perm[2], d.perm[3],
+ d.perm[4], d.perm[5], d.perm[6], d.perm[7],
+ d.perm[8], d.perm[9], d.perm[10], d.perm[11],
+ d.perm[12], d.perm[13], d.perm[14], d.perm[15]);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ exit_error:
+ return CONST0_RTX (d.vmode);
+}
+
+/* Implement targetm.vectorize.builtin_vec_perm_ok. */
+
+static bool
+ix86_vectorize_builtin_vec_perm_ok (tree vec_type, tree mask)
+{
+ struct expand_vec_perm_d d;
+ int vec_mask;
+ bool ret, one_vec;
+
+ d.vmode = TYPE_MODE (vec_type);
+ d.nelt = GET_MODE_NUNITS (d.vmode);
+ d.testing_p = true;
+
+ /* Given sufficient ISA support we can just return true here
+ for selected vector modes. */
+ if (GET_MODE_SIZE (d.vmode) == 16)
+ {
+ /* All implementable with a single vpperm insn. */
+ if (TARGET_XOP)
+ return true;
+ /* All implementable with 2 pshufb + 1 ior. */
+ if (TARGET_SSSE3)
+ return true;
+ /* All implementable with shufpd or unpck[lh]pd. */
+ if (d.nelt == 2)
+ return true;
+ }
+
+ vec_mask = extract_vec_perm_cst (&d, mask);
+
+ /* This hook cannot be called in response to something that the
+ user does (unlike the builtin expander), so we shouldn't ever see
+ an error generated from the extract. */
+ gcc_assert (vec_mask > 0 && vec_mask <= 3);
+ one_vec = (vec_mask != 3);
+
+ /* Implementable with shufps or pshufd. */
+ if (one_vec && (d.vmode == V4SFmode || d.vmode == V4SImode))
+ return true;
+
+ /* Otherwise we have to go through the motions and see if we can
+ figure out how to generate the requested permutation. */
+ d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
+ d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
+ if (!one_vec)
+ d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
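+
+ /* Note the use of raw REGs above: the expansion is only a dry run and
+ the generated sequence is discarded below, so we avoid creating new
+ pseudos for the probe.  */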
+
+ start_sequence ();
+ ret = ix86_expand_vec_perm_builtin_1 (&d);
+ end_sequence ();
+
+ return ret;
+}
+
+void
+ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
+{
+ struct expand_vec_perm_d d;
+ unsigned i, nelt;
+
+ d.target = targ;
+ d.op0 = op0;
+ d.op1 = op1;
+ d.vmode = GET_MODE (targ);
+ d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+ d.testing_p = false;
+
+ for (i = 0; i < nelt; ++i)
+ d.perm[i] = i * 2 + odd;
+
+ /* We'll either be able to implement the permutation directly... */
+ if (expand_vec_perm_1 (&d))
+ return;
+
+ /* ... or we use the special-case patterns. */
+ expand_vec_perm_even_odd_1 (&d, odd);
+}
+\f
+/* Return the va_list type node specific to the calling ABI of FNDECL. */
+
+tree
+ix86_fn_abi_va_list (tree fndecl)
+{
+ if (!TARGET_64BIT)
+ return va_list_type_node;
+ gcc_assert (fndecl != NULL_TREE);
+
+ if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
+ return ms_va_list_type_node;
+ else
+ return sysv_va_list_type_node;
+}
+
+/* Returns the canonical va_list type specified by TYPE. If there
+ is no valid TYPE provided, it returns NULL_TREE. */
+
+tree
+ix86_canonical_va_list_type (tree type)
+{
+ tree wtype, htype;
+
+ /* Resolve references and pointers to va_list type. */
+ if (INDIRECT_REF_P (type))
+ type = TREE_TYPE (type);
+ else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
+ type = TREE_TYPE (type);
+
+ if (TARGET_64BIT)
+ {
+ wtype = va_list_type_node;
+ gcc_assert (wtype != NULL_TREE);
+ htype = type;
+ if (TREE_CODE (wtype) == ARRAY_TYPE)
+ {
+ /* If va_list is an array type, the argument may have decayed
+ to a pointer type, e.g. by being passed to another function.
+ In that case, unwrap both types so that we can compare the
+ underlying records. */
+ if (TREE_CODE (htype) == ARRAY_TYPE
+ || POINTER_TYPE_P (htype))
+ {
+ wtype = TREE_TYPE (wtype);
+ htype = TREE_TYPE (htype);
+ }
+ }
+ if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
+ return va_list_type_node;
+ wtype = sysv_va_list_type_node;
+ gcc_assert (wtype != NULL_TREE);
+ htype = type;
+ if (TREE_CODE (wtype) == ARRAY_TYPE)
+ {
/* If va_list is an array type, the argument may have decayed
to a pointer type, e.g. by being passed to another function.
In that case, unwrap both types so that we can compare the
#define TARGET_DEFAULT_TARGET_FLAGS \
(TARGET_DEFAULT \
| TARGET_SUBTARGET_DEFAULT \
- | TARGET_TLS_DIRECT_SEG_REFS_DEFAULT)
+ | TARGET_TLS_DIRECT_SEG_REFS_DEFAULT \
+ | MASK_FUSED_MADD)
#undef TARGET_HANDLE_OPTION
#define TARGET_HANDLE_OPTION ix86_handle_option
#define TARGET_SECONDARY_RELOAD ix86_secondary_reload
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
-#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST x86_builtin_vectorization_cost
+#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
+ ix86_builtin_vectorization_cost
+#undef TARGET_VECTORIZE_BUILTIN_VEC_PERM
+#define TARGET_VECTORIZE_BUILTIN_VEC_PERM \
+ ix86_vectorize_builtin_vec_perm
+#undef TARGET_VECTORIZE_BUILTIN_VEC_PERM_OK
+#define TARGET_VECTORIZE_BUILTIN_VEC_PERM_OK \
+ ix86_vectorize_builtin_vec_perm_ok
#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE ix86_can_eliminate
+#undef TARGET_ASM_CODE_END
+#define TARGET_ASM_CODE_END ix86_code_end
+
struct gcc_target targetm = TARGET_INITIALIZER;
\f
#include "gt-i386.h"