OSDN Git Service

Add support for AVX2 builtin functions.
authorhjl <hjl@138bc75d-0d04-0410-961f-82ee72b054a4>
Mon, 22 Aug 2011 13:57:18 +0000 (13:57 +0000)
committerhjl <hjl@138bc75d-0d04-0410-961f-82ee72b054a4>
Mon, 22 Aug 2011 13:57:18 +0000 (13:57 +0000)
2011-08-22  Kirill Yukhin  <kirill.yukhin@intel.com>

* config/i386/avx2intrin.h: New file.
* config/i386/i386-builtin-types.def (PCINT, PCINT64, PV4SI,
PV8SI, V32QI_FTYPE_V32QI, V32QI_FTYPE_V16QI, V16HI_FTYPE_V16HI,
V16HI_FTYPE_V8HI, V8SI_FTYPE_V8SI, V16HI_FTYPE_V16QI,
V8SI_FTYPE_V16QI, V4DI_FTYPE_V16QI, V8SI_FTYPE_V8HI,
V4DI_FTYPE_V8HI, V4DI_FTYPE_V4SI, V4DI_FTYPE_PV4DI,
V4DI_FTYPE_V2DI, V2DI_FTYPE_PCV2DI_V2DI, V4SI_FTYPE_PCV4SI_V4SI,
V32QI_FTYPE_V16HI_V16HI, V16HI_FTYPE_V8SI_V8SI,
V32QI_FTYPE_V32QI_V32QI, V16HI_FTYPE_V32QI_V32QI,
V16HI_FTYPE_V16HI_V8HI, V16HI_FTYPE_V16HI_V16HI,
V16HI_FTYPE_V16HI_INT, V16HI_FTYPE_V16HI_SI,
V16HI_FTYPE_V16HI_V16HI_INT, V32QI_FTYPE_V32QI_V32QI_INT,
V8SI_FTYPE_V8SI_V4SI, V8SI_FTYPE_V8SI_V8SI,
V8SI_FTYPE_V16HI_V16HI, V8SI_FTYPE_V8SI_INT, V8SI_FTYPE_V8SI_SI,
V8SI_FTYPE_PCV8SI_V8SI, V4DI_FTYPE_V4DI_V4DI,
V4DI_FTYPE_V8SI_V8SI, V4DI_FTYPE_V4DI_V2DI,
V4DI_FTYPE_PCV4DI_V4DI, V4DI_FTYPE_V4DI_INT,
V2DI_FTYPE_V4DI_INT, V4DI_FTYPE_V4DI_V4DI_INT,
V4DI_FTYPE_V4DI_V2DI_INT, VOID_FTYPE_PV2DI_V2DI_V2DI,
VOID_FTYPE_PV4DI_V4DI_V4DI, VOID_FTYPE_PV4SI_V4SI_V4SI,
VOID_FTYPE_PV8SI_V8SI_V8SI,
V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
V16HI_FTYPE_V16HI_SI_COUNT, V16HI_FTYPE_V16HI_V8HI_COUNT,
V8SI_FTYPE_V8SI_SI_COUNT, V8SI_FTYPE_V8SI_V4SI_COUNT,
V4DI_FTYPE_V4DI_INT_COUNT, V4DI_FTYPE_V4DI_V2DI_COUNT,
V4DI_FTYPE_V4DI_INT_CONVERT,
V4DI_FTYPE_V4DI_V4DI_INT_CONVERT): New.
* config/i386/i386.c (ix86_builtins): Add IX86_BUILTIN_MPSADBW256,
IX86_BUILTIN_PABSB256, IX86_BUILTIN_PABSW256,
IX86_BUILTIN_PABSD256, IX86_BUILTIN_PACKSSDW256,
IX86_BUILTIN_PACKSSWB256, IX86_BUILTIN_PACKUSDW256,
IX86_BUILTIN_PACKUSWB256, IX86_BUILTIN_PADDB256,
IX86_BUILTIN_PADDW256, IX86_BUILTIN_PADDD256,
IX86_BUILTIN_PADDQ256, IX86_BUILTIN_PADDSB256,
IX86_BUILTIN_PADDSW256, IX86_BUILTIN_PADDUSB256,
IX86_BUILTIN_PADDUSW256, IX86_BUILTIN_PALIGNR256,
IX86_BUILTIN_AND256I, IX86_BUILTIN_ANDNOT256I,
IX86_BUILTIN_PAVGB256, IX86_BUILTIN_PAVGW256,
IX86_BUILTIN_PBLENDVB256, IX86_BUILTIN_PBLENDVW256,
IX86_BUILTIN_PCMPEQB256, IX86_BUILTIN_PCMPEQW256,
IX86_BUILTIN_PCMPEQD256, IX86_BUILTIN_PCMPEQQ256,
IX86_BUILTIN_PCMPGTB256, IX86_BUILTIN_PCMPGTW256,
IX86_BUILTIN_PCMPGTD256, IX86_BUILTIN_PCMPGTQ256,
IX86_BUILTIN_PHADDW256, IX86_BUILTIN_PHADDD256,
IX86_BUILTIN_PHADDSW256, IX86_BUILTIN_PHSUBW256,
IX86_BUILTIN_PHSUBD256, IX86_BUILTIN_PHSUBSW256,
IX86_BUILTIN_PMADDUBSW256, IX86_BUILTIN_PMADDWD256,
IX86_BUILTIN_PMAXSB256, IX86_BUILTIN_PMAXSW256,
IX86_BUILTIN_PMAXSD256, IX86_BUILTIN_PMAXUB256,
IX86_BUILTIN_PMAXUW256, IX86_BUILTIN_PMAXUD256,
IX86_BUILTIN_PMINSB256, IX86_BUILTIN_PMINSW256,
IX86_BUILTIN_PMINSD256, IX86_BUILTIN_PMINUB256,
IX86_BUILTIN_PMINUW256, IX86_BUILTIN_PMINUD256,
IX86_BUILTIN_PMOVMSKB256, IX86_BUILTIN_PMOVSXBW256,
IX86_BUILTIN_PMOVSXBD256, IX86_BUILTIN_PMOVSXBQ256,
IX86_BUILTIN_PMOVSXWD256, IX86_BUILTIN_PMOVSXWQ256,
IX86_BUILTIN_PMOVSXDQ256, IX86_BUILTIN_PMOVZXBW256,
IX86_BUILTIN_PMOVZXBD256, IX86_BUILTIN_PMOVZXBQ256,
IX86_BUILTIN_PMOVZXWD256, IX86_BUILTIN_PMOVZXWQ256,
IX86_BUILTIN_PMOVZXDQ256, IX86_BUILTIN_PMULDQ256,
IX86_BUILTIN_PMULHRSW256, IX86_BUILTIN_PMULHUW256,
IX86_BUILTIN_PMULHW256, IX86_BUILTIN_PMULLW256,
IX86_BUILTIN_PMULLD256, IX86_BUILTIN_PMULUDQ256,
IX86_BUILTIN_POR256, IX86_BUILTIN_PSADBW256,
IX86_BUILTIN_PSHUFB256, IX86_BUILTIN_PSHUFD256,
IX86_BUILTIN_PSHUFHW256, IX86_BUILTIN_PSHUFLW256,
IX86_BUILTIN_PSIGNB256, IX86_BUILTIN_PSIGNW256,
IX86_BUILTIN_PSIGND256, IX86_BUILTIN_PSLLDQI256,
IX86_BUILTIN_PSLLWI256, IX86_BUILTIN_PSLLW256,
IX86_BUILTIN_PSLLDI256, IX86_BUILTIN_PSLLD256,
IX86_BUILTIN_PSLLQI256, IX86_BUILTIN_PSLLQ256,
IX86_BUILTIN_PSRAWI256, IX86_BUILTIN_PSRAW256,
IX86_BUILTIN_PSRADI256, IX86_BUILTIN_PSRAD256,
IX86_BUILTIN_PSRLDQI256, IX86_BUILTIN_PSRLWI256,
IX86_BUILTIN_PSRLW256, IX86_BUILTIN_PSRLDI256,
IX86_BUILTIN_PSRLD256, IX86_BUILTIN_PSRLQI256,
IX86_BUILTIN_PSRLQ256, IX86_BUILTIN_PSUBB256,
IX86_BUILTIN_PSUBW256, IX86_BUILTIN_PSUBD256,
IX86_BUILTIN_PSUBQ256, IX86_BUILTIN_PSUBSB256,
IX86_BUILTIN_PSUBSW256, IX86_BUILTIN_PSUBUSB256,
IX86_BUILTIN_PSUBUSW256, IX86_BUILTIN_PUNPCKHBW256,
IX86_BUILTIN_PUNPCKHWD256, IX86_BUILTIN_PUNPCKHDQ256,
IX86_BUILTIN_PUNPCKHQDQ256, IX86_BUILTIN_PUNPCKLBW256,
IX86_BUILTIN_PUNPCKLWD256, IX86_BUILTIN_PUNPCKLDQ256,
IX86_BUILTIN_PUNPCKLQDQ256, IX86_BUILTIN_PXOR256,
IX86_BUILTIN_MOVNTDQA256, IX86_BUILTIN_VBROADCASTSS_PS,
IX86_BUILTIN_VBROADCASTSS_PS256,
IX86_BUILTIN_VBROADCASTSD_PD256,
IX86_BUILTIN_VBROADCASTSI256, IX86_BUILTIN_PBLENDD256,
IX86_BUILTIN_PBLENDD128, IX86_BUILTIN_PBROADCASTB256,
IX86_BUILTIN_PBROADCASTW256, IX86_BUILTIN_PBROADCASTD256,
IX86_BUILTIN_PBROADCASTQ256, IX86_BUILTIN_PBROADCASTB128,
IX86_BUILTIN_PBROADCASTW128, IX86_BUILTIN_PBROADCASTD128,
IX86_BUILTIN_PBROADCASTQ128, IX86_BUILTIN_VPERMVARSI256,
IX86_BUILTIN_VPERMDF256, IX86_BUILTIN_VPERMVARSF256,
IX86_BUILTIN_VPERMDI256, IX86_BUILTIN_VPERMTI256,
IX86_BUILTIN_VEXTRACT128I256, IX86_BUILTIN_VINSERT128I256,
IX86_BUILTIN_MASKLOADD, IX86_BUILTIN_MASKLOADQ,
IX86_BUILTIN_MASKLOADD256, IX86_BUILTIN_MASKLOADQ256,
IX86_BUILTIN_MASKSTORED, IX86_BUILTIN_MASKSTOREQ,
IX86_BUILTIN_MASKSTORED256, IX86_BUILTIN_MASKSTOREQ256,
IX86_BUILTIN_PSLLVV4DI, IX86_BUILTIN_PSLLVV2DI,
IX86_BUILTIN_PSLLVV8SI, IX86_BUILTIN_PSLLVV4SI,
IX86_BUILTIN_PSRAVV8SI, IX86_BUILTIN_PSRAVV4SI,
IX86_BUILTIN_PSRLVV4DI, IX86_BUILTIN_PSRLVV2DI,
IX86_BUILTIN_PSRLVV8SI, IX86_BUILTIN_PSRLVV4SI,
IX86_BUILTIN_GATHERSIV2DF, IX86_BUILTIN_GATHERSIV4DF,
IX86_BUILTIN_GATHERDIV2DF, IX86_BUILTIN_GATHERDIV4DF,
IX86_BUILTIN_GATHERSIV4SF, IX86_BUILTIN_GATHERSIV8SF,
IX86_BUILTIN_GATHERDIV4SF, IX86_BUILTIN_GATHERDIV8SF,
IX86_BUILTIN_GATHERSIV2DI, IX86_BUILTIN_GATHERSIV4DI,
IX86_BUILTIN_GATHERDIV2DI, IX86_BUILTIN_GATHERDIV4DI,
IX86_BUILTIN_GATHERSIV4SI, IX86_BUILTIN_GATHERSIV8SI,
IX86_BUILTIN_GATHERDIV4SI, IX86_BUILTIN_GATHERDIV8SI.
(bdesc_special_args): Add IX86_BUILTIN_MOVNTDQA256,
IX86_BUILTIN_MASKLOADD, IX86_BUILTIN_MASKLOADQ,
IX86_BUILTIN_MASKLOADD256, IX86_BUILTIN_MASKLOADQ256,
IX86_BUILTIN_MASKSTORED, IX86_BUILTIN_MASKSTOREQ,
IX86_BUILTIN_MASKSTORED256, IX86_BUILTIN_MASKSTOREQ256.
(bdesc_args): Add  IX86_BUILTIN_MPSADBW256,
IX86_BUILTIN_PABSB256, IX86_BUILTIN_PABSW256,
IX86_BUILTIN_PABSD256, IX86_BUILTIN_PACKSSDW256,
IX86_BUILTIN_PACKSSWB256, IX86_BUILTIN_PACKUSDW256,
IX86_BUILTIN_PACKUSWB256, IX86_BUILTIN_PADDB256,
IX86_BUILTIN_PADDW256, IX86_BUILTIN_PADDD256,
IX86_BUILTIN_PADDQ256, IX86_BUILTIN_PADDSB256,
IX86_BUILTIN_PADDSW256, IX86_BUILTIN_PADDUSB256,
IX86_BUILTIN_PADDUSW256, IX86_BUILTIN_PALIGNR256,
IX86_BUILTIN_AND256I, IX86_BUILTIN_ANDNOT256I,
IX86_BUILTIN_PAVGB256, IX86_BUILTIN_PAVGW256,
IX86_BUILTIN_PBLENDVB256, IX86_BUILTIN_PBLENDVW256,
IX86_BUILTIN_PCMPEQB256, IX86_BUILTIN_PCMPEQW256,
IX86_BUILTIN_PCMPEQD256, IX86_BUILTIN_PCMPEQQ256,
IX86_BUILTIN_PCMPGTB256, IX86_BUILTIN_PCMPGTW256,
IX86_BUILTIN_PCMPGTD256, IX86_BUILTIN_PCMPGTQ256,
IX86_BUILTIN_PHADDW256, IX86_BUILTIN_PHADDD256,
IX86_BUILTIN_PHADDSW256, IX86_BUILTIN_PHSUBW256,
IX86_BUILTIN_PHSUBD256, IX86_BUILTIN_PHSUBSW256,
IX86_BUILTIN_PMADDUBSW256, IX86_BUILTIN_PMADDWD256,
IX86_BUILTIN_PMAXSB256, IX86_BUILTIN_PMAXSW256,
IX86_BUILTIN_PMAXSD256, IX86_BUILTIN_PMAXUB256,
IX86_BUILTIN_PMAXUW256, IX86_BUILTIN_PMAXUD256,
IX86_BUILTIN_PMINSB256, IX86_BUILTIN_PMINSW256,
IX86_BUILTIN_PMINSD256, IX86_BUILTIN_PMINUB256,
IX86_BUILTIN_PMINUW256, IX86_BUILTIN_PMINUD256,
IX86_BUILTIN_PMOVMSKB256, IX86_BUILTIN_PMOVSXBW256,
IX86_BUILTIN_PMOVSXBD256, IX86_BUILTIN_PMOVSXBQ256,
IX86_BUILTIN_PMOVSXWD256, IX86_BUILTIN_PMOVSXWQ256,
IX86_BUILTIN_PMOVSXDQ256, IX86_BUILTIN_PMOVZXBW256,
IX86_BUILTIN_PMOVZXBD256, IX86_BUILTIN_PMOVZXBQ256,
IX86_BUILTIN_PMOVZXWD256, IX86_BUILTIN_PMOVZXWQ256,
IX86_BUILTIN_PMOVZXDQ256, IX86_BUILTIN_PMULDQ256,
IX86_BUILTIN_PMULHRSW256, IX86_BUILTIN_PMULHUW256,
IX86_BUILTIN_PMULHW256, IX86_BUILTIN_PMULLW256,
IX86_BUILTIN_PMULLD256, IX86_BUILTIN_PMULUDQ256,
IX86_BUILTIN_POR256, IX86_BUILTIN_PSADBW256,
IX86_BUILTIN_PSHUFB256, IX86_BUILTIN_PSHUFD256,
IX86_BUILTIN_PSHUFHW256, IX86_BUILTIN_PSHUFLW256,
IX86_BUILTIN_PSIGNB256, IX86_BUILTIN_PSIGNW256,
IX86_BUILTIN_PSIGND256, IX86_BUILTIN_PSLLDQI256,
IX86_BUILTIN_PSLLWI256, IX86_BUILTIN_PSLLW256,
IX86_BUILTIN_PSLLDI256, IX86_BUILTIN_PSLLD256,
IX86_BUILTIN_PSLLQI256, IX86_BUILTIN_PSLLQ256,
IX86_BUILTIN_PSRAWI256, IX86_BUILTIN_PSRAW256,
IX86_BUILTIN_PSRADI256, IX86_BUILTIN_PSRAD256,
IX86_BUILTIN_PSRLDQI256, IX86_BUILTIN_PSRLWI256,
IX86_BUILTIN_PSRLW256, IX86_BUILTIN_PSRLDI256,
IX86_BUILTIN_PSRLD256, IX86_BUILTIN_PSRLQI256,
IX86_BUILTIN_PSRLQ256, IX86_BUILTIN_PSUBB256,
IX86_BUILTIN_PSUBW256, IX86_BUILTIN_PSUBD256,
IX86_BUILTIN_PSUBQ256, IX86_BUILTIN_PSUBSB256,
IX86_BUILTIN_PSUBSW256, IX86_BUILTIN_PSUBUSB256,
IX86_BUILTIN_PSUBUSW256, IX86_BUILTIN_PUNPCKHBW256,
IX86_BUILTIN_PUNPCKHWD256, IX86_BUILTIN_PUNPCKHDQ256,
IX86_BUILTIN_PUNPCKHQDQ256, IX86_BUILTIN_PUNPCKLBW256,
IX86_BUILTIN_PUNPCKLWD256, IX86_BUILTIN_PUNPCKLDQ256,
IX86_BUILTIN_PUNPCKLQDQ256, IX86_BUILTIN_PXOR256,
IX86_BUILTIN_VBROADCASTSS_PS, IX86_BUILTIN_VBROADCASTSS_PS256,
IX86_BUILTIN_VBROADCASTSD_PD256,
IX86_BUILTIN_VBROADCASTSI256, IX86_BUILTIN_PBLENDD256,
IX86_BUILTIN_PBLENDD128, IX86_BUILTIN_PBROADCASTB256,
IX86_BUILTIN_PBROADCASTW256, IX86_BUILTIN_PBROADCASTD256,
IX86_BUILTIN_PBROADCASTQ256, IX86_BUILTIN_PBROADCASTB128,
IX86_BUILTIN_PBROADCASTW128, IX86_BUILTIN_PBROADCASTD128,
IX86_BUILTIN_PBROADCASTQ128, IX86_BUILTIN_VPERMVARSI256,
IX86_BUILTIN_VPERMDF256, IX86_BUILTIN_VPERMVARSF256,
IX86_BUILTIN_VPERMDI256, IX86_BUILTIN_VPERMTI256,
IX86_BUILTIN_VEXTRACT128I256, IX86_BUILTIN_VINSERT128I256,
IX86_BUILTIN_PSLLVV4DI, IX86_BUILTIN_PSLLVV2DI,
IX86_BUILTIN_PSLLVV8SI, IX86_BUILTIN_PSLLVV4SI,
IX86_BUILTIN_PSRAVV8SI, IX86_BUILTIN_PSRAVV4SI,
IX86_BUILTIN_PSRLVV4DI, IX86_BUILTIN_PSRLVV2DI,
IX86_BUILTIN_PSRLVV8SI, IX86_BUILTIN_PSRLVV4SI.
(ix86_init_mmx_sse_builtins): Add IX86_BUILTIN_GATHERSIV2DF,
IX86_BUILTIN_GATHERSIV4DF, IX86_BUILTIN_GATHERDIV2DF,
IX86_BUILTIN_GATHERDIV4DF, IX86_BUILTIN_GATHERSIV4SF,
IX86_BUILTIN_GATHERSIV8SF, IX86_BUILTIN_GATHERDIV4SF,
IX86_BUILTIN_GATHERDIV8SF, IX86_BUILTIN_GATHERSIV2DI,
IX86_BUILTIN_GATHERSIV4DI, IX86_BUILTIN_GATHERDIV2DI,
IX86_BUILTIN_GATHERDIV4DI, IX86_BUILTIN_GATHERSIV4SI,
IX86_BUILTIN_GATHERSIV8SI, IX86_BUILTIN_GATHERDIV4SI,
IX86_BUILTIN_GATHERDIV8SI.
(ix86_preferred_simd_mode): Support AVX2 modes.
(ix86_expand_args_builtin): Support AVX2 built-ins.
(ix86_expand_special_args_builtin): Likewise.
(ix86_expand_builtin): Likewise.
* config/i386/i386.md (UNSPEC_VPERMSI): New.
(UNSPEC_VPERMDF): Likewise.
(UNSPEC_VPERMSF): Likewise.
(UNSPEC_VPERMDI): Likewise.
(UNSPEC_VPERMTI): Likewise.
(UNSPEC_GATHER): Likewise.
(ssemodesuffix): Extend.
* config/i386/immintrin.h: Include avx2intrin.h when __AVX2__
is defined.
* config/i386/predicates.md (const1248_operand): New.
* config/i386/sse.md (VI_AVX2): New.
(VI1_AVX2): Likewise.
(VI2_AVX2): Likewise.
(VI4_AVX2): Likewise.
(VI8_AVX2): Likewise.
(VIMAX_AVX2): Likewise.
(SSESCALARMODE): Likewise.
(VI12_AVX2): Likewise.
(VI24_AVX2): Likewise.
(VI124_AVX2): Likewise.
(VI248_AVX2): Likewise.
(VI48_AVX2): Likewise.
(VI4SD_AVX2): Likewise.
(V48_AVX2): Likewise.
(avx2modesuffix): Likewise.
(sse_avx2): Likewise.
(sse2_avx2): Likewise.
(ssse3_avx2): Likewise.
(sse4_1_avx2): Likewise.
(avx_avx2): Likewise.
(lshift)<code_iterator>: Likewise.
(lshift_insn): Likewise.
(lshift)<code_attr>: Likewise.
(SSESHORTMODE): Likewise.
(SSELONGMODE): Likewise.
(SSEBYTEMODE): Likewise.
(AVXTOSSEMODE): Likewise.
(shortmode): Likewise.
(ssescalarmodesuffix): Update.
(sseunpackmode): Likewise.
(ssepackmode): Likewise.
(AVX256MODEI): New.
(AVX256MODE124): Likewise.
(AVX256MODE1248): Likewise.
(AVX256MODE248): Likewise.
(AVXMODE48P_SI): Likewise.
(AVXMODE48P_SI): Likewise.
(AVXMODE48P_DI): Likewise.
(AVXMODE48P_DI): Likewise.
(gthrfirstp): Likewise.
(gthrlastp): Likewise.
(avx2): Likewise.
(ssevecsize): Likewise.
(ssedoublesizemode): Likewise.
(avxvecmode): Likewise.
(avxvecsize): Likewise.
(avxhalfvecmode): Likewise.
(avxscalarmode): Likewise.
(avxpermvecmode): Likewise.
(avxmodesuffixp): Likewise.
(avxmodesuffix): Likewise.
(avx2_vec_dupv4sf): New.
(avx2_vec_dupv8sf): Likewise.
(avx2_interleave_highv4di): Likewise.
(avx2_interleave_lowv4di): Likewise.
(<plusminus_insn><mode>3): Update.
(*<plusminus_insn><mode>3): Likewise.
(sse2_<plusminus_insn><mode>3): Rename to ...
(<sse2_avx2>_<plusminus_insn><mode>3): ... this. Updated.
(*sse2_<plusminus_insn><mode>3): Likewise.
(*<sse2_avx2>_<plusminus_insn><mode>3): Likewise.
(mulv8hi3): Likewise.
(mul<mode>3): Likewise.
(*mulv8hi3): Likewise.
(*mul<mode>3): Likewise.
(<s>mulv8hi3_highpart): Likewise.
(<s>mul<mode>3_highpart): Likewise.
(*<s>mulv8hi3_highpart): Likewise.
(*<s>mul<mode>3_highpart): Likewise.
(avx2_umulv4siv4di3): Likewise.
(*avx_umulv4siv4di3): Likewise.
(sse4_1_mulv2siv2di3): Likewise.
(<sse4_1_avx2>_mul<shortmode><mode>3): Likewise.
(*sse4_1_mulv2siv2di3): Likewise.
(*<sse4_1_avx2>_mulv2siv2di3): Likewise.
(avx2_pmaddwd): New.
(*avx2_pmaddwd): Likewise.
(mulv4si3): Rename to ...
(mul<mode>3): ... this. Update.
(*sse4_1_mulv4si3): Likewise.
(*<sse4_1_avx2>_mul<mode>3): Likewise.
(ashr<mode>3): Update.
(avx2_lshrqv4di3): New.
(lshr<mode>3): Update.
(avx2_lshlqv4di3): New.
(avx2_lshl<mode>3): Likewise.
(sse2_ashlv1ti3): Rename to ...
(<sse2_avx2>_ashl<mode>3): ... this. Update.
(avx2_<code><mode>3)<umaxmin>: New.
(*avx2_<code><mode>3)<umaxmin>: Likewise.
(avx2_<code><mode>3)<smaxmin>: New.
(*avx2_<code><mode>3)<smaxmin>: Likewise.
(avx2_eq<mode>3): Likewise.
(*avx2_eq<mode>3): Likewise.
(avx2_gt<mode>3): Likewise.
(sse2_andnot<mode>3): Rename to ...
(<sse2_avx2>_andnot<mode>3): ... this. Update.
(*andnot<mode>3): Update.
(<code><mode>3)<any_logic>: Update.
(*<code><mode>3)<any_logic>: Likewise.
(sse2_packsswb): Rename to ...
(<sse2_avx2>_packsswb): ... this. Update.
(sse2_packssdw): Likewise.
(<sse2_avx2>_packssdw): Likewise.
(sse2_packuswb): Likewise.
(<sse2_avx2>_packuswb): Likewise.
(avx2_interleave_highv32qi): New.
(avx2_interleave_lowv32qi): Likewise.
(avx2_interleave_highv16hi): Likewise.
(avx2_interleave_lowv16hi): Likewise.
(avx2_interleave_highv8si): Likewise.
(avx2_interleave_lowv8si): Likewise.
(avx2_pshufd): New.
(avx2_pshufd_1): Likewise.
(avx2_pshuflwv3): Likewise.
(avx2_pshuflw_1): Likewise.
(avx2_pshufhwv3): Likewise.
(avx2_pshufhw_1): Likewise.
(avx2_uavgv32qi3): Likewise.
(*avx2_uavgv32qi3): Likewise.
(avx2_uavgv16hi3): Likewise.
(*avx2_uavgv16hi3): Likewise.
(sse2_psadbw): Rename to ...
(<sse2_avx2>_psadbw): ... this. Update.
(avx2_pmovmskb): New.
(avx2_phaddwv16hi3): Likewise.
(avx2_phadddv8si3): Likewise.
(avx2_phaddswv16hi3): Likewise.
(avx2_phsubwv16hi3): Likewise.
(avx2_phsubdv8si3): Likewise.
(avx2_phsubswv16hi3): Likewise.
(avx2_pmaddubsw256): Likewise.
(avx2_umulhrswv16hi3): Likewise.
(*avx2_umulhrswv16hi3): Likewise.
(ssse3_pshufbv16qi3): Rename to ...
(<ssse3_avx2>_pshufb<mode>3): ... this. Update.
(ssse3_psign<mode>3): Likewise.
(<ssse3_avx2>_psign<mode>3): Likewise.
(ssse3_palignrti): Likewise.
(<ssse3_avx2>_palignr<mode>): Likewise.
(abs<mode>2): Likewise.
(sse4_1_movntdqa): Rename to ...
(<sse4_1_avx2>_movntdqa): ... this. Update.
(sse4_1_mpsadbw): Likewise.
(<sse4_1_avx2>_mpsadbw): Likewise.
(avx2_packusdw): New.
(sse4_1_pblendvb): Rename to ...
(<sse4_1_avx2>_pblendvb): ... this. Update.
(sse4_1_pblendw): Likewise.
(<sse4_1_avx2>_pblendw): Likewise.
(avx2_pblendd<mode>): New.
(avx2_<code>v16qiv16hi2): Likewise.
(avx2_<code>v8qiv8si2): Likewise.
(avx2_<code>v8hiv8si2): Likewise.
(avx2_<code>v4qiv4di2): Likewise.
(avx2_<code>v4hiv4di2): Likewise.
(avx2_<code>v4siv4di2): Likewise.
(avx2_pbroadcast<mode>): Likewise.
(avx2_permvarv8si): Likewise.
(avx2_permv4df): Likewise.
(avx2_permvarv8sf): Likewise.
(avx2_permv4di): Likewise.
(avx2_permv2ti): Likewise.
(avx2_vec_dupv4df): Likewise.
(avx2_vbroadcasti128_<mode>): Likewise.
(avx2_vec_set_lo_v4di): Likewise.
(avx2_vec_set_hi_v4di): Likewise.
(avx_maskload<ssemodesuffix><avxsizesuffix>): Rename to ...
(<avx_avx2>_maskload<avx2modesuffix><avxmodesuffix>): ... this.
Update.
(avx_maskstore<ssemodesuffix><avxsizesuffix>): Likewise.
(<avx_avx2>_maskstore<avx2modesuffix><avxmodesuffix>): Likewise.
(*avx2_maskmov<avx2modesuffix><avxmodesuffix>): New.
(avx2_extracti128): Likewise.
(avx2_inserti128): Likewise.
(avx2_ashrvv8si): Likewise.
(avx2_ashrvv4si): Likewise.
(avx2_<lshift>vv8si): Likewise.
(avx2_<lshift>v<mode>): Likewise.
(avx2_<lshift>vv2di): Likewise.
(avx2_gathersi<mode>): Likewise.
(*avx2_gathersi<mode>): Likewise.
(avx2_gatherdi<mode>): Likewise.
(*avx2_gatherdi<mode>): Likewise.
(avx2_gatherdi<mode>256): Likewise.
(*avx2_gatherdi<mode>256): Likewise.
* doc/extend.texi: Document AVX2 built-in functions.
* doc/invoke.texi: Document -mavx2.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@177955 138bc75d-0d04-0410-961f-82ee72b054a4

gcc/ChangeLog
gcc/config.gcc
gcc/config/i386/avx2intrin.h [new file with mode: 0644]
gcc/config/i386/i386-builtin-types.def
gcc/config/i386/i386.c
gcc/config/i386/i386.md
gcc/config/i386/immintrin.h
gcc/config/i386/predicates.md
gcc/config/i386/sse.md
gcc/doc/extend.texi

index 76154f2..15c4e3b 100644 (file)
@@ -1,3 +1,426 @@
+2011-08-22  Kirill Yukhin  <kirill.yukhin@intel.com>
+
+       * config/i386/avx2intrin.h: New file.
+       * config/i386/i386-builtin-types.def (PCINT, PCINT64, PV4SI,
+       PV8SI, V32QI_FTYPE_V32QI, V32QI_FTYPE_V16QI, V16HI_FTYPE_V16HI,
+       V16HI_FTYPE_V8HI, V8SI_FTYPE_V8SI, V16HI_FTYPE_V16QI,
+       V8SI_FTYPE_V16QI, V4DI_FTYPE_V16QI, V8SI_FTYPE_V8HI,
+       V4DI_FTYPE_V8HI, V4DI_FTYPE_V4SI, V4DI_FTYPE_PV4DI,
+       V4DI_FTYPE_V2DI, V2DI_FTYPE_PCV2DI_V2DI, V4SI_FTYPE_PCV4SI_V4SI,
+       V32QI_FTYPE_V16HI_V16HI, V16HI_FTYPE_V8SI_V8SI,
+       V32QI_FTYPE_V32QI_V32QI, V16HI_FTYPE_V32QI_V32QI,
+       V16HI_FTYPE_V16HI_V8HI, V16HI_FTYPE_V16HI_V16HI,
+       V16HI_FTYPE_V16HI_INT, V16HI_FTYPE_V16HI_SI,
+       V16HI_FTYPE_V16HI_V16HI_INT, V32QI_FTYPE_V32QI_V32QI_INT,
+       V8SI_FTYPE_V8SI_V4SI, V8SI_FTYPE_V8SI_V8SI,
+       V8SI_FTYPE_V16HI_V16HI, V8SI_FTYPE_V8SI_INT, V8SI_FTYPE_V8SI_SI,
+       V8SI_FTYPE_PCV8SI_V8SI, V4DI_FTYPE_V4DI_V4DI,
+       V4DI_FTYPE_V8SI_V8SI, V4DI_FTYPE_V4DI_V2DI,
+       V4DI_FTYPE_PCV4DI_V4DI, V4DI_FTYPE_V4DI_INT,
+       V2DI_FTYPE_V4DI_INT, V4DI_FTYPE_V4DI_V4DI_INT,
+       V4DI_FTYPE_V4DI_V2DI_INT, VOID_FTYPE_PV2DI_V2DI_V2DI,
+       VOID_FTYPE_PV4DI_V4DI_V4DI, VOID_FTYPE_PV4SI_V4SI_V4SI,
+       VOID_FTYPE_PV8SI_V8SI_V8SI,
+       V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
+       V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
+       V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
+       V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
+       V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
+       V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
+       V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
+       V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
+       V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
+       V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
+       V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
+       V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
+       V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
+       V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
+       V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
+       V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
+       V16HI_FTYPE_V16HI_SI_COUNT, V16HI_FTYPE_V16HI_V8HI_COUNT,
+       V8SI_FTYPE_V8SI_SI_COUNT, V8SI_FTYPE_V8SI_V4SI_COUNT,
+       V4DI_FTYPE_V4DI_INT_COUNT, V4DI_FTYPE_V4DI_V2DI_COUNT,
+       V4DI_FTYPE_V4DI_INT_CONVERT,
+       V4DI_FTYPE_V4DI_V4DI_INT_CONVERT): New.
+       * config/i386/i386.c (ix86_builtins): Add IX86_BUILTIN_MPSADBW256,
+       IX86_BUILTIN_PABSB256, IX86_BUILTIN_PABSW256,
+       IX86_BUILTIN_PABSD256, IX86_BUILTIN_PACKSSDW256,
+       IX86_BUILTIN_PACKSSWB256, IX86_BUILTIN_PACKUSDW256,
+       IX86_BUILTIN_PACKUSWB256, IX86_BUILTIN_PADDB256,
+       IX86_BUILTIN_PADDW256, IX86_BUILTIN_PADDD256,
+       IX86_BUILTIN_PADDQ256, IX86_BUILTIN_PADDSB256,
+       IX86_BUILTIN_PADDSW256, IX86_BUILTIN_PADDUSB256,
+       IX86_BUILTIN_PADDUSW256, IX86_BUILTIN_PALIGNR256,
+       IX86_BUILTIN_AND256I, IX86_BUILTIN_ANDNOT256I,
+       IX86_BUILTIN_PAVGB256, IX86_BUILTIN_PAVGW256,
+       IX86_BUILTIN_PBLENDVB256, IX86_BUILTIN_PBLENDVW256,
+       IX86_BUILTIN_PCMPEQB256, IX86_BUILTIN_PCMPEQW256,
+       IX86_BUILTIN_PCMPEQD256, IX86_BUILTIN_PCMPEQQ256,
+       IX86_BUILTIN_PCMPGTB256, IX86_BUILTIN_PCMPGTW256,
+       IX86_BUILTIN_PCMPGTD256, IX86_BUILTIN_PCMPGTQ256,
+       IX86_BUILTIN_PHADDW256, IX86_BUILTIN_PHADDD256,
+       IX86_BUILTIN_PHADDSW256, IX86_BUILTIN_PHSUBW256,
+       IX86_BUILTIN_PHSUBD256, IX86_BUILTIN_PHSUBSW256,
+       IX86_BUILTIN_PMADDUBSW256, IX86_BUILTIN_PMADDWD256,
+       IX86_BUILTIN_PMAXSB256, IX86_BUILTIN_PMAXSW256,
+       IX86_BUILTIN_PMAXSD256, IX86_BUILTIN_PMAXUB256,
+       IX86_BUILTIN_PMAXUW256, IX86_BUILTIN_PMAXUD256,
+       IX86_BUILTIN_PMINSB256, IX86_BUILTIN_PMINSW256,
+       IX86_BUILTIN_PMINSD256, IX86_BUILTIN_PMINUB256,
+       IX86_BUILTIN_PMINUW256, IX86_BUILTIN_PMINUD256,
+       IX86_BUILTIN_PMOVMSKB256, IX86_BUILTIN_PMOVSXBW256,
+       IX86_BUILTIN_PMOVSXBD256, IX86_BUILTIN_PMOVSXBQ256,
+       IX86_BUILTIN_PMOVSXWD256, IX86_BUILTIN_PMOVSXWQ256,
+       IX86_BUILTIN_PMOVSXDQ256, IX86_BUILTIN_PMOVZXBW256,
+       IX86_BUILTIN_PMOVZXBD256, IX86_BUILTIN_PMOVZXBQ256,
+       IX86_BUILTIN_PMOVZXWD256, IX86_BUILTIN_PMOVZXWQ256,
+       IX86_BUILTIN_PMOVZXDQ256, IX86_BUILTIN_PMULDQ256,
+       IX86_BUILTIN_PMULHRSW256, IX86_BUILTIN_PMULHUW256,
+       IX86_BUILTIN_PMULHW256, IX86_BUILTIN_PMULLW256,
+       IX86_BUILTIN_PMULLD256, IX86_BUILTIN_PMULUDQ256,
+       IX86_BUILTIN_POR256, IX86_BUILTIN_PSADBW256,
+       IX86_BUILTIN_PSHUFB256, IX86_BUILTIN_PSHUFD256,
+       IX86_BUILTIN_PSHUFHW256, IX86_BUILTIN_PSHUFLW256,
+       IX86_BUILTIN_PSIGNB256, IX86_BUILTIN_PSIGNW256,
+       IX86_BUILTIN_PSIGND256, IX86_BUILTIN_PSLLDQI256,
+       IX86_BUILTIN_PSLLWI256, IX86_BUILTIN_PSLLW256,
+       IX86_BUILTIN_PSLLDI256, IX86_BUILTIN_PSLLD256,
+       IX86_BUILTIN_PSLLQI256, IX86_BUILTIN_PSLLQ256,
+       IX86_BUILTIN_PSRAWI256, IX86_BUILTIN_PSRAW256,
+       IX86_BUILTIN_PSRADI256, IX86_BUILTIN_PSRAD256,
+       IX86_BUILTIN_PSRLDQI256, IX86_BUILTIN_PSRLWI256,
+       IX86_BUILTIN_PSRLW256, IX86_BUILTIN_PSRLDI256,
+       IX86_BUILTIN_PSRLD256, IX86_BUILTIN_PSRLQI256,
+       IX86_BUILTIN_PSRLQ256, IX86_BUILTIN_PSUBB256,
+       IX86_BUILTIN_PSUBW256, IX86_BUILTIN_PSUBD256,
+       IX86_BUILTIN_PSUBQ256, IX86_BUILTIN_PSUBSB256,
+       IX86_BUILTIN_PSUBSW256, IX86_BUILTIN_PSUBUSB256,
+       IX86_BUILTIN_PSUBUSW256, IX86_BUILTIN_PUNPCKHBW256,
+       IX86_BUILTIN_PUNPCKHWD256, IX86_BUILTIN_PUNPCKHDQ256,
+       IX86_BUILTIN_PUNPCKHQDQ256, IX86_BUILTIN_PUNPCKLBW256,
+       IX86_BUILTIN_PUNPCKLWD256, IX86_BUILTIN_PUNPCKLDQ256,
+       IX86_BUILTIN_PUNPCKLQDQ256, IX86_BUILTIN_PXOR256,
+       IX86_BUILTIN_MOVNTDQA256, IX86_BUILTIN_VBROADCASTSS_PS,
+       IX86_BUILTIN_VBROADCASTSS_PS256,
+       IX86_BUILTIN_VBROADCASTSD_PD256,
+       IX86_BUILTIN_VBROADCASTSI256, IX86_BUILTIN_PBLENDD256,
+       IX86_BUILTIN_PBLENDD128, IX86_BUILTIN_PBROADCASTB256,
+       IX86_BUILTIN_PBROADCASTW256, IX86_BUILTIN_PBROADCASTD256,
+       IX86_BUILTIN_PBROADCASTQ256, IX86_BUILTIN_PBROADCASTB128,
+       IX86_BUILTIN_PBROADCASTW128, IX86_BUILTIN_PBROADCASTD128,
+       IX86_BUILTIN_PBROADCASTQ128, IX86_BUILTIN_VPERMVARSI256,
+       IX86_BUILTIN_VPERMDF256, IX86_BUILTIN_VPERMVARSF256,
+       IX86_BUILTIN_VPERMDI256, IX86_BUILTIN_VPERMTI256,
+       IX86_BUILTIN_VEXTRACT128I256, IX86_BUILTIN_VINSERT128I256,
+       IX86_BUILTIN_MASKLOADD, IX86_BUILTIN_MASKLOADQ,
+       IX86_BUILTIN_MASKLOADD256, IX86_BUILTIN_MASKLOADQ256,
+       IX86_BUILTIN_MASKSTORED, IX86_BUILTIN_MASKSTOREQ,
+       IX86_BUILTIN_MASKSTORED256, IX86_BUILTIN_MASKSTOREQ256,
+       IX86_BUILTIN_PSLLVV4DI, IX86_BUILTIN_PSLLVV2DI,
+       IX86_BUILTIN_PSLLVV8SI, IX86_BUILTIN_PSLLVV4SI,
+       IX86_BUILTIN_PSRAVV8SI, IX86_BUILTIN_PSRAVV4SI,
+       IX86_BUILTIN_PSRLVV4DI, IX86_BUILTIN_PSRLVV2DI,
+       IX86_BUILTIN_PSRLVV8SI, IX86_BUILTIN_PSRLVV4SI,
+       IX86_BUILTIN_GATHERSIV2DF, IX86_BUILTIN_GATHERSIV4DF,
+       IX86_BUILTIN_GATHERDIV2DF, IX86_BUILTIN_GATHERDIV4DF,
+       IX86_BUILTIN_GATHERSIV4SF, IX86_BUILTIN_GATHERSIV8SF,
+       IX86_BUILTIN_GATHERDIV4SF, IX86_BUILTIN_GATHERDIV8SF,
+       IX86_BUILTIN_GATHERSIV2DI, IX86_BUILTIN_GATHERSIV4DI,
+       IX86_BUILTIN_GATHERDIV2DI, IX86_BUILTIN_GATHERDIV4DI,
+       IX86_BUILTIN_GATHERSIV4SI, IX86_BUILTIN_GATHERSIV8SI,
+       IX86_BUILTIN_GATHERDIV4SI, IX86_BUILTIN_GATHERDIV8SI.
+       (bdesc_special_args): Add IX86_BUILTIN_MOVNTDQA256,
+       IX86_BUILTIN_MASKLOADD, IX86_BUILTIN_MASKLOADQ,
+       IX86_BUILTIN_MASKLOADD256, IX86_BUILTIN_MASKLOADQ256,
+       IX86_BUILTIN_MASKSTORED, IX86_BUILTIN_MASKSTOREQ,
+       IX86_BUILTIN_MASKSTORED256, IX86_BUILTIN_MASKSTOREQ256.
+       (bdesc_args): Add  IX86_BUILTIN_MPSADBW256,
+       IX86_BUILTIN_PABSB256, IX86_BUILTIN_PABSW256,
+       IX86_BUILTIN_PABSD256, IX86_BUILTIN_PACKSSDW256,
+       IX86_BUILTIN_PACKSSWB256, IX86_BUILTIN_PACKUSDW256,
+       IX86_BUILTIN_PACKUSWB256, IX86_BUILTIN_PADDB256,
+       IX86_BUILTIN_PADDW256, IX86_BUILTIN_PADDD256,
+       IX86_BUILTIN_PADDQ256, IX86_BUILTIN_PADDSB256,
+       IX86_BUILTIN_PADDSW256, IX86_BUILTIN_PADDUSB256,
+       IX86_BUILTIN_PADDUSW256, IX86_BUILTIN_PALIGNR256,
+       IX86_BUILTIN_AND256I, IX86_BUILTIN_ANDNOT256I,
+       IX86_BUILTIN_PAVGB256, IX86_BUILTIN_PAVGW256,
+       IX86_BUILTIN_PBLENDVB256, IX86_BUILTIN_PBLENDVW256,
+       IX86_BUILTIN_PCMPEQB256, IX86_BUILTIN_PCMPEQW256,
+       IX86_BUILTIN_PCMPEQD256, IX86_BUILTIN_PCMPEQQ256,
+       IX86_BUILTIN_PCMPGTB256, IX86_BUILTIN_PCMPGTW256,
+       IX86_BUILTIN_PCMPGTD256, IX86_BUILTIN_PCMPGTQ256,
+       IX86_BUILTIN_PHADDW256, IX86_BUILTIN_PHADDD256,
+       IX86_BUILTIN_PHADDSW256, IX86_BUILTIN_PHSUBW256,
+       IX86_BUILTIN_PHSUBD256, IX86_BUILTIN_PHSUBSW256,
+       IX86_BUILTIN_PMADDUBSW256, IX86_BUILTIN_PMADDWD256,
+       IX86_BUILTIN_PMAXSB256, IX86_BUILTIN_PMAXSW256,
+       IX86_BUILTIN_PMAXSD256, IX86_BUILTIN_PMAXUB256,
+       IX86_BUILTIN_PMAXUW256, IX86_BUILTIN_PMAXUD256,
+       IX86_BUILTIN_PMINSB256, IX86_BUILTIN_PMINSW256,
+       IX86_BUILTIN_PMINSD256, IX86_BUILTIN_PMINUB256,
+       IX86_BUILTIN_PMINUW256, IX86_BUILTIN_PMINUD256,
+       IX86_BUILTIN_PMOVMSKB256, IX86_BUILTIN_PMOVSXBW256,
+       IX86_BUILTIN_PMOVSXBD256, IX86_BUILTIN_PMOVSXBQ256,
+       IX86_BUILTIN_PMOVSXWD256, IX86_BUILTIN_PMOVSXWQ256,
+       IX86_BUILTIN_PMOVSXDQ256, IX86_BUILTIN_PMOVZXBW256,
+       IX86_BUILTIN_PMOVZXBD256, IX86_BUILTIN_PMOVZXBQ256,
+       IX86_BUILTIN_PMOVZXWD256, IX86_BUILTIN_PMOVZXWQ256,
+       IX86_BUILTIN_PMOVZXDQ256, IX86_BUILTIN_PMULDQ256,
+       IX86_BUILTIN_PMULHRSW256, IX86_BUILTIN_PMULHUW256,
+       IX86_BUILTIN_PMULHW256, IX86_BUILTIN_PMULLW256,
+       IX86_BUILTIN_PMULLD256, IX86_BUILTIN_PMULUDQ256,
+       IX86_BUILTIN_POR256, IX86_BUILTIN_PSADBW256,
+       IX86_BUILTIN_PSHUFB256, IX86_BUILTIN_PSHUFD256,
+       IX86_BUILTIN_PSHUFHW256, IX86_BUILTIN_PSHUFLW256,
+       IX86_BUILTIN_PSIGNB256, IX86_BUILTIN_PSIGNW256,
+       IX86_BUILTIN_PSIGND256, IX86_BUILTIN_PSLLDQI256,
+       IX86_BUILTIN_PSLLWI256, IX86_BUILTIN_PSLLW256,
+       IX86_BUILTIN_PSLLDI256, IX86_BUILTIN_PSLLD256,
+       IX86_BUILTIN_PSLLQI256, IX86_BUILTIN_PSLLQ256,
+       IX86_BUILTIN_PSRAWI256, IX86_BUILTIN_PSRAW256,
+       IX86_BUILTIN_PSRADI256, IX86_BUILTIN_PSRAD256,
+       IX86_BUILTIN_PSRLDQI256, IX86_BUILTIN_PSRLWI256,
+       IX86_BUILTIN_PSRLW256, IX86_BUILTIN_PSRLDI256,
+       IX86_BUILTIN_PSRLD256, IX86_BUILTIN_PSRLQI256,
+       IX86_BUILTIN_PSRLQ256, IX86_BUILTIN_PSUBB256,
+       IX86_BUILTIN_PSUBW256, IX86_BUILTIN_PSUBD256,
+       IX86_BUILTIN_PSUBQ256, IX86_BUILTIN_PSUBSB256,
+       IX86_BUILTIN_PSUBSW256, IX86_BUILTIN_PSUBUSB256,
+       IX86_BUILTIN_PSUBUSW256, IX86_BUILTIN_PUNPCKHBW256,
+       IX86_BUILTIN_PUNPCKHWD256, IX86_BUILTIN_PUNPCKHDQ256,
+       IX86_BUILTIN_PUNPCKHQDQ256, IX86_BUILTIN_PUNPCKLBW256,
+       IX86_BUILTIN_PUNPCKLWD256, IX86_BUILTIN_PUNPCKLDQ256,
+       IX86_BUILTIN_PUNPCKLQDQ256, IX86_BUILTIN_PXOR256,
+       IX86_BUILTIN_VBROADCASTSS_PS, IX86_BUILTIN_VBROADCASTSS_PS256,
+       IX86_BUILTIN_VBROADCASTSD_PD256,
+       IX86_BUILTIN_VBROADCASTSI256, IX86_BUILTIN_PBLENDD256,
+       IX86_BUILTIN_PBLENDD128, IX86_BUILTIN_PBROADCASTB256,
+       IX86_BUILTIN_PBROADCASTW256, IX86_BUILTIN_PBROADCASTD256,
+       IX86_BUILTIN_PBROADCASTQ256, IX86_BUILTIN_PBROADCASTB128,
+       IX86_BUILTIN_PBROADCASTW128, IX86_BUILTIN_PBROADCASTD128,
+       IX86_BUILTIN_PBROADCASTQ128, IX86_BUILTIN_VPERMVARSI256,
+       IX86_BUILTIN_VPERMDF256, IX86_BUILTIN_VPERMVARSF256,
+       IX86_BUILTIN_VPERMDI256, IX86_BUILTIN_VPERMTI256,
+       IX86_BUILTIN_VEXTRACT128I256, IX86_BUILTIN_VINSERT128I256,
+       IX86_BUILTIN_PSLLVV4DI, IX86_BUILTIN_PSLLVV2DI,
+       IX86_BUILTIN_PSLLVV8SI, IX86_BUILTIN_PSLLVV4SI,
+       IX86_BUILTIN_PSRAVV8SI, IX86_BUILTIN_PSRAVV4SI,
+       IX86_BUILTIN_PSRLVV4DI, IX86_BUILTIN_PSRLVV2DI,
+       IX86_BUILTIN_PSRLVV8SI, IX86_BUILTIN_PSRLVV4SI.
+       (ix86_init_mmx_sse_builtins): Add IX86_BUILTIN_GATHERSIV2DF,
+       IX86_BUILTIN_GATHERSIV4DF, IX86_BUILTIN_GATHERDIV2DF,
+       IX86_BUILTIN_GATHERDIV4DF, IX86_BUILTIN_GATHERSIV4SF,
+       IX86_BUILTIN_GATHERSIV8SF, IX86_BUILTIN_GATHERDIV4SF,
+       IX86_BUILTIN_GATHERDIV8SF, IX86_BUILTIN_GATHERSIV2DI,
+       IX86_BUILTIN_GATHERSIV4DI, IX86_BUILTIN_GATHERDIV2DI,
+       IX86_BUILTIN_GATHERDIV4DI, IX86_BUILTIN_GATHERSIV4SI,
+       IX86_BUILTIN_GATHERSIV8SI, IX86_BUILTIN_GATHERDIV4SI,
+       IX86_BUILTIN_GATHERDIV8SI.
+       (ix86_preferred_simd_mode): Support AVX2 modes.
+       (ix86_expand_args_builtin): Support AVX2 built-ins.
+       (ix86_expand_special_args_builtin): Likewise.
+       (ix86_expand_builtin): Likewise.
+       * config/i386/i386.md (UNSPEC_VPERMSI): New.
+       (UNSPEC_VPERMDF): Likewise.
+       (UNSPEC_VPERMSF): Likewise.
+       (UNSPEC_VPERMDI): Likewise.
+       (UNSPEC_VPERMTI): Likewise.
+       (UNSPEC_GATHER): Likewise.
+       (ssemodesuffix): Extend.
+       * config/i386/immintrin.h: Include avx2intrin.h when __AVX2__
+       is defined.
+       * config/i386/predicates.md (const1248_operand): New.
+       * config/i386/sse.md (VI_AVX2): New.
+       (VI1_AVX2): Likewise.
+       (VI2_AVX2): Likewise.
+       (VI4_AVX2): Likewise.
+       (VI8_AVX2): Likewise.
+       (VIMAX_AVX2): Likewise.
+       (SSESCALARMODE): Likewise.
+       (VI12_AVX2): Likewise.
+       (VI24_AVX2): Likewise.
+       (VI124_AVX2): Likewise.
+       (VI248_AVX2): Likewise.
+       (VI48_AVX2): Likewise.
+       (VI4SD_AVX2): Likewise.
+       (V48_AVX2): Likewise.
+       (avx2modesuffix): Likewise.
+       (sse_avx2): Likewise.
+       (sse2_avx2): Likewise.
+       (ssse3_avx2): Likewise.
+       (sse4_1_avx2): Likewise.
+       (avx_avx2): Likewise.
+       (lshift)<code_iterator>: Likewise.
+       (lshift_insn): Likewise.
+       (lshift)<code_attr>: Likewise.
+       (SSESHORTMODE): Likewise.
+       (SSELONGMODE): Likewise.
+       (SSEBYTEMODE): Likewise.
+       (AVXTOSSEMODE): Likewise.
+       (shortmode): Likewise.
+       (ssescalarmodesuffix): Update.
+       (sseunpackmode): Likewise.
+       (ssepackmode): Likewise.
+       (AVX256MODEI): New.
+       (AVX256MODE124): Likewise.
+       (AVX256MODE1248): Likewise.
+       (AVX256MODE248): Likewise.
+       (AVXMODE48P_SI): Likewise.
+       (AVXMODE48P_SI): Likewise.
+       (AVXMODE48P_DI): Likewise.
+       (AVXMODE48P_DI): Likewise.
+       (gthrfirstp): Likewise.
+       (gthrlastp): Likewise.
+       (avx2): Likewise.
+       (ssevecsize): Likewise.
+       (ssedoublesizemode): Likewise.
+       (avxvecmode): Likewise.
+       (avxvecsize): Likewise.
+       (avxhalfvecmode): Likewise.
+       (avxscalarmode): Likewise.
+       (avxpermvecmode): Likewise.
+       (avxmodesuffixp): Likewise.
+       (avxmodesuffix): Likewise.
+       (avx2_vec_dupv4sf): New.
+       (avx2_vec_dupv8sf): Likewise.
+       (avx2_interleave_highv4di): Likewise.
+       (avx2_interleave_lowv4di): Likewise.
+       (<plusminus_insn><mode>3): Update.
+       (*<plusminus_insn><mode>3): Likewise.
+       (sse2_<plusminus_insn><mode>3): Rename to ...
+       (<sse2_avx2>_<plusminus_insn><mode>3): ... this. Updated.
+       (*sse2_<plusminus_insn><mode>3): Likewise.
+       (*<sse2_avx2>_<plusminus_insn><mode>3): Likewise.
+       (mulv8hi3): Likewise.
+       (mul<mode>3): Likewise.
+       (*mulv8hi3): Likewise.
+       (*mul<mode>3): Likewise.
+       (<s>mulv8hi3_highpart): Likewise.
+       (<s>mul<mode>3_highpart): Likewise.
+       (*<s>mulv8hi3_highpart): Likewise.
+       (*<s>mul<mode>3_highpart): Likewise.
+       (avx2_umulv4siv4di3): Likewise.
+       (*avx_umulv4siv4di3): Likewise.
+       (sse4_1_mulv2siv2di3): Likewise.
+       (<sse4_1_avx2>_mul<shortmode><mode>3): Likewise.
+       (*sse4_1_mulv2siv2di3): Likewise.
+       (*<sse4_1_avx2>_mulv2siv2di3): Likewise.
+       (avx2_pmaddwd): New.
+       (*avx2_pmaddwd): Likewise.
+       (mulv4si3): Rename to ...
+       (mul<mode>3): ... this. Update.
+       (*sse4_1_mulv4si3): Likewise.
+       (*<sse4_1_avx2>_mul<mode>3): Likewise.
+       (ashr<mode>3): Update.
+       (avx2_lshrqv4di3): New.
+       (lshr<mode>3): Update.
+       (avx2_lshlqv4di3): New.
+       (avx2_lshl<mode>3): Likewise.
+       (sse2_ashlv1ti3): Rename to ...
+       (<sse2_avx2>_ashl<mode>3): ... this. Update.
+       (avx2_<code><mode>3)<umaxmin>: New.
+       (*avx2_<code><mode>3)<umaxmin>: Likewise.
+       (avx2_<code><mode>3)<smaxmin>: New.
+       (*avx2_<code><mode>3)<smaxmin>: Likewise.
+       (avx2_eq<mode>3): Likewise.
+       (*avx2_eq<mode>3): Likewise.
+       (avx2_gt<mode>3): Likewise.
+       (sse2_andnot<mode>3): Rename to ...
+       (<sse2_avx2>_andnot<mode>3): ... this. Update.
+       (*andnot<mode>3): Update.
+       (<code><mode>3)<any_logic>: Update.
+       (*<code><mode>3)<any_logic>: Likewise.
+       (sse2_packsswb): Rename to ...
+       (<sse2_avx2>_packsswb): ... this. Update.
+       (sse2_packssdw): Likewise.
+       (<sse2_avx2>_packssdw): Likewise.
+       (sse2_packuswb): Likewise.
+       (<sse2_avx2>_packuswb): Likewise.
+       (avx2_interleave_highv32qi): New.
+       (avx2_interleave_lowv32qi): Likewise.
+       (avx2_interleave_highv16hi): Likewise.
+       (avx2_interleave_lowv16hi): Likewise.
+       (avx2_interleave_highv8si): Likewise.
+       (avx2_interleave_lowv8si): Likewise.
+       (avx2_pshufd): New.
+       (avx2_pshufd_1): Likewise.
+       (avx2_pshuflwv3): Likewise.
+       (avx2_pshuflw_1): Likewise.
+       (avx2_pshufhwv3): Likewise.
+       (avx2_pshufhw_1): Likewise.
+       (avx2_uavgv32qi3): Likewise.
+       (*avx2_uavgv32qi3): Likewise.
+       (avx2_uavgv16hi3): Likewise.
+       (*avx2_uavgv16hi3): Likewise.
+       (sse2_psadbw): Rename to ...
+       (<sse2_avx2>_psadbw): ... this. Update.
+       (avx2_pmovmskb): New.
+       (avx2_phaddwv16hi3): Likewise.
+       (avx2_phadddv8si3): Likewise.
+       (avx2_phaddswv16hi3): Likewise.
+       (avx2_phsubwv16hi3): Likewise.
+       (avx2_phsubdv8si3): Likewise.
+       (avx2_phsubswv16hi3): Likewise.
+       (avx2_pmaddubsw256): Likewise.
+       (avx2_umulhrswv16hi3): Likewise.
+       (*avx2_umulhrswv16hi3): Likewise.
+       (ssse3_pshufbv16qi3): Rename to ...
+       (<ssse3_avx2>_pshufb<mode>3): ... this. Update.
+       (ssse3_psign<mode>3): Likewise.
+       (<ssse3_avx2>_psign<mode>3): Likewise.
+       (ssse3_palignrti): Likewise.
+       (<ssse3_avx2>_palignr<mode>): Likewise.
+       (abs<mode>2): Likewise.
+       (sse4_1_movntdqa): Rename to ...
+       (<sse4_1_avx2>_movntdqa): ... this. Update.
+       (sse4_1_mpsadbw): Likewise.
+       (<sse4_1_avx2>_mpsadbw): Likewise.
+       (avx2_packusdw): New.
+       (sse4_1_pblendvb): Rename to ...
+       (<sse4_1_avx2>_pblendvb): ... this. Update.
+       (sse4_1_pblendw): Likewise.
+       (<sse4_1_avx2>_pblendw): Likewise.
+       (avx2_pblendd<mode>): New.
+       (avx2_<code>v16qiv16hi2): Likewise.
+       (avx2_<code>v8qiv8si2): Likewise.
+       (avx2_<code>v8hiv8si2): Likewise.
+       (avx2_<code>v4qiv4di2): Likewise.
+       (avx2_<code>v4hiv4di2): Likewise.
+       (avx2_<code>v4siv4di2): Likewise.
+       (avx2_pbroadcast<mode>): Likewise.
+       (avx2_permvarv8si): Likewise.
+       (avx2_permv4df): Likewise.
+       (avx2_permvarv8sf): Likewise.
+       (avx2_permv4di): Likewise.
+       (avx2_permv2ti): Likewise.
+       (avx2_vec_dupv4df): Likewise.
+       (avx2_vbroadcasti128_<mode>): Likewise.
+       (avx2_vec_set_lo_v4di): Likewise.
+       (avx2_vec_set_hi_v4di): Likewise.
+       (avx_maskload<ssemodesuffix><avxsizesuffix>): Rename to ...
+       (<avx_avx2>_maskload<avx2modesuffix><avxmodesuffix>): ... this.
+       Update.
+       (avx_maskstore<ssemodesuffix><avxsizesuffix>): Likewise.
+       (<avx_avx2>_maskstore<avx2modesuffix><avxmodesuffix>): Likewise.
+       (*avx2_maskmov<avx2modesuffix><avxmodesuffix>): New.
+       (avx2_extracti128): Likewise.
+       (avx2_inserti128): Likewise.
+       (avx2_ashrvv8si): Likewise.
+       (avx2_ashrvv4si): Likewise.
+       (avx2_<lshift>vv8si): Likewise.
+       (avx2_<lshift>v<mode>): Likewise.
+       (avx2_<lshift>vv2di): Likewise.
+       (avx2_gathersi<mode>): Likewise.
+       (*avx2_gathersi<mode>): Likewise.
+       (avx2_gatherdi<mode>): Likewise.
+       (*avx2_gatherdi<mode>): Likewise.
+       (avx2_gatherdi<mode>256): Likewise.
+       (*avx2_gatherdi<mode>256): Likewise.
+       * doc/extend.texi: Document AVX2 built-in functions.
+       * doc/invoke.texi: Document -mavx2.
+
 2011-08-22  Matthias Klose <doko@debian.org>
 
        Revert:
index e8e0eeb..b8addaf 100644 (file)
@@ -352,7 +352,7 @@ i[34567]86-*-*)
                       nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h
                       immintrin.h x86intrin.h avxintrin.h xopintrin.h
                       ia32intrin.h cross-stdarg.h lwpintrin.h popcntintrin.h
-                      lzcntintrin.h bmiintrin.h tbmintrin.h"
+                      lzcntintrin.h bmiintrin.h tbmintrin.h avx2intrin.h"
        ;;
 x86_64-*-*)
        cpu_type=i386
@@ -364,7 +364,7 @@ x86_64-*-*)
                       nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h
                       immintrin.h x86intrin.h avxintrin.h xopintrin.h
                       ia32intrin.h cross-stdarg.h lwpintrin.h popcntintrin.h
-                      lzcntintrin.h bmiintrin.h tbmintrin.h"
+                      lzcntintrin.h bmiintrin.h tbmintrin.h avx2intrin.h"
        need_64bit_hwint=yes
        ;;
 ia64-*-*)
diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h
new file mode 100644 (file)
index 0000000..3c8f360
--- /dev/null
@@ -0,0 +1,1874 @@
+/* Copyright (C) 2011
+   Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+# error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+/* Sum absolute 8-bit integer difference of adjacent groups of 4
+   byte integers in the first 2 operands.  Starting offsets within
+   operands are determined by the 3rd mask operand.  */
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
+                                             (__v32qi)__Y, __M);
+}
+#else
+#define _mm256_mpsadbw_epu8(X, Y, M)                                   \
+  ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X),         \
+                                       (__v32qi)(__m256i)(Y), (int)(M)))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_epi8 (__m256i __A)
+{
+  return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_epi16 (__m256i __A)
+{
+  return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_epi32 (__m256i __A)
+{
+  return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_packs_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_packs_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_packus_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_packus_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddd256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddq256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_adds_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_adds_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_adds_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_adds_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
+{
+  return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
+                                             (__v4di)__B,
+                                             __N * 8);
+}
+#else
+/* In that case (__N*8) will be in vreg, and insn will not be matched. */
+/* Use define instead */
+#define _mm256_alignr_epi8(A, B, N)                               \
+  ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A),     \
+                                       (__v4di)(__m256i)(B),      \
+                                       (int)(N) * 8))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_and_si256 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_andsi256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_andnot_si256 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avg_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avg_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
+{
+  return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
+                                              (__v32qi)__Y,
+                                              (__v32qi)__M);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
+                                             (__v16hi)__Y,
+                                              __M);
+}
+#else
+#define _mm256_blend_epi16(X, Y, M)                                    \
+  ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X),         \
+                                       (__v16hi)(__m256i)(Y), (int)(M)))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpeqb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpeqw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpeqd256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpeqq256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpgtb256 ((__v32qi)__A,
+                                            (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpgtw256 ((__v16hi)__A,
+                                            (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpgtd256 ((__v8si)__A,
+                                            (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pcmpgtq256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hadd_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
+                                            (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hadd_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hadds_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
+                                             (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hsub_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
+                                            (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hsub_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
+                                             (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
+                                               (__v32qi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_madd_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
+                                            (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epu32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epu32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movemask_epi8 (__m256i __A)
+{
+  return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi8_epi16 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi8_epi32 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi8_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi16_epi32 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi16_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi32_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu8_epi16 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu8_epi32 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu8_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu16_epi32 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu16_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu32_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
+                                              (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mulhi_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mulhi_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mullo_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmullw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mullo_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmulld256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_epu32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_or_si256 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_por256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sad_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
+                                            (__v32qi)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_epi32 (__m256i __A, const int __mask)
+{
+  return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shufflehi_epi16 (__m256i __A, const int __mask)
+{
+  return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shufflelo_epi16 (__m256i __A, const int __mask)
+{
+  return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
+}
+#else
+#define _mm256_shuffle_epi32(A, N) \
+  ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
+#define _mm256_shufflehi_epi16(A, N) \
+  ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
+#define _mm256_shufflelo_epi16(A, N) \
+  ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sign_epi8 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sign_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sign_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_si256 (__m256i __A, const int __N)
+{
+  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
+}
+#else
+#define _mm256_slli_si256(A, N) \
+  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_epi16 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sll_epi16 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_epi32 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sll_epi32 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_epi64 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sll_epi64 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srai_epi16 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sra_epi16 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srai_epi32 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sra_epi32 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_si256 (__m256i __A, const int __N)
+{
+  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
+}
+#else
+#define _mm256_srli_si256(A, N) \
+  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
+#endif
+
+/* Logical (zero-filling) right shifts of packed 16-, 32- and 64-bit
+   elements: "srli" takes an immediate count, "srl" takes the count
+   from the low 64 bits of a 128-bit vector.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_epi16 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srl_epi16 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_epi32 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srl_epi32 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_epi64 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srl_epi64 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
+}
+
+/* Packed subtraction: wrapping "sub" for 8/16/32/64-bit elements, and
+   saturating "subs" for the 8/16-bit signed (epi) and unsigned (epu)
+   element types.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubd256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubq256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+/* Interleave (unpack) elements from the high or low half of each
+   128-bit lane of the two operands, for 8/16/32/64-bit elements.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
+}
+
+/* Bitwise XOR of two 256-bit integer vectors.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_xor_si256 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pxor256 ((__v4di)__A, (__v4di)__B);
+}
+
+/* Non-temporal (streaming) 32-byte load (VMOVNTDQA).  NOTE(review):
+   the underlying instruction requires a 32-byte aligned address --
+   callers must guarantee alignment of *__X.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_stream_load_si256 (__m256i const *__X)
+{
+  return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
+}
+
+/* Broadcast the lowest scalar element (or, for broadcastsi128, the
+   whole 128-bit vector) of the source into every position of the
+   destination.  */
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastss_ps (__m128 __X)
+{
+  return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastss_ps (__m128 __X)
+{
+  return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastsd_pd (__m128d __X)
+{
+  return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
+}
+
+/* NOTE(review): returns a 256-bit value despite the _mm_ name prefix;
+   the name follows the original AVX2 specification.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastsi128_si256 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
+}
+
+/* Select 32-bit elements from __X or __Y under the immediate bit mask
+   __M.  Macro form when not optimizing, since the builtin needs an
+   immediate operand.  */
+#ifdef __OPTIMIZE__
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
+{
+  return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
+                                             (__v4si)__Y,
+                                             __M);
+}
+#else
+#define _mm_blend_epi32(X, Y, M)                                       \
+  ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X),          \
+                                       (__v4si)(__m128i)(Y), (int)(M)))
+#endif
+
+/* 256-bit variant of the 32-bit element blend under immediate mask
+   __M (one mask bit per element).  */
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
+                                             (__v8si)__Y,
+                                             __M);
+}
+#else
+#define _mm256_blend_epi32(X, Y, M)                                    \
+  ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X),          \
+                                       (__v8si)(__m256i)(Y), (int)(M)))
+#endif
+
+/* Broadcast the lowest 8/16/32/64-bit element of __X to all elements
+   of a 256-bit (pbroadcast*256) or 128-bit (pbroadcast*128) result.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastb_epi8 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastw_epi16 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastd_epi32 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastq_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastb_epi8 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastw_epi16 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastd_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastq_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
+}
+
+/* Permute the eight 32-bit elements of __X across the full 256-bit
+   register, indexed by the corresponding element of __Y.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
+}
+
+/* Permute the four doubles of __X by the 2-bit-per-lane immediate
+   selector __M; macro form when the builtin's immediate cannot be
+   proven constant (not optimizing).  */
+#ifdef __OPTIMIZE__
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute4x64_pd (__m256d __X, const int __M)
+{
+  return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
+}
+#else
+#define _mm256_permute4x64_pd(X, M)                           \
+  ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
+#endif
+
+/* Permute the eight floats of __X across the full register, indexed
+   by the corresponding 32-bit element of __Y.  */
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutevar8x32_ps (__m256 __X, __m256 __Y)
+{
+  return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X,(__v8sf)__Y);
+}
+
+/* Permute the four 64-bit elements of __X by the 2-bit-per-lane
+   immediate selector __M; macro form when not optimizing.  */
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute4x64_epi64 (__m256i __X, const int __M)
+{
+  return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
+}
+#else
+#define _mm256_permute4x64_epi64(X, M)                        \
+  ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
+#endif
+
+
+/* Select two 128-bit lanes from the concatenation of __X and __Y
+   according to the immediate control __M.  */
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
+}
+#else
+#define _mm256_permute2x128_si256(X, Y, M)                             \
+  ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
+#endif
+
+/* Extract the low (__M == 0) or high (__M == 1) 128-bit lane of __X.  */
+#ifdef __OPTIMIZE__
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extracti128_si256 (__m256i __X, const int __M)
+{
+  return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
+}
+#else
+#define _mm256_extracti128_si256(X, M)                         \
+  ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
+#endif
+
+/* Insert the 128-bit vector __Y into the low (__M == 0) or high
+   (__M == 1) lane of __X.  */
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
+}
+#else
+#define _mm256_inserti128_si256(X, Y, M)                        \
+  ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \
+                                          (__v2di)(__m128i)(Y), \
+                                          (int)(M)))
+#endif
+
+/* Conditional loads: each 32/64-bit element is loaded from __X only
+   where the corresponding mask element in __M selects it (per the
+   VPMASKMOV semantics of the underlying builtin); other elements of
+   the result are zero.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskload_epi32 (int const *__X, __m256i __M )
+{
+  return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
+                                               (__v8si)__M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskload_epi64 (long long const *__X, __m256i __M )
+{
+  return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
+                                               (__v4di)__M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskload_epi32 (int const *__X, __m128i __M )
+{
+  return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
+                                            (__v4si)__M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskload_epi64 (long long const *__X, __m128i __M )
+{
+  return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
+                                            (__v2di)__M);
+}
+
+/* Conditional stores: each 32/64-bit element of __Y is written to
+   *__X only where the corresponding mask element in __M selects it;
+   other memory locations are left untouched.  */
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
+{
+  __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
+{
+  __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
+{
+  __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
+{
+  __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
+}
+
+/* Per-element variable-count shifts: each element of __X is shifted
+   by the count held in the corresponding element of __Y.  sllv =
+   logical left, srav = arithmetic right (32-bit only), srlv =
+   logical right.  */
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sllv_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sllv_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sllv_epi64 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sllv_epi64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srav_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srav_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srlv_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srlv_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srlv_epi64 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srlv_epi64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
+}
+
+/* Double-precision gathers.  i32gather uses 32-bit indices, i64gather
+   64-bit indices; each loads scale-spaced doubles from base[index].
+   The no-mask variants synthesize an all-selected mask: the 128-bit
+   forms via cmpeq(x,x) (all bits set), the 256-bit forms via
+   set1_pd(-1.0) -- presumably sufficient because the hardware mask is
+   per-element sign-bit based, and -1.0 has the sign bit set; confirm
+   against the instruction-set reference.  The gather builtins require
+   an immediate scale, hence the __OPTIMIZE__ guard (macro forms are
+   in the #else branch at the end of this section).  */
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_pd (double const *base, __m128i index, const int scale)
+{
+  __v2df src = _mm_setzero_pd ();
+  __v2df mask = _mm_cmpeq_pd (src, src);
+
+  return (__m128d) __builtin_ia32_gathersiv2df (src,
+                                               base,
+                                               (__v4si)index,
+                                               mask,
+                                               scale);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_pd (__m128d src, double const *base, __m128i index,
+                      __m128d mask, const int scale)
+{
+  return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)src,
+                                               base,
+                                               (__v4si)index,
+                                               (__v2df)mask,
+                                               scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_pd (double const *base, __m128i index, const int scale)
+{
+  __v4df src = _mm256_setzero_pd ();
+  __v4df mask = _mm256_set1_pd((double)(long long int) -1);
+
+  return (__m256d) __builtin_ia32_gathersiv4df (src,
+                                               base,
+                                               (__v4si)index,
+                                               mask,
+                                               scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_pd (__m256d src, double const *base,
+                         __m128i index, __m256d mask, const int scale)
+{
+  return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)src,
+                                               base,
+                                               (__v4si)index,
+                                               (__v4df)mask,
+                                               scale);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_pd (double const *base, __m128i index, const int scale)
+{
+  __v2df src = _mm_setzero_pd ();
+  __v2df mask = _mm_cmpeq_pd (src, src);
+
+  return (__m128d) __builtin_ia32_gatherdiv2df (src,
+                                               base,
+                                               (__v2di)index,
+                                               mask,
+                                               scale);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_pd (__m128d src, double const *base, __m128i index,
+                      __m128d mask, const int scale)
+{
+  return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)src,
+                                               base,
+                                               (__v2di)index,
+                                               (__v2df)mask,
+                                               scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_pd (double const *base, __m256i index, const int scale)
+{
+  __v4df src = _mm256_setzero_pd ();
+  __v4df mask = _mm256_set1_pd((double)(long long int) -1);
+
+  return (__m256d) __builtin_ia32_gatherdiv4df (src,
+                                               base,
+                                               (__v4di)index,
+                                               mask,
+                                               scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_pd (__m256d src, double const *base,
+                         __m256i index, __m256d mask, const int scale)
+{
+  return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)src,
+                                               base,
+                                               (__v4di)index,
+                                               (__v4df)mask,
+                                               scale);
+}
+
+/* Single-precision gathers.  Note that the 256-bit-index forms
+   (_mm256_i64gather_ps / _mm256_mask_i64gather_ps) return only a
+   128-bit vector: four 64-bit indices yield four floats.  */
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_ps (float const *base, __m128i index, const int scale)
+{
+  __v4sf src = _mm_setzero_ps ();
+  __v4sf mask = _mm_cmpeq_ps (src, src);
+
+  return (__m128) __builtin_ia32_gathersiv4sf (src,
+                                              base,
+                                              (__v4si)index,
+                                              mask,
+                                              scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_ps (__m128 src, float const *base, __m128i index,
+                      __m128 mask, const int scale)
+{
+  return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)src,
+                                              base,
+                                              (__v4si)index,
+                                              (__v4sf)mask,
+                                              scale);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_ps (float const *base, __m256i index, const int scale)
+{
+  __v8sf src = _mm256_setzero_ps ();
+  __v8sf mask = _mm256_set1_ps((float)(int) -1);
+
+  return (__m256) __builtin_ia32_gathersiv8sf (src,
+                                              base,
+                                              (__v8si)index,
+                                              mask,
+                                              scale);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_ps (__m256 src, float const *base,
+                         __m256i index, __m256 mask, const int scale)
+{
+  return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)src,
+                                              base,
+                                              (__v8si)index,
+                                              (__v8sf)mask,
+                                              scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_ps (float const *base, __m128i index, const int scale)
+{
+  __v4sf src = _mm_setzero_ps ();
+  __v4sf mask = _mm_cmpeq_ps (src, src);
+
+  return (__m128) __builtin_ia32_gatherdiv4sf (src,
+                                              base,
+                                              (__v2di)index,
+                                              mask,
+                                              scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_ps (__m128 src, float const *base, __m128i index,
+                      __m128 mask, const int scale)
+{
+  return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)src,
+                                               base,
+                                               (__v2di)index,
+                                               (__v4sf)mask,
+                                               scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_ps (float const *base, __m256i index, const int scale)
+{
+  __v4sf src = _mm_setzero_ps ();
+  __v4sf mask = _mm_cmpeq_ps (src, src);
+
+  return (__m128) __builtin_ia32_gatherdiv4sf256 (src,
+                                                 base,
+                                                 (__v4di)index,
+                                                 mask,
+                                                 scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_ps (__m128 src, float const *base,
+                         __m256i index, __m128 mask, const int scale)
+{
+  return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)src,
+                                                 base,
+                                                 (__v4di)index,
+                                                 (__v4sf)mask,
+                                                 scale);
+}
+
+/* 64-bit integer gathers.  The no-mask variants build zero src and
+   all-ones mask vectors directly with vector initializers.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_epi64 (long long int const *base,
+                    __m128i index, const int scale)
+{
+  __v2di src = __extension__ (__v2di){ 0, 0 };
+  __v2di mask = __extension__ (__v2di){ ~0, ~0 };
+
+  return (__m128i) __builtin_ia32_gathersiv2di (src,
+                                               base,
+                                               (__v4si)index,
+                                               mask,
+                                               scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_epi64 (__m128i src, long long int const *base,
+                         __m128i index, __m128i mask, const int scale)
+{
+  return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)src,
+                                               base,
+                                               (__v4si)index,
+                                               (__v2di)mask,
+                                               scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_epi64 (long long int const *base,
+                       __m128i index, const int scale)
+{
+  __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
+  __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
+
+  return (__m256i) __builtin_ia32_gathersiv4di (src,
+                                               base,
+                                               (__v4si)index,
+                                               mask,
+                                               scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_epi64 (__m256i src, long long int const *base,
+                            __m128i index, __m256i mask, const int scale)
+{
+  return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)src,
+                                               base,
+                                               (__v4si)index,
+                                               (__v4di)mask,
+                                               scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_epi64 (long long int const *base,
+                    __m128i index, const int scale)
+{
+  __v2di src = __extension__ (__v2di){ 0, 0 };
+  __v2di mask = __extension__ (__v2di){ ~0, ~0 };
+
+  return (__m128i) __builtin_ia32_gatherdiv2di (src,
+                                               base,
+                                               (__v2di)index,
+                                               mask,
+                                               scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_epi64 (__m128i src, long long int const *base, __m128i index,
+                         __m128i mask, const int scale)
+{
+  return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)src,
+                                               base,
+                                               (__v2di)index,
+                                               (__v2di)mask,
+                                               scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_epi64 (long long int const *base,
+                       __m256i index, const int scale)
+{
+  __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
+  __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
+
+  return (__m256i) __builtin_ia32_gatherdiv4di (src,
+                                               base,
+                                               (__v4di)index,
+                                               mask,
+                                               scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_epi64 (__m256i src, long long int const *base,
+                            __m256i index, __m256i mask, const int scale)
+{
+  return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)src,
+                                               base,
+                                               (__v4di)index,
+                                               (__v4di)mask,
+                                               scale);
+}
+
+/* 32-bit integer gathers.  As with ps, the 256-bit 64-bit-index forms
+   return only 128 bits: four qword indices yield four dwords.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_epi32 (int const *base, __m128i index, const int scale)
+{
+  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
+  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
+
+  return (__m128i) __builtin_ia32_gathersiv4si (src,
+                                              base,
+                                              (__v4si)index,
+                                              mask,
+                                              scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_epi32 (__m128i src, int const *base, __m128i index,
+                         __m128i mask, const int scale)
+{
+  return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)src,
+                                               base,
+                                               (__v4si)index,
+                                               (__v4si)mask,
+                                               scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_epi32 (int const *base, __m256i index, const int scale)
+{
+  __v8si src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
+  __v8si mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };
+
+  return (__m256i) __builtin_ia32_gathersiv8si (src,
+                                               base,
+                                               (__v8si)index,
+                                               mask,
+                                               scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_epi32 (__m256i src, int const *base,
+                            __m256i index, __m256i mask, const int scale)
+{
+  return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)src,
+                                               base,
+                                               (__v8si)index,
+                                               (__v8si)mask,
+                                               scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_epi32 (int const *base, __m128i index, const int scale)
+{
+  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
+  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
+
+  return (__m128i) __builtin_ia32_gatherdiv4si (src,
+                                               base,
+                                               (__v2di)index,
+                                               mask,
+                                               scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_epi32 (__m128i src, int const *base, __m128i index,
+                         __m128i mask, const int scale)
+{
+  return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)src,
+                                               base,
+                                               (__v2di)index,
+                                               (__v4si)mask,
+                                               scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_epi32 (int const *base, __m256i index, const int scale)
+{
+  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
+  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
+
+  return (__m128i) __builtin_ia32_gatherdiv4si256 (src,
+                                                 base,
+                                                 (__v4di)index,
+                                                 mask,
+                                                 scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_epi32 (__m128i src, int const *base,
+                            __m256i index, __m128i mask, const int scale)
+{
+  return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)src,
+                                                  base,
+                                                  (__v4di)index,
+                                                  (__v4si)mask,
+                                                  scale);
+}
+#else /* __OPTIMIZE__ */
+#define _mm_i32gather_pd(BASE, INDEX, SCALE)                           \
+  (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (),   \
+                                        (double const *)BASE,          \
+                                        (__v4si)(__m128i)INDEX,        \
+                                        (__v2df)_mm_set1_pd(           \
+                                          (double)(long long int) -1), \
+                                        (int)SCALE)
+
+#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)    \
+  (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC,  \
+                                        (double const *)BASE,   \
+                                        (__v4si)(__m128i)INDEX, \
+                                        (__v2df)(__m128d)MASK,  \
+                                        (int)SCALE)
+
+#define _mm256_i32gather_pd(BASE, INDEX, SCALE)                                \
+  (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (),        \
+                                        (double const *)BASE,          \
+                                        (__v4si)(__m128i)INDEX,        \
+                                        (__v4df)_mm256_set1_pd(        \
+                                          (double)(long long int) -1), \
+                                        (int)SCALE)
+
+#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)         \
+  (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC,  \
+                                        (double const *)BASE,   \
+                                        (__v4si)(__m128i)INDEX, \
+                                        (__v4df)(__m256d)MASK,  \
+                                        (int)SCALE)
+
+#define _mm_i64gather_pd(BASE, INDEX, SCALE)                           \
+  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (),   \
+                                        (double const *)BASE,          \
+                                        (__v2di)(__m128i)INDEX,        \
+                                        (__v2df)_mm_set1_pd(           \
+                                          (double)(long long int) -1), \
+                                        (int)SCALE)
+
+#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)    \
+  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC,  \
+                                        (double const *)BASE,   \
+                                        (__v2di)(__m128i)INDEX, \
+                                        (__v2df)(__m128d)MASK,  \
+                                        (int)SCALE)
+
+#define _mm256_i64gather_pd(BASE, INDEX, SCALE)                                \
+  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (),        \
+                                        (double const *)BASE,          \
+                                        (__v4di)(__m256i)INDEX,        \
+                                        (__v4df)_mm256_set1_pd(        \
+                                          (double)(long long int) -1), \
+                                        (int)SCALE)
+
+#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)         \
+  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC,  \
+                                        (double const *)BASE,   \
+                                        (__v4di)(__m256i)INDEX, \
+                                        (__v4df)(__m256d)MASK,  \
+                                        (int)SCALE)
+
+#define _mm_i32gather_ps(BASE, INDEX, SCALE)                           \
+  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (),    \
+                                       (float const *)BASE,            \
+                                       (__v4si)(__m128i)INDEX,         \
+                                       _mm_set1_ps ((float)(int) -1),  \
+                                       (int)SCALE)
+
+#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE)    \
+  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128)SRC,    \
+                                       (float const *)BASE,     \
+                                       (__v4si)(__m128i)INDEX,  \
+                                       (__v4sf)(__m128)MASK,    \
+                                       (int)SCALE)
+
+#define _mm256_i32gather_ps(BASE, INDEX, SCALE)                               \
+  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
+                                       (float const *)BASE,           \
+                                       (__v8si)(__m256i)INDEX,        \
+                                       (__v8sf)_mm256_set1_ps (       \
+                                         (float)(int) -1),            \
+                                       (int)SCALE)
+
+#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC,   \
+                                       (float const *)BASE,    \
+                                       (__v8si)(__m256i)INDEX, \
+                                       (__v8sf)(__m256)MASK,   \
+                                       (int)SCALE)
+
+#define _mm_i64gather_ps(BASE, INDEX, SCALE)                           \
+  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_ps (),    \
+                                       (float const *)BASE,            \
+                                       (__v2di)(__m128i)INDEX,         \
+                                       (__v4sf)_mm_set1_ps (           \
+                                         (float)(int) -1),             \
+                                       (int)SCALE)
+
+#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)    \
+  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC,    \
+                                       (float const *)BASE,     \
+                                       (__v2di)(__m128i)INDEX,  \
+                                       (__v4sf)(__m128)MASK,    \
+                                       (int)SCALE)
+
+#define _mm256_i64gather_ps(BASE, INDEX, SCALE)                                \
+  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), \
+                                          (float const *)BASE,         \
+                                          (__v4di)(__m256i)INDEX,      \
+                                          (__v4sf)_mm_set1_ps(         \
+                                            (float)(int) -1),          \
+                                          (int)SCALE)
+
+#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)           \
+  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC,           \
+                                          (float const *)BASE,    \
+                                          (__v4di)(__m256i)INDEX, \
+                                          (__v4sf)(__m128)MASK,   \
+                                          (int)SCALE)
+
+#define _mm_i32gather_epi64(BASE, INDEX, SCALE)                                \
+  (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \
+                                        (long long const *)BASE,       \
+                                        (__v4si)(__m128i)INDEX,        \
+                                        (__v2di)_mm_set1_epi64x (-1),  \
+                                        (int)SCALE)
+
+#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE)          \
+  (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC,   \
+                                        (long long const *)BASE, \
+                                        (__v4si)(__m128i)INDEX,  \
+                                        (__v2di)(__m128i)MASK,   \
+                                        (int)SCALE)
+
+#define _mm256_i32gather_epi64(BASE, INDEX, SCALE)                        \
+  (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \
+                                        (long long const *)BASE,          \
+                                        (__v4si)(__m128i)INDEX,           \
+                                        (__v4di)_mm256_set1_epi64x (-1),  \
+                                        (int)SCALE)
+
+#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC,    \
+                                        (long long const *)BASE,  \
+                                        (__v4si)(__m128i)INDEX,   \
+                                        (__v4di)(__m256i)MASK,    \
+                                        (int)SCALE)
+
+#define _mm_i64gather_epi64(BASE, INDEX, SCALE)                                \
+  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \
+                                        (long long const *)BASE,       \
+                                        (__v2di)(__m128i)INDEX,        \
+                                        (__v2di)_mm_set1_epi64x (-1),  \
+                                        (int)SCALE)
+
+#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE)          \
+  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i)SRC,   \
+                                        (long long const *)BASE, \
+                                        (__v2di)(__m128i)INDEX,  \
+                                        (__v2di)(__m128i)MASK,   \
+                                        (int)SCALE)
+
+#define _mm256_i64gather_epi64(BASE, INDEX, SCALE)                        \
+  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
+                                        (long long const *)BASE,          \
+                                        (__v4di)(__m256i)INDEX,           \
+                                        (__v4di)_mm256_set1_epi64x (-1),  \
+                                        (int)SCALE)
+
+#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC,    \
+                                        (long long const *)BASE,  \
+                                        (__v4di)(__m256i)INDEX,   \
+                                        (__v4di)(__m256i)MASK,    \
+                                        (int)SCALE)
+
+#define _mm_i32gather_epi32(BASE, INDEX, SCALE)                                \
+  (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (),        \
+                                        (int const *)BASE,             \
+                                        (__v4si)(__m128i)INDEX,        \
+                                        (__v4si)_mm_set1_epi32 (-1),   \
+                                        (int)SCALE)
+
+#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i)SRC, \
+                                       (int const *)BASE,      \
+                                       (__v4si)(__m128i)INDEX, \
+                                       (__v4si)(__m128i)MASK,  \
+                                       (int)SCALE)
+
+#define _mm256_i32gather_epi32(BASE, INDEX, SCALE)                        \
+  (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
+                                        (int const *)BASE,                \
+                                        (__v8si)(__m256i)INDEX,           \
+                                        (__v8si)_mm256_set1_epi32 (-1),   \
+                                        (int)SCALE)
+
+#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i)SRC,    \
+                                       (int const *)BASE,         \
+                                       (__v8si)(__m256i)INDEX,    \
+                                       (__v8si)(__m256i)MASK,     \
+                                       (int)SCALE)
+
+#define _mm_i64gather_epi32(BASE, INDEX, SCALE)                                \
+  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (),        \
+                                        (int const *)BASE,             \
+                                        (__v2di)(__m128i)INDEX,        \
+                                        (__v4si)_mm_set1_epi32 (-1),   \
+                                        (int)SCALE)
+
+#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i)SRC, \
+                                       (int const *)BASE,      \
+                                       (__v2di)(__m128i)INDEX, \
+                                       (__v4si)(__m128i)MASK,  \
+                                       (int)SCALE)
+
+#define _mm256_i64gather_epi32(BASE, INDEX, SCALE)                        \
+  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
+                                           (int const *)BASE,             \
+                                           (__v4di)(__m256i)INDEX,        \
+                                           (__v4si)_mm_set1_epi32(-1),    \
+                                           (int)SCALE)
+
+#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
+  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i)SRC,  \
+                                          (int const *)BASE,      \
+                                          (__v4di)(__m256i)INDEX, \
+                                          (__v4si)(__m128i)MASK,  \
+                                          (int)SCALE)
+#endif  /* __OPTIMIZE__ */
index d4b3e82..c4070e4 100644 (file)
@@ -102,6 +102,8 @@ DEF_VECTOR_TYPE (V32QI, QI)
 DEF_POINTER_TYPE (PCCHAR, CHAR, CONST)
 DEF_POINTER_TYPE (PCDOUBLE, DOUBLE, CONST)
 DEF_POINTER_TYPE (PCFLOAT, FLOAT, CONST)
+DEF_POINTER_TYPE (PCINT, INT, CONST)
+DEF_POINTER_TYPE (PCINT64, INT64, CONST)
 DEF_POINTER_TYPE (PCHAR, CHAR)
 DEF_POINTER_TYPE (PCVOID, VOID, CONST)
 DEF_POINTER_TYPE (PVOID, VOID)
@@ -119,6 +121,8 @@ DEF_POINTER_TYPE (PV4DF, V4DF)
 DEF_POINTER_TYPE (PV4DI, V4DI)
 DEF_POINTER_TYPE (PV4SF, V4SF)
 DEF_POINTER_TYPE (PV8SF, V8SF)
+DEF_POINTER_TYPE (PV4SI, V4SI)
+DEF_POINTER_TYPE (PV8SI, V8SI)
 
 DEF_POINTER_TYPE (PCV2DF, V2DF, CONST)
 DEF_POINTER_TYPE (PCV2SF, V2SF, CONST)
@@ -126,6 +130,11 @@ DEF_POINTER_TYPE (PCV4DF, V4DF, CONST)
 DEF_POINTER_TYPE (PCV4SF, V4SF, CONST)
 DEF_POINTER_TYPE (PCV8SF, V8SF, CONST)
 
+DEF_POINTER_TYPE (PCV2DI, V2DI, CONST)
+DEF_POINTER_TYPE (PCV4SI, V4SI, CONST)
+DEF_POINTER_TYPE (PCV4DI, V4DI, CONST)
+DEF_POINTER_TYPE (PCV8SI, V8SI, CONST)
+
 DEF_FUNCTION_TYPE (FLOAT128)
 DEF_FUNCTION_TYPE (UINT64)
 DEF_FUNCTION_TYPE (UNSIGNED)
@@ -141,6 +150,7 @@ DEF_FUNCTION_TYPE (INT, V4DF)
 DEF_FUNCTION_TYPE (INT, V4SF)
 DEF_FUNCTION_TYPE (INT, V8QI)
 DEF_FUNCTION_TYPE (INT, V8SF)
+DEF_FUNCTION_TYPE (INT, V32QI)
 DEF_FUNCTION_TYPE (INT64, INT64)
 DEF_FUNCTION_TYPE (INT64, V2DF)
 DEF_FUNCTION_TYPE (INT64, V4SF)
@@ -199,6 +209,11 @@ DEF_FUNCTION_TYPE (V8SF, V8SI)
 DEF_FUNCTION_TYPE (V8SF, V8HI)
 DEF_FUNCTION_TYPE (V8SI, V4SI)
 DEF_FUNCTION_TYPE (V8SI, V8SF)
+DEF_FUNCTION_TYPE (V32QI, V32QI)
+DEF_FUNCTION_TYPE (V32QI, V16QI)
+DEF_FUNCTION_TYPE (V16HI, V16HI)
+DEF_FUNCTION_TYPE (V16HI, V8HI)
+DEF_FUNCTION_TYPE (V8SI, V8SI)
 DEF_FUNCTION_TYPE (VOID, PCVOID)
 DEF_FUNCTION_TYPE (VOID, PVOID)
 DEF_FUNCTION_TYPE (VOID, UINT64)
@@ -206,6 +221,14 @@ DEF_FUNCTION_TYPE (VOID, UNSIGNED)
 DEF_FUNCTION_TYPE (INT, PUSHORT)
 DEF_FUNCTION_TYPE (INT, PUNSIGNED)
 DEF_FUNCTION_TYPE (INT, PULONGLONG)
+DEF_FUNCTION_TYPE (V16HI, V16QI)
+DEF_FUNCTION_TYPE (V8SI, V16QI)
+DEF_FUNCTION_TYPE (V4DI, V16QI)
+DEF_FUNCTION_TYPE (V8SI, V8HI)
+DEF_FUNCTION_TYPE (V4DI, V8HI)
+DEF_FUNCTION_TYPE (V4DI, V4SI)
+DEF_FUNCTION_TYPE (V4DI, PV4DI)
+DEF_FUNCTION_TYPE (V4DI, V2DI)
 
 DEF_FUNCTION_TYPE (DI, V2DI, INT)
 DEF_FUNCTION_TYPE (DOUBLE, V2DF, INT)
@@ -252,6 +275,7 @@ DEF_FUNCTION_TYPE (V2DI, V2DI, SI)
 DEF_FUNCTION_TYPE (V2DI, V2DI, V16QI)
 DEF_FUNCTION_TYPE (V2DI, V2DI, V2DI)
 DEF_FUNCTION_TYPE (V2DI, V4SI, V4SI)
+DEF_FUNCTION_TYPE (V2DI, PCV2DI, V2DI)
 DEF_FUNCTION_TYPE (V2SF, V2SF, V2SF)
 DEF_FUNCTION_TYPE (V2SI, INT, INT)
 DEF_FUNCTION_TYPE (V2SI, V2SF, V2SF)
@@ -284,6 +308,7 @@ DEF_FUNCTION_TYPE (V4SI, V4SI, SI)
 DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI)
 DEF_FUNCTION_TYPE (V4SI, V8HI, V8HI)
 DEF_FUNCTION_TYPE (V4SI, V8SI, INT)
+DEF_FUNCTION_TYPE (V4SI, PCV4SI, V4SI)
 DEF_FUNCTION_TYPE (V8HI, V16QI, V16QI)
 DEF_FUNCTION_TYPE (V8HI, V4SI, V4SI)
 DEF_FUNCTION_TYPE (V8HI, V8HI, INT)
@@ -297,6 +322,28 @@ DEF_FUNCTION_TYPE (V8SF, PCV8SF, V8SI)
 DEF_FUNCTION_TYPE (V8SF, V8SF, INT)
 DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF)
 DEF_FUNCTION_TYPE (V8SF, V8SF, V8SI)
+DEF_FUNCTION_TYPE (V32QI, V16HI, V16HI)
+DEF_FUNCTION_TYPE (V16HI, V8SI, V8SI)
+DEF_FUNCTION_TYPE (V32QI, V32QI, V32QI)
+DEF_FUNCTION_TYPE (V16HI, V32QI, V32QI)
+DEF_FUNCTION_TYPE (V16HI, V16HI, V8HI)
+DEF_FUNCTION_TYPE (V16HI, V16HI, V16HI)
+DEF_FUNCTION_TYPE (V16HI, V16HI, INT)
+DEF_FUNCTION_TYPE (V16HI, V16HI, SI)
+DEF_FUNCTION_TYPE (V16HI, V16HI, V16HI, INT)
+DEF_FUNCTION_TYPE (V32QI, V32QI, V32QI, INT)
+DEF_FUNCTION_TYPE (V8SI, V8SI, V4SI)
+DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI)
+DEF_FUNCTION_TYPE (V8SI, V16HI, V16HI)
+DEF_FUNCTION_TYPE (V8SI, V8SI, INT)
+DEF_FUNCTION_TYPE (V8SI, V8SI, SI)
+DEF_FUNCTION_TYPE (V8SI, PCV8SI, V8SI)
+DEF_FUNCTION_TYPE (V4DI, V4DI, V4DI)
+DEF_FUNCTION_TYPE (V4DI, V8SI, V8SI)
+DEF_FUNCTION_TYPE (V4DI, V4DI, V2DI)
+DEF_FUNCTION_TYPE (V4DI, PCV4DI, V4DI)
+DEF_FUNCTION_TYPE (V4DI, V4DI, INT)
+DEF_FUNCTION_TYPE (V2DI, V4DI, INT)
 DEF_FUNCTION_TYPE (VOID, PCHAR, V16QI)
 DEF_FUNCTION_TYPE (VOID, PCHAR, V32QI)
 DEF_FUNCTION_TYPE (VOID, PDOUBLE, V2DF)
@@ -351,11 +398,17 @@ DEF_FUNCTION_TYPE (V8SF, V8SF, V8SF, V8SI, INT)
 DEF_FUNCTION_TYPE (V8SI, V8SI, V4SI, INT)
 DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, INT)
 DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, V8SI)
+DEF_FUNCTION_TYPE (V4DI, V4DI, V4DI, INT)
+DEF_FUNCTION_TYPE (V4DI, V4DI, V2DI, INT)
 DEF_FUNCTION_TYPE (VOID, PCVOID, UNSIGNED, UNSIGNED)
 DEF_FUNCTION_TYPE (VOID, PV2DF, V2DI, V2DF)
 DEF_FUNCTION_TYPE (VOID, PV4DF, V4DI, V4DF)
 DEF_FUNCTION_TYPE (VOID, PV4SF, V4SI, V4SF)
 DEF_FUNCTION_TYPE (VOID, PV8SF, V8SI, V8SF)
+DEF_FUNCTION_TYPE (VOID, PV2DI, V2DI, V2DI)
+DEF_FUNCTION_TYPE (VOID, PV4DI, V4DI, V4DI)
+DEF_FUNCTION_TYPE (VOID, PV4SI, V4SI, V4SI)
+DEF_FUNCTION_TYPE (VOID, PV8SI, V8SI, V8SI)
 DEF_FUNCTION_TYPE (VOID, UINT, UINT, UINT)
 DEF_FUNCTION_TYPE (VOID, UINT64, UINT, UINT)
 DEF_FUNCTION_TYPE (VOID, V16QI, V16QI, PCHAR)
@@ -377,6 +430,23 @@ DEF_FUNCTION_TYPE (V16QI, V16QI, INT, V16QI, INT, INT)
 
 DEF_FUNCTION_TYPE (V8QI, QI, QI, QI, QI, QI, QI, QI, QI)
 
+DEF_FUNCTION_TYPE (V2DF, V2DF, PCDOUBLE, V4SI, V2DF, INT)
+DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V4SI, V4DF, INT)
+DEF_FUNCTION_TYPE (V2DF, V2DF, PCDOUBLE, V2DI, V2DF, INT)
+DEF_FUNCTION_TYPE (V4DF, V4DF, PCDOUBLE, V4DI, V4DF, INT)
+DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V4SI, V4SF, INT)
+DEF_FUNCTION_TYPE (V8SF, V8SF, PCFLOAT, V8SI, V8SF, INT)
+DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V2DI, V4SF, INT)
+DEF_FUNCTION_TYPE (V4SF, V4SF, PCFLOAT, V4DI, V4SF, INT)
+DEF_FUNCTION_TYPE (V2DI, V2DI, PCINT64, V4SI, V2DI, INT)
+DEF_FUNCTION_TYPE (V4DI, V4DI, PCINT64, V4SI, V4DI, INT)
+DEF_FUNCTION_TYPE (V2DI, V2DI, PCINT64, V2DI, V2DI, INT)
+DEF_FUNCTION_TYPE (V4DI, V4DI, PCINT64, V4DI, V4DI, INT)
+DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V4SI, V4SI, INT)
+DEF_FUNCTION_TYPE (V8SI, V8SI, PCINT, V8SI, V8SI, INT)
+DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V2DI, V4SI, INT)
+DEF_FUNCTION_TYPE (V4SI, V4SI, PCINT, V4DI, V4SI, INT)
+
 DEF_FUNCTION_TYPE_ALIAS (V2DF_FTYPE_V2DF, ROUND)
 DEF_FUNCTION_TYPE_ALIAS (V4DF_FTYPE_V4DF, ROUND)
 DEF_FUNCTION_TYPE_ALIAS (V4SF_FTYPE_V4SF, ROUND)
@@ -404,11 +474,19 @@ DEF_FUNCTION_TYPE_ALIAS (V2SI_FTYPE_V2SI_V2SI, COUNT)
 DEF_FUNCTION_TYPE_ALIAS (V4HI_FTYPE_V4HI_V4HI, COUNT)
 DEF_FUNCTION_TYPE_ALIAS (V4SI_FTYPE_V4SI_V4SI, COUNT)
 DEF_FUNCTION_TYPE_ALIAS (V8HI_FTYPE_V8HI_V8HI, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V16HI_FTYPE_V16HI_SI, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V16HI_FTYPE_V16HI_V8HI, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V8SI_FTYPE_V8SI_SI, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V8SI_FTYPE_V8SI_V4SI, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V4DI_FTYPE_V4DI_INT, COUNT)
+DEF_FUNCTION_TYPE_ALIAS (V4DI_FTYPE_V4DI_V2DI, COUNT)
 
 DEF_FUNCTION_TYPE_ALIAS (V2DF_FTYPE_V2DF_V2DF, SWAP)
 DEF_FUNCTION_TYPE_ALIAS (V4SF_FTYPE_V4SF_V4SF, SWAP)
 
+DEF_FUNCTION_TYPE_ALIAS (V4DI_FTYPE_V4DI_INT, CONVERT)
 DEF_FUNCTION_TYPE_ALIAS (V2DI_FTYPE_V2DI_INT, CONVERT)
+DEF_FUNCTION_TYPE_ALIAS (V4DI_FTYPE_V4DI_V4DI_INT, CONVERT)
 DEF_FUNCTION_TYPE_ALIAS (V2DI_FTYPE_V2DI_V2DI_INT, CONVERT)
 DEF_FUNCTION_TYPE_ALIAS (V1DI_FTYPE_V1DI_V1DI_INT, CONVERT)
 
index 7b7ac87..ef02673 100644 (file)
@@ -23867,6 +23867,180 @@ enum ix86_builtins
   IX86_BUILTIN_MOVMSKPD256,
   IX86_BUILTIN_MOVMSKPS256,
 
+  /* AVX2 */
+  IX86_BUILTIN_MPSADBW256,
+  IX86_BUILTIN_PABSB256,
+  IX86_BUILTIN_PABSW256,
+  IX86_BUILTIN_PABSD256,
+  IX86_BUILTIN_PACKSSDW256,
+  IX86_BUILTIN_PACKSSWB256,
+  IX86_BUILTIN_PACKUSDW256,
+  IX86_BUILTIN_PACKUSWB256,
+  IX86_BUILTIN_PADDB256,
+  IX86_BUILTIN_PADDW256,
+  IX86_BUILTIN_PADDD256,
+  IX86_BUILTIN_PADDQ256,
+  IX86_BUILTIN_PADDSB256,
+  IX86_BUILTIN_PADDSW256,
+  IX86_BUILTIN_PADDUSB256,
+  IX86_BUILTIN_PADDUSW256,
+  IX86_BUILTIN_PALIGNR256,
+  IX86_BUILTIN_AND256I,
+  IX86_BUILTIN_ANDNOT256I,
+  IX86_BUILTIN_PAVGB256,
+  IX86_BUILTIN_PAVGW256,
+  IX86_BUILTIN_PBLENDVB256,
+  IX86_BUILTIN_PBLENDVW256,
+  IX86_BUILTIN_PCMPEQB256,
+  IX86_BUILTIN_PCMPEQW256,
+  IX86_BUILTIN_PCMPEQD256,
+  IX86_BUILTIN_PCMPEQQ256,
+  IX86_BUILTIN_PCMPGTB256,
+  IX86_BUILTIN_PCMPGTW256,
+  IX86_BUILTIN_PCMPGTD256,
+  IX86_BUILTIN_PCMPGTQ256,
+  IX86_BUILTIN_PHADDW256,
+  IX86_BUILTIN_PHADDD256,
+  IX86_BUILTIN_PHADDSW256,
+  IX86_BUILTIN_PHSUBW256,
+  IX86_BUILTIN_PHSUBD256,
+  IX86_BUILTIN_PHSUBSW256,
+  IX86_BUILTIN_PMADDUBSW256,
+  IX86_BUILTIN_PMADDWD256,
+  IX86_BUILTIN_PMAXSB256,
+  IX86_BUILTIN_PMAXSW256,
+  IX86_BUILTIN_PMAXSD256,
+  IX86_BUILTIN_PMAXUB256,
+  IX86_BUILTIN_PMAXUW256,
+  IX86_BUILTIN_PMAXUD256,
+  IX86_BUILTIN_PMINSB256,
+  IX86_BUILTIN_PMINSW256,
+  IX86_BUILTIN_PMINSD256,
+  IX86_BUILTIN_PMINUB256,
+  IX86_BUILTIN_PMINUW256,
+  IX86_BUILTIN_PMINUD256,
+  IX86_BUILTIN_PMOVMSKB256,
+  IX86_BUILTIN_PMOVSXBW256,
+  IX86_BUILTIN_PMOVSXBD256,
+  IX86_BUILTIN_PMOVSXBQ256,
+  IX86_BUILTIN_PMOVSXWD256,
+  IX86_BUILTIN_PMOVSXWQ256,
+  IX86_BUILTIN_PMOVSXDQ256,
+  IX86_BUILTIN_PMOVZXBW256,
+  IX86_BUILTIN_PMOVZXBD256,
+  IX86_BUILTIN_PMOVZXBQ256,
+  IX86_BUILTIN_PMOVZXWD256,
+  IX86_BUILTIN_PMOVZXWQ256,
+  IX86_BUILTIN_PMOVZXDQ256,
+  IX86_BUILTIN_PMULDQ256,
+  IX86_BUILTIN_PMULHRSW256,
+  IX86_BUILTIN_PMULHUW256,
+  IX86_BUILTIN_PMULHW256,
+  IX86_BUILTIN_PMULLW256,
+  IX86_BUILTIN_PMULLD256,
+  IX86_BUILTIN_PMULUDQ256,
+  IX86_BUILTIN_POR256,
+  IX86_BUILTIN_PSADBW256,
+  IX86_BUILTIN_PSHUFB256,
+  IX86_BUILTIN_PSHUFD256,
+  IX86_BUILTIN_PSHUFHW256,
+  IX86_BUILTIN_PSHUFLW256,
+  IX86_BUILTIN_PSIGNB256,
+  IX86_BUILTIN_PSIGNW256,
+  IX86_BUILTIN_PSIGND256,
+  IX86_BUILTIN_PSLLDQI256,
+  IX86_BUILTIN_PSLLWI256,
+  IX86_BUILTIN_PSLLW256,
+  IX86_BUILTIN_PSLLDI256,
+  IX86_BUILTIN_PSLLD256,
+  IX86_BUILTIN_PSLLQI256,
+  IX86_BUILTIN_PSLLQ256,
+  IX86_BUILTIN_PSRAWI256,
+  IX86_BUILTIN_PSRAW256,
+  IX86_BUILTIN_PSRADI256,
+  IX86_BUILTIN_PSRAD256,
+  IX86_BUILTIN_PSRLDQI256,
+  IX86_BUILTIN_PSRLWI256,
+  IX86_BUILTIN_PSRLW256,
+  IX86_BUILTIN_PSRLDI256,
+  IX86_BUILTIN_PSRLD256,
+  IX86_BUILTIN_PSRLQI256,
+  IX86_BUILTIN_PSRLQ256,
+  IX86_BUILTIN_PSUBB256,
+  IX86_BUILTIN_PSUBW256,
+  IX86_BUILTIN_PSUBD256,
+  IX86_BUILTIN_PSUBQ256,
+  IX86_BUILTIN_PSUBSB256,
+  IX86_BUILTIN_PSUBSW256,
+  IX86_BUILTIN_PSUBUSB256,
+  IX86_BUILTIN_PSUBUSW256,
+  IX86_BUILTIN_PUNPCKHBW256,
+  IX86_BUILTIN_PUNPCKHWD256,
+  IX86_BUILTIN_PUNPCKHDQ256,
+  IX86_BUILTIN_PUNPCKHQDQ256,
+  IX86_BUILTIN_PUNPCKLBW256,
+  IX86_BUILTIN_PUNPCKLWD256,
+  IX86_BUILTIN_PUNPCKLDQ256,
+  IX86_BUILTIN_PUNPCKLQDQ256,
+  IX86_BUILTIN_PXOR256,
+  IX86_BUILTIN_MOVNTDQA256,
+  IX86_BUILTIN_VBROADCASTSS_PS,
+  IX86_BUILTIN_VBROADCASTSS_PS256,
+  IX86_BUILTIN_VBROADCASTSD_PD256,
+  IX86_BUILTIN_VBROADCASTSI256,
+  IX86_BUILTIN_PBLENDD256,
+  IX86_BUILTIN_PBLENDD128,
+  IX86_BUILTIN_PBROADCASTB256,
+  IX86_BUILTIN_PBROADCASTW256,
+  IX86_BUILTIN_PBROADCASTD256,
+  IX86_BUILTIN_PBROADCASTQ256,
+  IX86_BUILTIN_PBROADCASTB128,
+  IX86_BUILTIN_PBROADCASTW128,
+  IX86_BUILTIN_PBROADCASTD128,
+  IX86_BUILTIN_PBROADCASTQ128,
+  IX86_BUILTIN_VPERMVARSI256,
+  IX86_BUILTIN_VPERMDF256,
+  IX86_BUILTIN_VPERMVARSF256,
+  IX86_BUILTIN_VPERMDI256,
+  IX86_BUILTIN_VPERMTI256,
+  IX86_BUILTIN_VEXTRACT128I256,
+  IX86_BUILTIN_VINSERT128I256,
+  IX86_BUILTIN_MASKLOADD,
+  IX86_BUILTIN_MASKLOADQ,
+  IX86_BUILTIN_MASKLOADD256,
+  IX86_BUILTIN_MASKLOADQ256,
+  IX86_BUILTIN_MASKSTORED,
+  IX86_BUILTIN_MASKSTOREQ,
+  IX86_BUILTIN_MASKSTORED256,
+  IX86_BUILTIN_MASKSTOREQ256,
+  IX86_BUILTIN_PSLLVV4DI,
+  IX86_BUILTIN_PSLLVV2DI,
+  IX86_BUILTIN_PSLLVV8SI,
+  IX86_BUILTIN_PSLLVV4SI,
+  IX86_BUILTIN_PSRAVV8SI,
+  IX86_BUILTIN_PSRAVV4SI,
+  IX86_BUILTIN_PSRLVV4DI,
+  IX86_BUILTIN_PSRLVV2DI,
+  IX86_BUILTIN_PSRLVV8SI,
+  IX86_BUILTIN_PSRLVV4SI,
+
+  IX86_BUILTIN_GATHERSIV2DF,
+  IX86_BUILTIN_GATHERSIV4DF,
+  IX86_BUILTIN_GATHERDIV2DF,
+  IX86_BUILTIN_GATHERDIV4DF,
+  IX86_BUILTIN_GATHERSIV4SF,
+  IX86_BUILTIN_GATHERSIV8SF,
+  IX86_BUILTIN_GATHERDIV4SF,
+  IX86_BUILTIN_GATHERDIV8SF,
+  IX86_BUILTIN_GATHERSIV2DI,
+  IX86_BUILTIN_GATHERSIV4DI,
+  IX86_BUILTIN_GATHERDIV2DI,
+  IX86_BUILTIN_GATHERDIV4DI,
+  IX86_BUILTIN_GATHERSIV4SI,
+  IX86_BUILTIN_GATHERSIV8SI,
+  IX86_BUILTIN_GATHERDIV4SI,
+  IX86_BUILTIN_GATHERDIV8SI,
+
   /* TFmode support builtins.  */
   IX86_BUILTIN_INFQ,
   IX86_BUILTIN_HUGE_VALQ,
@@ -24362,6 +24536,17 @@ static const struct builtin_description bdesc_special_args[] =
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
   { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
 
+  /* AVX2 */
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
+
   { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
   { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
   { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
@@ -25026,6 +25211,154 @@ static const struct builtin_description bdesc_args[] =
   { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3,  "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
   { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3,  "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
 
+  /* AVX2 */
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256",  IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256",  IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256",  IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256",  IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv4di, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256",  IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256",  IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI  },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI  },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI  },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI  },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2  , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2  , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2  , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2  , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2  , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2  , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2  , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2  , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2  , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2  , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3  , "__builtin_ia32_pmuldq256"  , IX86_BUILTIN_PMULDQ256  , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256"  , IX86_BUILTIN_PMULHW256  , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256"  , IX86_BUILTIN_PMULLW256  , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256"  , IX86_BUILTIN_PMULLD256  , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3  , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlqv4di3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrqv4di3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI  },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN,  (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+
   { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt,   "__builtin_clzs",   IX86_BUILTIN_CLZS,    UNKNOWN,     (int) UINT16_FTYPE_UINT16 },
 
   /* BMI */
@@ -25415,6 +25748,71 @@ ix86_init_mmx_sse_builtins (void)
               "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
               IX86_BUILTIN_RDRAND64_STEP);
 
+  /* AVX2 */
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
+              V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
+              IX86_BUILTIN_GATHERSIV2DF);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
+              V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
+              IX86_BUILTIN_GATHERSIV4DF);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
+              V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
+              IX86_BUILTIN_GATHERDIV2DF);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
+              V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
+              IX86_BUILTIN_GATHERDIV4DF);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
+              V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
+              IX86_BUILTIN_GATHERSIV4SF);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
+              V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
+              IX86_BUILTIN_GATHERSIV8SF);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
+              V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
+              IX86_BUILTIN_GATHERDIV4SF);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
+              V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
+              IX86_BUILTIN_GATHERDIV8SF);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
+              V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
+              IX86_BUILTIN_GATHERSIV2DI);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
+              V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
+              IX86_BUILTIN_GATHERSIV4DI);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
+              V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
+              IX86_BUILTIN_GATHERDIV2DI);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
+              V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
+              IX86_BUILTIN_GATHERDIV4DI);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
+              V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
+              IX86_BUILTIN_GATHERSIV4SI);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
+              V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
+              IX86_BUILTIN_GATHERSIV8SI);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
+              V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
+              IX86_BUILTIN_GATHERDIV4SI);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
+              V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
+              IX86_BUILTIN_GATHERDIV8SI);
+
   /* MMX access to the vec_init patterns.  */
   def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
                     V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
@@ -26364,6 +26762,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
     case INT_FTYPE_V4DF:
     case INT_FTYPE_V4SF:
     case INT_FTYPE_V2DF:
+    case INT_FTYPE_V32QI:
     case V16QI_FTYPE_V16QI:
     case V8SI_FTYPE_V8SF:
     case V8SI_FTYPE_V4SI:
@@ -26407,6 +26806,18 @@ ix86_expand_args_builtin (const struct builtin_description *d,
     case V2SI_FTYPE_V2DF:
     case V2SF_FTYPE_V2SF:
     case V2SF_FTYPE_V2SI:
+    case V32QI_FTYPE_V32QI:
+    case V32QI_FTYPE_V16QI:
+    case V16HI_FTYPE_V16HI:
+    case V16HI_FTYPE_V8HI:
+    case V8SI_FTYPE_V8SI:
+    case V16HI_FTYPE_V16QI:
+    case V8SI_FTYPE_V16QI:
+    case V4DI_FTYPE_V16QI:
+    case V8SI_FTYPE_V8HI:
+    case V4DI_FTYPE_V8HI:
+    case V4DI_FTYPE_V4SI:
+    case V4DI_FTYPE_V2DI:
       nargs = 1;
       break;
     case V4SF_FTYPE_V4SF_VEC_MERGE:
@@ -26454,6 +26865,15 @@ ix86_expand_args_builtin (const struct builtin_description *d,
     case V1DI_FTYPE_V1DI_V1DI:
     case V1DI_FTYPE_V8QI_V8QI:
     case V1DI_FTYPE_V2SI_V2SI:
+    case V32QI_FTYPE_V16HI_V16HI:
+    case V16HI_FTYPE_V8SI_V8SI:
+    case V32QI_FTYPE_V32QI_V32QI:
+    case V16HI_FTYPE_V32QI_V32QI:
+    case V16HI_FTYPE_V16HI_V16HI:
+    case V8SI_FTYPE_V8SI_V8SI:
+    case V8SI_FTYPE_V16HI_V16HI:
+    case V4DI_FTYPE_V4DI_V4DI:
+    case V4DI_FTYPE_V8SI_V8SI:
       if (comparison == UNKNOWN)
        return ix86_expand_binop_builtin (icode, exp, target);
       nargs = 2;
@@ -26464,6 +26884,12 @@ ix86_expand_args_builtin (const struct builtin_description *d,
       nargs = 2;
       swap = true;
       break;
+    case V16HI_FTYPE_V16HI_V8HI_COUNT:
+    case V16HI_FTYPE_V16HI_SI_COUNT:
+    case V8SI_FTYPE_V8SI_V4SI_COUNT:
+    case V8SI_FTYPE_V8SI_SI_COUNT:
+    case V4DI_FTYPE_V4DI_V2DI_COUNT:
+    case V4DI_FTYPE_V4DI_INT_COUNT:
     case V8HI_FTYPE_V8HI_V8HI_COUNT:
     case V8HI_FTYPE_V8HI_SI_COUNT:
     case V4SI_FTYPE_V4SI_V4SI_COUNT:
@@ -26505,6 +26931,10 @@ ix86_expand_args_builtin (const struct builtin_description *d,
     case V2DI_FTYPE_V2DI_INT:
     case V2DF_FTYPE_V2DF_INT:
     case V2DF_FTYPE_V4DF_INT:
+    case V16HI_FTYPE_V16HI_INT:
+    case V8SI_FTYPE_V8SI_INT:
+    case V4DI_FTYPE_V4DI_INT:
+    case V2DI_FTYPE_V4DI_INT:
       nargs = 2;
       nargs_constant = 1;
       break;
@@ -26513,9 +26943,13 @@ ix86_expand_args_builtin (const struct builtin_description *d,
     case V4DF_FTYPE_V4DF_V4DF_V4DF:
     case V4SF_FTYPE_V4SF_V4SF_V4SF:
     case V2DF_FTYPE_V2DF_V2DF_V2DF:
+    case V32QI_FTYPE_V32QI_V32QI_V32QI:
       nargs = 3;
       break;
+    case V32QI_FTYPE_V32QI_V32QI_INT:
+    case V16HI_FTYPE_V16HI_V16HI_INT:
     case V16QI_FTYPE_V16QI_V16QI_INT:
+    case V4DI_FTYPE_V4DI_V4DI_INT:
     case V8HI_FTYPE_V8HI_V8HI_INT:
     case V8SI_FTYPE_V8SI_V8SI_INT:
     case V8SI_FTYPE_V8SI_V4SI_INT:
@@ -26526,10 +26960,16 @@ ix86_expand_args_builtin (const struct builtin_description *d,
     case V4DF_FTYPE_V4DF_V2DF_INT:
     case V4SF_FTYPE_V4SF_V4SF_INT:
     case V2DI_FTYPE_V2DI_V2DI_INT:
+    case V4DI_FTYPE_V4DI_V2DI_INT:
     case V2DF_FTYPE_V2DF_V2DF_INT:
       nargs = 3;
       nargs_constant = 1;
       break;
+    case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
+      nargs = 3;
+      rmode = V4DImode;
+      nargs_constant = 1;
+      break;
     case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
       nargs = 3;
       rmode = V2DImode;
@@ -26606,6 +27046,11 @@ ix86_expand_args_builtin (const struct builtin_description *d,
          if (!match)
            switch (icode)
              {
+             case CODE_FOR_avx2_inserti128:
+             case CODE_FOR_avx2_extracti128:
+               error ("the last argument must be a 1-bit immediate");
+               return const0_rtx;
+
              case CODE_FOR_sse4_1_roundpd:
              case CODE_FOR_sse4_1_roundps:
              case CODE_FOR_sse4_1_roundsd:
@@ -26759,6 +27204,7 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
       break;
     case UINT64_FTYPE_PUNSIGNED:
     case V2DI_FTYPE_PV2DI:
+    case V4DI_FTYPE_PV4DI:
     case V32QI_FTYPE_PCCHAR:
     case V16QI_FTYPE_PCCHAR:
     case V8SF_FTYPE_PCV4SF:
@@ -26798,6 +27244,10 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
     case V4DF_FTYPE_PCV4DF_V4DI:
     case V4SF_FTYPE_PCV4SF_V4SI:
     case V2DF_FTYPE_PCV2DF_V2DI:
+    case V8SI_FTYPE_PCV8SI_V8SI:
+    case V4DI_FTYPE_PCV4DI_V4DI:
+    case V4SI_FTYPE_PCV4SI_V4SI:
+    case V2DI_FTYPE_PCV2DI_V2DI:
       nargs = 2;
       klass = load;
       memory = 0;
@@ -26806,6 +27256,10 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
     case VOID_FTYPE_PV4DF_V4DI_V4DF:
     case VOID_FTYPE_PV4SF_V4SI_V4SF:
     case VOID_FTYPE_PV2DF_V2DI_V2DF:
+    case VOID_FTYPE_PV8SI_V8SI_V8SI:
+    case VOID_FTYPE_PV4DI_V4DI_V4DI:
+    case VOID_FTYPE_PV4SI_V4SI_V4SI:
+    case VOID_FTYPE_PV2DI_V2DI_V2DI:
       nargs = 2;
       klass = store;
       /* Reserve memory operand for target.  */
@@ -27062,9 +27516,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
   size_t i;
   enum insn_code icode;
   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
-  tree arg0, arg1, arg2;
-  rtx op0, op1, op2, pat;
-  enum machine_mode mode0, mode1, mode2;
+  tree arg0, arg1, arg2, arg3, arg4;
+  rtx op0, op1, op2, op3, op4, pat;
+  enum machine_mode mode0, mode1, mode2, mode3, mode4;
   unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
 
   /* Determine whether the builtin function is available under the current ISA.
@@ -27333,6 +27787,100 @@ rdrand_step:
                              gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
       return target;
 
+    case IX86_BUILTIN_GATHERSIV2DF:
+      icode = CODE_FOR_avx2_gathersiv2df;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERSIV4DF:
+      icode = CODE_FOR_avx2_gathersiv4df;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERDIV2DF:
+      icode = CODE_FOR_avx2_gatherdiv2df;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERDIV4DF:
+      icode = CODE_FOR_avx2_gatherdiv4df;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERSIV4SF:
+      icode = CODE_FOR_avx2_gathersiv4sf;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERSIV8SF:
+      icode = CODE_FOR_avx2_gathersiv8sf;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERDIV4SF:
+      icode = CODE_FOR_avx2_gatherdiv4sf;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERDIV8SF:
+      icode = CODE_FOR_avx2_gatherdiv4sf256;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERSIV2DI:
+      icode = CODE_FOR_avx2_gathersiv2di;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERSIV4DI:
+      icode = CODE_FOR_avx2_gathersiv4di;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERDIV2DI:
+      icode = CODE_FOR_avx2_gatherdiv2di;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERDIV4DI:
+      icode = CODE_FOR_avx2_gatherdiv4di;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERSIV4SI:
+      icode = CODE_FOR_avx2_gathersiv4si;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERSIV8SI:
+      icode = CODE_FOR_avx2_gathersiv8si;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERDIV4SI:
+      icode = CODE_FOR_avx2_gatherdiv4si;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERDIV8SI:
+      icode = CODE_FOR_avx2_gatherdiv4si256;
+
+    gather_gen:
+      arg0 = CALL_EXPR_ARG (exp, 0);
+      arg1 = CALL_EXPR_ARG (exp, 1);
+      arg2 = CALL_EXPR_ARG (exp, 2);
+      arg3 = CALL_EXPR_ARG (exp, 3);
+      arg4 = CALL_EXPR_ARG (exp, 4);
+      op0 = expand_normal (arg0);
+      op1 = expand_normal (arg1);
+      op2 = expand_normal (arg2);
+      op3 = expand_normal (arg3);
+      op4 = expand_normal (arg4);
+      /* Note the arg order is different from the operand order.  */
+      mode0 = insn_data[icode].operand[1].mode;
+      mode1 = insn_data[icode].operand[2].mode;
+      mode2 = insn_data[icode].operand[3].mode;
+      mode3 = insn_data[icode].operand[4].mode;
+      mode4 = insn_data[icode].operand[5].mode;
+
+      if (target == NULL_RTX)
+       target = gen_reg_rtx (insn_data[icode].operand[0].mode);
+
+      /* Force memory operand only with base register here.  But we
+        don't want to do it on memory operand for other builtin
+        functions.  */
+      op1 = force_reg (Pmode, op1);
+      op1 = gen_rtx_MEM (mode1, op1);
+
+      if (!insn_data[icode].operand[1].predicate (op0, mode0))
+       op0 = copy_to_mode_reg (mode0, op0);
+      if (!insn_data[icode].operand[2].predicate (op1, mode1))
+       op1 = copy_to_mode_reg (mode1, op1);
+      if (!insn_data[icode].operand[3].predicate (op2, mode2))
+       op2 = copy_to_mode_reg (mode2, op2);
+      if (!insn_data[icode].operand[4].predicate (op3, mode3))
+       op3 = copy_to_mode_reg (mode3, op3);
+      if (!insn_data[icode].operand[5].predicate (op4, mode4))
+       {
+          error ("the last argument must be scale 1, 2, 4, 8");
+          return const0_rtx;
+       }
+      pat = GEN_FCN (icode) (target, op0, op1, op2, op3, op4);
+      if (! pat)
+       return const0_rtx;
+      emit_insn (pat);
+      return target;
+
     default:
       break;
     }
@@ -35044,13 +35592,13 @@ ix86_preferred_simd_mode (enum machine_mode mode)
   switch (mode)
     {
     case QImode:
-      return V16QImode;
+      return TARGET_AVX2 ? V32QImode : V16QImode;
     case HImode:
-      return V8HImode;
+      return TARGET_AVX2 ? V16HImode : V8HImode;
     case SImode:
-      return V4SImode;
+      return TARGET_AVX2 ? V8SImode : V4SImode;
     case DImode:
-      return V2DImode;
+      return TARGET_AVX2 ? V4DImode : V2DImode;
 
     case SFmode:
       if (TARGET_AVX && !TARGET_PREFER_AVX128)
index b73d46f..d343fc2 100644 (file)
   UNSPEC_VCVTPH2PS
   UNSPEC_VCVTPS2PH
 
+  ;; For AVX2 support
+  UNSPEC_VPERMSI
+  UNSPEC_VPERMDF
+  UNSPEC_VPERMSF
+  UNSPEC_VPERMDI
+  UNSPEC_VPERMTI
+  UNSPEC_GATHER
+
   ;; For BMI support
   UNSPEC_BEXTR
 
   [(SF "ss") (DF "sd")
    (V8SF "ps") (V4DF "pd")
    (V4SF "ps") (V2DF "pd")
-   (V16QI "b") (V8HI "w") (V4SI "d") (V2DI "q")])
+   (V16QI "b") (V8HI "w") (V4SI "d") (V2DI "q")
+   (V32QI "b") (V16HI "w") (V8SI "d") (V4DI "q")])
 
 ;; SSE vector suffix for floating point modes
 (define_mode_attr ssevecmodesuffix [(SF "ps") (DF "pd")])
index 11a1a4e..3704df7 100644 (file)
 #include <avxintrin.h>
 #endif
 
+#ifdef __AVX2__
+#include <avx2intrin.h>
+#endif
+
 #ifdef __RDRND__
 extern __inline int
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
index bc0a357..b4fa04e 100644 (file)
   return i == 2 || i == 4 || i == 8;
 })
 
+;; Match 1, 2, 4, or 8
+(define_predicate "const1248_operand"
+  (match_code "const_int")
+{
+  HOST_WIDE_INT i = INTVAL (op);
+  return i == 1 || i == 2 || i == 4 || i == 8;
+})
+
 ;; Match 3, 5, or 9.  Used for leal multiplicands.
 (define_predicate "const359_operand"
   (match_code "const_int")
index e9f6c3d..5bc8586 100644 (file)
 (define_mode_iterator VI8
   [(V4DI "TARGET_AVX") V2DI])
 
+(define_mode_iterator VI1_AVX2
+  [(V32QI "TARGET_AVX2") V16QI])
+
+(define_mode_iterator VI2_AVX2
+  [(V16HI "TARGET_AVX2") V8HI])
+
+(define_mode_iterator VI4_AVX2
+  [(V8SI "TARGET_AVX2") V4SI])
+
+(define_mode_iterator VI8_AVX2
+  [(V4DI "TARGET_AVX2") V2DI])
+
+(define_mode_iterator VIMAX_AVX2
+  [(V4DI "TARGET_AVX2") V1TI])
+
+(define_mode_iterator SSESCALARMODE
+  [(V4DI "TARGET_AVX2") TI])
+
+(define_mode_iterator VI12_AVX2
+  [(V32QI "TARGET_AVX2") V16QI
+   (V16HI "TARGET_AVX2") V8HI])
+
+(define_mode_iterator VI24_AVX2
+  [(V16HI "TARGET_AVX2") V8HI
+   (V8SI "TARGET_AVX2") V4SI])
+
+(define_mode_iterator VI124_AVX2
+  [(V32QI "TARGET_AVX2") V16QI
+   (V16HI "TARGET_AVX2") V8HI
+   (V8SI "TARGET_AVX2") V4SI])
+
+(define_mode_iterator VI248_AVX2
+  [(V16HI "TARGET_AVX2") V8HI
+   (V8SI "TARGET_AVX2") V4SI
+   (V4DI "TARGET_AVX2") V2DI])
+
+(define_mode_iterator VI48_AVX2
+  [V8SI V4SI V4DI V2DI])
+
+(define_mode_iterator VI4SD_AVX2
+  [V4SI V4DI])
+
+(define_mode_iterator V48_AVX2
+  [(V4SF "TARGET_SSE") (V2DF "TARGET_SSE2")
+   (V8SF "TARGET_AVX") (V4DF "TARGET_AVX")
+   (V4SI "TARGET_AVX2") (V2DI "TARGET_AVX2")
+   (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")])
+
+(define_mode_attr sse2_avx2
+  [(V16QI "sse2") (V32QI "avx2")
+   (V8HI "sse2") (V16HI "avx2")
+   (V4SI "sse2") (V8SI "avx2")
+   (V2DI "sse2") (V4DI "avx2")
+   (V1TI "sse2")])
+
+(define_mode_attr ssse3_avx2
+   [(V16QI "ssse3") (V32QI "avx2")
+    (V8HI "ssse3") (V16HI "avx2")
+    (V4SI "ssse3") (V8SI "avx2")
+    (V2DI "ssse3") (V4DI "avx2")
+    (TI "ssse3")])
+
+(define_mode_attr sse4_1_avx2
+   [(V16QI "sse4_1") (V32QI "avx2")
+    (V8HI "sse4_1") (V16HI "avx2")
+    (V4SI "sse4_1") (V8SI "avx2")
+    (V2DI "sse4_1") (V4DI "avx2")])
+
+(define_mode_attr avx_avx2
+  [(V4SF "avx") (V2DF "avx")
+   (V8SF "avx") (V4DF "avx")
+   (V4SI "avx2") (V2DI "avx2")
+   (V8SI "avx2") (V4DI "avx2")])
+
+;; Mapping of logic-shift operators
+(define_code_iterator lshift [lshiftrt ashift])
+
+;; Base name for define_insn
+(define_code_attr lshift_insn [(lshiftrt "srl") (ashift "sll")])
+
+;; Base name for insn mnemonic
+(define_code_attr lshift [(lshiftrt "lshr") (ashift "lshl")])
+
+(define_mode_attr ssedoublemode
+  [(V16HI "V16SI") (V8HI "V8SI")])
+
+(define_mode_attr ssebytemode
+  [(V4DI "V32QI") (V2DI "V16QI")])
+
+(define_mode_attr shortmode
+  [(V4DI "v4si") (V2DI "v2si")])
+
 ;; All 128bit vector integer modes
 (define_mode_iterator VI_128 [V16QI V8HI V4SI V2DI])
 
+;; All 256bit vector integer modes
+(define_mode_iterator VI_256 [V32QI V16HI V8SI V4DI])
+
 ;; Random 128bit vector integer mode combinations
 (define_mode_iterator VI12_128 [V16QI V8HI])
 (define_mode_iterator VI14_128 [V16QI V4SI])
 (define_mode_iterator VI24_128 [V8HI V4SI])
 (define_mode_iterator VI248_128 [V8HI V4SI V2DI])
 
+;; Random 256bit vector integer mode combinations
+(define_mode_iterator VI124_256 [V32QI V16HI V8SI])
+(define_mode_iterator VI1248_256 [V32QI V16HI V8SI V4DI])
+(define_mode_iterator VI248_256 [V16HI V8SI V4DI])
+
 ;; Int-float size matches
 (define_mode_iterator VI4F_128 [V4SI V4SF])
 (define_mode_iterator VI8F_128 [V2DI V2DF])
   [(V32QI "OI") (V16HI "OI") (V8SI "OI") (V4DI "OI")
    (V16QI "TI") (V8HI "TI") (V4SI "TI") (V2DI "TI") (V1TI "TI")
    (V8SF "V8SF") (V4DF "V4DF")
-   (V4SF "V4SF") (V2DF "V2DF")])
+   (V4SF "V4SF") (V2DF "V2DF")
+   (TI "TI")])
 
 ;; Mapping of vector float modes to an integer mode of the same size
 (define_mode_attr sseintvecmode
   [(V8SF "V8SI") (V4DF "V4DI")
-   (V4SF "V4SI") (V2DF "V2DI")])
+   (V4SF "V4SI") (V2DF "V2DI")
+   (V8SI "V8SI") (V4DI "V4DI")
+   (V4SI "V4SI") (V2DI "V2DI")])
 
 ;; Mapping of vector modes to a vector mode of double size
 (define_mode_attr ssedoublevecmode
 
 ;; SSE scalar suffix for vector modes
 (define_mode_attr ssescalarmodesuffix
-  [(V8SF "ss") (V4DF "sd")
+  [(SF "ss") (DF "sd")
+   (V8SF "ss") (V4DF "sd")
    (V4SF "ss") (V2DF "sd")
    (V8SI "ss") (V4DI "sd")
    (V4SI "d")])
 
 ;; Pack/unpack vector modes
 (define_mode_attr sseunpackmode
-  [(V16QI "V8HI") (V8HI "V4SI") (V4SI "V2DI")])
+  [(V16QI "V8HI") (V8HI "V4SI") (V4SI "V2DI")
+   (V32QI "V16HI") (V16HI "V8SI") (V8SI "V4DI")])
 
 (define_mode_attr ssepackmode
-  [(V8HI "V16QI") (V4SI "V8HI") (V2DI "V4SI")])
+  [(V8HI "V16QI") (V4SI "V8HI") (V2DI "V4SI")
+   (V16HI "V32QI") (V8SI "V16HI") (V4DI "V8SI")])
 
 ;; Mapping of the max integer size for xop rotate immediate constraint
 (define_mode_attr sserotatemax
 ;; Instruction suffix for sign and zero extensions.
 (define_code_attr extsuffix [(sign_extend "sx") (zero_extend "zx")])
 
-
-
 ;; Mix-n-match
 (define_mode_iterator AVX256MODE2P [V8SI V8SF V4DF])
 
+(define_mode_iterator AVXMODE48P_DI
+                     [V2DI V2DF V4DI V4DF V4SF V4SI])
+(define_mode_attr AVXMODE48P_DI
+                     [(V2DI "V2DI") (V2DF "V2DI")
+                      (V4DI "V4DI") (V4DF "V4DI")
+                      (V4SI "V2DI") (V4SF "V2DI")
+                      (V8SI "V4DI") (V8SF "V4DI")])
+(define_mode_attr gthrfirstp
+                     [(V2DI "p") (V2DF "")
+                      (V4DI "p") (V4DF "")
+                      (V4SI "p") (V4SF "")
+                      (V8SI "p") (V8SF "")])
+(define_mode_attr gthrlastp
+                     [(V2DI "q") (V2DF "pd")
+                      (V4DI "q") (V4DF "pd")
+                      (V4SI "d") (V4SF "ps")
+                      (V8SI "d") (V8SF "ps")])
+
 (define_mode_iterator FMAMODE [SF DF V4SF V2DF V8SF V4DF])
 
 ;; Mapping of immediate bits for blend instructions
     case 1:
     case 2:
       switch (get_attr_mode (insn))
-        {
+       {
        case MODE_V8SF:
        case MODE_V4SF:
          if (TARGET_AVX
    (set_attr "prefix" "maybe_vex")
    (set (attr "mode")
        (cond [(ne (symbol_ref "TARGET_AVX") (const_int 0))
-                (const_string "<sseinsnmode>")
+                (const_string "<sseinsnmode>")
               (ior (ior
-                     (ne (symbol_ref "optimize_function_for_size_p (cfun)")
-                         (const_int 0))
+                     (ne (symbol_ref "optimize_function_for_size_p (cfun)")
+                         (const_int 0))
                      (eq (symbol_ref "TARGET_SSE2") (const_int 0)))
                    (and (eq_attr "alternative" "2")
                         (ne (symbol_ref "TARGET_SSE_TYPELESS_STORES")
       /* The DImode arrived in a pair of integral registers (e.g. %edx:%eax).
         Assemble the 64-bit DImode value in an xmm register.  */
       emit_insn (gen_sse2_loadld (operands[0], CONST0_RTX (V4SImode),
-                                 gen_rtx_SUBREG (SImode, operands[1], 0)));
+                                 gen_rtx_SUBREG (SImode, operands[1], 0)));
       emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode),
                                  gen_rtx_SUBREG (SImode, operands[1], 4)));
       emit_insn (gen_vec_interleave_lowv4si (operands[0], operands[0],
-                                            operands[2]));
+                                            operands[2]));
     }
  else if (memory_operand (operands[1], DImode))
    emit_insn (gen_vec_concatv2di (gen_lowpart (V2DImode, operands[0]),
-                                 operands[1], const0_rtx));
+                                 operands[1], const0_rtx));
  else
    gcc_unreachable ();
 })
 
 (define_expand "vcond<mode>"
   [(set (match_operand:VF 0 "register_operand" "")
-        (if_then_else:VF
-          (match_operator 3 ""
-            [(match_operand:VF 4 "nonimmediate_operand" "")
-             (match_operand:VF 5 "nonimmediate_operand" "")])
-          (match_operand:VF 1 "general_operand" "")
-          (match_operand:VF 2 "general_operand" "")))]
+       (if_then_else:VF
+         (match_operator 3 ""
+           [(match_operand:VF 4 "nonimmediate_operand" "")
+            (match_operand:VF 5 "nonimmediate_operand" "")])
+         (match_operand:VF 1 "general_operand" "")
+         (match_operand:VF 2 "general_operand" "")))]
   "TARGET_SSE"
 {
   bool ok = ix86_expand_fp_vcond (operands);
          (parallel [(const_int 2) (const_int 3)
                     (const_int 2) (const_int 3)])))
    (set (match_operand:V2DF 0 "register_operand" "")
-        (float:V2DF
+       (float:V2DF
          (vec_select:V2SI
          (match_dup 2)
            (parallel [(const_int 0) (const_int 1)]))))]
          (parallel [(const_int 4) (const_int 5)
                     (const_int 6) (const_int 7)])))
    (set (match_operand:V4DF 0 "register_operand" "")
-        (float:V4DF
+       (float:V4DF
          (match_dup 2)))]
   "TARGET_AVX"
   "operands[2] = gen_reg_rtx (V4SImode);")
          (parallel [(const_int 2) (const_int 3)
                     (const_int 2) (const_int 3)])))
    (set (match_dup 6)
-        (float:V2DF
+       (float:V2DF
          (vec_select:V2SI
          (match_dup 5)
            (parallel [(const_int 0) (const_int 1)]))))
   emit_insn (gen_sse2_cvttpd2dq (r1, operands[1]));
   emit_insn (gen_sse2_cvttpd2dq (r2, operands[2]));
   emit_insn (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]),
-                                        gen_lowpart (V2DImode, r1),
-                                        gen_lowpart (V2DImode, r2)));
+                                        gen_lowpart (V2DImode, r1),
+                                        gen_lowpart (V2DImode, r2)));
   DONE;
 })
 
   emit_insn (gen_sse2_cvtpd2dq (r1, operands[1]));
   emit_insn (gen_sse2_cvtpd2dq (r2, operands[2]));
   emit_insn (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]),
-                                        gen_lowpart (V2DImode, r1),
-                                        gen_lowpart (V2DImode, r2)));
+                                        gen_lowpart (V2DImode, r1),
+                                        gen_lowpart (V2DImode, r2)));
   DONE;
 })
 
     operands[1] = force_reg (SFmode, operands[1]);
 })
 
+(define_insn "avx2_vec_dupv4sf"
+  [(set (match_operand:V4SF 0 "register_operand" "=x")
+       (vec_duplicate:V4SF
+         (vec_select:SF
+           (match_operand:V4SF 1 "register_operand" "x")
+           (parallel [(const_int 0)]))))]
+  "TARGET_AVX2"
+  "vbroadcastss\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sselog1")
+    (set_attr "prefix" "vex")
+    (set_attr "mode" "V4SF")])
+
 (define_insn "*vec_dupv4sf_avx"
   [(set (match_operand:V4SF 0 "register_operand" "=x,x")
        (vec_duplicate:V4SF
    (set_attr "prefix" "vex")
    (set_attr "mode" "V4SF")])
 
+(define_insn "avx2_vec_dupv8sf"
+  [(set (match_operand:V8SF 0 "register_operand" "=x")
+       (vec_duplicate:V8SF
+         (vec_select:SF
+           (match_operand:V4SF 1 "register_operand" "x")
+           (parallel [(const_int 0)]))))]
+  "TARGET_AVX2"
+  "vbroadcastss\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V8SF")])
+
 (define_insn "*vec_dupv4sf"
   [(set (match_operand:V4SF 0 "register_operand" "=x")
        (vec_duplicate:V4SF
            (match_dup 3)
            (match_dup 4))
          (parallel [(const_int 0) (const_int 1)
-                    (const_int 4) (const_int 5)])))]
+                    (const_int 4) (const_int 5)])))]
  "TARGET_AVX"
 {
   operands[3] = gen_reg_rtx (V4DFmode);
 })
 
 ;; punpcklqdq and punpckhqdq are shorter than shufpd.
+(define_insn "avx2_interleave_highv4di"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+       (vec_select:V4DI
+         (vec_concat:V8DI
+           (match_operand:V4DI 1 "register_operand" "x")
+           (match_operand:V4DI 2 "nonimmediate_operand" "xm"))
+         (parallel [(const_int 1)
+                    (const_int 5)
+                    (const_int 3)
+                    (const_int 7)])))]
+  "TARGET_AVX2"
+  "vpunpckhqdq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
 
 (define_insn "vec_interleave_highv2di"
   [(set (match_operand:V2DI 0 "register_operand" "=x,x")
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "TI")])
 
+(define_insn "avx2_interleave_lowv4di"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+       (vec_select:V4DI
+         (vec_concat:V8DI
+           (match_operand:V4DI 1 "register_operand" "x")
+           (match_operand:V4DI 2 "nonimmediate_operand" "xm"))
+         (parallel [(const_int 0)
+                    (const_int 4)
+                    (const_int 2)
+                    (const_int 6)])))]
+  "TARGET_AVX2"
+  "vpunpcklqdq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "vec_interleave_lowv2di"
   [(set (match_operand:V2DI 0 "register_operand" "=x,x")
        (vec_select:V2DI
   "operands[2] = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));")
 
 (define_expand "<plusminus_insn><mode>3"
-  [(set (match_operand:VI_128 0 "register_operand" "")
-       (plusminus:VI_128
-         (match_operand:VI_128 1 "nonimmediate_operand" "")
-         (match_operand:VI_128 2 "nonimmediate_operand" "")))]
+  [(set (match_operand:VI 0 "register_operand" "")
+       (plusminus:VI
+         (match_operand:VI 1 "nonimmediate_operand" "")
+         (match_operand:VI 2 "nonimmediate_operand" "")))]
   "TARGET_SSE2"
   "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
 
 (define_insn "*<plusminus_insn><mode>3"
-  [(set (match_operand:VI_128 0 "register_operand" "=x,x")
-       (plusminus:VI_128
-         (match_operand:VI_128 1 "nonimmediate_operand" "<comm>0,x")
-         (match_operand:VI_128 2 "nonimmediate_operand" "xm,xm")))]
+  [(set (match_operand:VI 0 "register_operand" "=x,x")
+       (plusminus:VI
+         (match_operand:VI 1 "nonimmediate_operand" "<comm>0,x")
+         (match_operand:VI 2 "nonimmediate_operand" "xm,xm")))]
   "TARGET_SSE2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
   "@
    p<plusminus_mnemonic><ssemodesuffix>\t{%2, %0|%0, %2}
    (set_attr "type" "sseiadd")
    (set_attr "prefix_data16" "1,*")
    (set_attr "prefix" "orig,vex")
-   (set_attr "mode" "TI")])
+   (set_attr "mode" "<sseinsnmode>")])
 
-(define_expand "sse2_<plusminus_insn><mode>3"
-  [(set (match_operand:VI12_128 0 "register_operand" "")
-       (sat_plusminus:VI12_128
-         (match_operand:VI12_128 1 "nonimmediate_operand" "")
-         (match_operand:VI12_128 2 "nonimmediate_operand" "")))]
+(define_expand "<sse2_avx2>_<plusminus_insn><mode>3"
+  [(set (match_operand:VI12_AVX2 0 "register_operand" "")
+       (sat_plusminus:VI12_AVX2
+         (match_operand:VI12_AVX2 1 "nonimmediate_operand" "")
+         (match_operand:VI12_AVX2 2 "nonimmediate_operand" "")))]
   "TARGET_SSE2"
   "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
 
-(define_insn "*sse2_<plusminus_insn><mode>3"
-  [(set (match_operand:VI12_128 0 "register_operand" "=x,x")
-       (sat_plusminus:VI12_128
-         (match_operand:VI12_128 1 "nonimmediate_operand" "<comm>0,x")
-         (match_operand:VI12_128 2 "nonimmediate_operand" "xm,xm")))]
+(define_insn "*<sse2_avx2>_<plusminus_insn><mode>3"
+  [(set (match_operand:VI12_AVX2 0 "register_operand" "=x,x")
+       (sat_plusminus:VI12_AVX2
+         (match_operand:VI12_AVX2 1 "nonimmediate_operand" "<comm>0,x")
+         (match_operand:VI12_AVX2 2 "nonimmediate_operand" "xm,xm")))]
   "TARGET_SSE2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
   "@
    p<plusminus_mnemonic><ssemodesuffix>\t{%2, %0|%0, %2}
   DONE;
 })
 
-(define_expand "mulv8hi3"
-  [(set (match_operand:V8HI 0 "register_operand" "")
-       (mult:V8HI (match_operand:V8HI 1 "nonimmediate_operand" "")
-                  (match_operand:V8HI 2 "nonimmediate_operand" "")))]
+(define_expand "mul<mode>3"
+  [(set (match_operand:VI2_AVX2 0 "register_operand" "")
+       (mult:VI2_AVX2 (match_operand:VI2_AVX2 1 "nonimmediate_operand" "")
+                      (match_operand:VI2_AVX2 2 "nonimmediate_operand" "")))]
   "TARGET_SSE2"
-  "ix86_fixup_binary_operands_no_copy (MULT, V8HImode, operands);")
+  "ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);")
 
-(define_insn "*mulv8hi3"
-  [(set (match_operand:V8HI 0 "register_operand" "=x,x")
-       (mult:V8HI (match_operand:V8HI 1 "nonimmediate_operand" "%0,x")
-                  (match_operand:V8HI 2 "nonimmediate_operand" "xm,xm")))]
-  "TARGET_SSE2 && ix86_binary_operator_ok (MULT, V8HImode, operands)"
+(define_insn "*mul<mode>3"
+  [(set (match_operand:VI2_AVX2 0 "register_operand" "=x,x")
+       (mult:VI2_AVX2 (match_operand:VI2_AVX2 1 "nonimmediate_operand" "%0,x")
+                      (match_operand:VI2_AVX2 2 "nonimmediate_operand" "xm,xm")))]
+  "TARGET_SSE2 && ix86_binary_operator_ok (MULT, <MODE>mode, operands)"
   "@
    pmullw\t{%2, %0|%0, %2}
    vpmullw\t{%2, %1, %0|%0, %1, %2}"
    (set_attr "type" "sseimul")
    (set_attr "prefix_data16" "1,*")
    (set_attr "prefix" "orig,vex")
-   (set_attr "mode" "TI")])
+   (set_attr "mode" "<sseinsnmode>")])
 
-(define_expand "<s>mulv8hi3_highpart"
-  [(set (match_operand:V8HI 0 "register_operand" "")
-        (truncate:V8HI
-          (lshiftrt:V8SI
-            (mult:V8SI
-              (any_extend:V8SI
-                (match_operand:V8HI 1 "nonimmediate_operand" ""))
-              (any_extend:V8SI
-                (match_operand:V8HI 2 "nonimmediate_operand" "")))
-            (const_int 16))))]
+(define_expand "<s>mul<mode>3_highpart"
+  [(set (match_operand:VI2_AVX2 0 "register_operand" "")
+       (truncate:VI2_AVX2
+         (lshiftrt:<ssedoublemode>
+           (mult:<ssedoublemode>
+             (any_extend:<ssedoublemode>
+               (match_operand:VI2_AVX2 1 "nonimmediate_operand" ""))
+             (any_extend:<ssedoublemode>
+               (match_operand:VI2_AVX2 2 "nonimmediate_operand" "")))
+           (const_int 16))))]
   "TARGET_SSE2"
   "ix86_fixup_binary_operands_no_copy (MULT, V8HImode, operands);")
 
-(define_insn "*<s>mulv8hi3_highpart"
-  [(set (match_operand:V8HI 0 "register_operand" "=x,x")
-       (truncate:V8HI
-         (lshiftrt:V8SI
-           (mult:V8SI
-             (any_extend:V8SI
-               (match_operand:V8HI 1 "nonimmediate_operand" "%0,x"))
-             (any_extend:V8SI
-               (match_operand:V8HI 2 "nonimmediate_operand" "xm,xm")))
+(define_insn "*<s>mul<mode>3_highpart"
+  [(set (match_operand:VI2_AVX2 0 "register_operand" "=x,x")
+       (truncate:VI2_AVX2
+         (lshiftrt:<ssedoublemode>
+           (mult:<ssedoublemode>
+             (any_extend:<ssedoublemode>
+               (match_operand:VI2_AVX2 1 "nonimmediate_operand" "%0,x"))
+             (any_extend:<ssedoublemode>
+               (match_operand:VI2_AVX2 2 "nonimmediate_operand" "xm,xm")))
            (const_int 16))))]
-  "TARGET_SSE2 && ix86_binary_operator_ok (MULT, V8HImode, operands)"
+  "TARGET_SSE2 && ix86_binary_operator_ok (MULT, <MODE>mode, operands)"
   "@
    pmulh<u>w\t{%2, %0|%0, %2}
    vpmulh<u>w\t{%2, %1, %0|%0, %1, %2}"
    (set_attr "type" "sseimul")
    (set_attr "prefix_data16" "1,*")
    (set_attr "prefix" "orig,vex")
-   (set_attr "mode" "TI")])
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_expand "avx2_umulv4siv4di3"
+  [(set (match_operand:V4DI 0 "register_operand" "")
+       (mult:V4DI
+         (zero_extend:V4DI
+           (vec_select:V4SI
+             (match_operand:V8SI 1 "nonimmediate_operand" "")
+             (parallel [(const_int 0) (const_int 2)
+                        (const_int 4) (const_int 6)])))
+         (zero_extend:V4DI
+           (vec_select:V4SI
+             (match_operand:V8SI 2 "nonimmediate_operand" "")
+             (parallel [(const_int 0) (const_int 2)
+                        (const_int 4) (const_int 6)])))))]
+  "TARGET_AVX2"
+  "ix86_fixup_binary_operands_no_copy (MULT, V8SImode, operands);")
+
+(define_insn "*avx2_umulv4siv4di3"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+       (mult:V4DI
+         (zero_extend:V4DI
+           (vec_select:V4SI
+             (match_operand:V8SI 1 "nonimmediate_operand" "%x")
+             (parallel [(const_int 0) (const_int 2)
+                        (const_int 4) (const_int 6)])))
+         (zero_extend:V4DI
+           (vec_select:V4SI
+             (match_operand:V8SI 2 "nonimmediate_operand" "xm")
+             (parallel [(const_int 0) (const_int 2)
+                        (const_int 4) (const_int 6)])))))]
+  "TARGET_AVX2 && ix86_binary_operator_ok (MULT, V8SImode, operands)"
+  "vpmuludq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseimul")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
 
 (define_expand "sse2_umulv2siv2di3"
   [(set (match_operand:V2DI 0 "register_operand" "")
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "TI")])
 
+(define_expand "avx2_mulv4siv4di3"
+  [(set (match_operand:V4DI 0 "register_operand" "")
+       (mult:V4DI
+         (sign_extend:V4DI
+           (vec_select:V4SI
+             (match_operand:V8SI 1 "nonimmediate_operand" "")
+             (parallel [(const_int 0) (const_int 2)
+                        (const_int 4) (const_int 6)])))
+         (sign_extend:V4DI
+           (vec_select:V4SI
+             (match_operand:V8SI 2 "nonimmediate_operand" "")
+             (parallel [(const_int 0) (const_int 2)
+                        (const_int 4) (const_int 6)])))))]
+  "TARGET_AVX2"
+  "ix86_fixup_binary_operands_no_copy (MULT, V8SImode, operands);")
+
+(define_insn "*avx2_mulv4siv4di3"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+       (mult:V4DI
+         (sign_extend:V4DI
+           (vec_select:V4SI
+             (match_operand:V8SI 1 "nonimmediate_operand" "x")
+             (parallel [(const_int 0) (const_int 2)
+                        (const_int 4) (const_int 6)])))
+         (sign_extend:V4DI
+           (vec_select:V4SI
+             (match_operand:V8SI 2 "nonimmediate_operand" "xm")
+             (parallel [(const_int 0) (const_int 2)
+                        (const_int 4) (const_int 6)])))))]
+  "TARGET_AVX2 && ix86_binary_operator_ok (MULT, V8SImode, operands)"
+  "vpmuldq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseimul")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_expand "sse4_1_mulv2siv2di3"
   [(set (match_operand:V2DI 0 "register_operand" "")
        (mult:V2DI
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "TI")])
 
+(define_expand "avx2_pmaddwd"
+  [(set (match_operand:V8SI 0 "register_operand" "")
+       (plus:V8SI
+         (mult:V8SI
+           (sign_extend:V8SI
+             (vec_select:V8HI
+               (match_operand:V16HI 1 "nonimmediate_operand" "")
+               (parallel [(const_int 0)
+                          (const_int 2)
+                          (const_int 4)
+                          (const_int 6)
+                          (const_int 8)
+                          (const_int 10)
+                          (const_int 12)
+                          (const_int 14)])))
+           (sign_extend:V8SI
+             (vec_select:V8HI
+               (match_operand:V16HI 2 "nonimmediate_operand" "")
+               (parallel [(const_int 0)
+                          (const_int 2)
+                          (const_int 4)
+                          (const_int 6)
+                          (const_int 8)
+                          (const_int 10)
+                          (const_int 12)
+                          (const_int 14)]))))
+         (mult:V8SI
+           (sign_extend:V8SI
+             (vec_select:V8HI (match_dup 1)
+               (parallel [(const_int 1)
+                          (const_int 3)
+                          (const_int 5)
+                          (const_int 7)
+                          (const_int 9)
+                          (const_int 11)
+                          (const_int 13)
+                          (const_int 15)])))
+           (sign_extend:V8SI
+             (vec_select:V8HI (match_dup 2)
+               (parallel [(const_int 1)
+                          (const_int 3)
+                          (const_int 5)
+                          (const_int 7)
+                          (const_int 9)
+                          (const_int 11)
+                          (const_int 13)
+                          (const_int 15)]))))))]
+  "TARGET_AVX2"
+  "ix86_fixup_binary_operands_no_copy (MULT, V16HImode, operands);")
+
 (define_expand "sse2_pmaddwd"
   [(set (match_operand:V4SI 0 "register_operand" "")
        (plus:V4SI
   "TARGET_SSE2"
   "ix86_fixup_binary_operands_no_copy (MULT, V8HImode, operands);")
 
+;; AVX2 vpmaddwd: multiply packed signed 16-bit elements and add each
+;; adjacent pair of 32-bit products.  Expressed as the sum of two V8SI
+;; multiplies: one of the even-indexed words of each V16HI input, one
+;; of the odd-indexed words.  The inputs commute ("%x" on operand 1).
+(define_insn "*avx2_pmaddwd"
+  [(set (match_operand:V8SI 0 "register_operand" "=x")
+       (plus:V8SI
+         (mult:V8SI
+           (sign_extend:V8SI
+             (vec_select:V8HI
+               (match_operand:V16HI 1 "nonimmediate_operand" "%x")
+               (parallel [(const_int 0)
+                          (const_int 2)
+                          (const_int 4)
+                          (const_int 6)
+                          (const_int 8)
+                          (const_int 10)
+                          (const_int 12)
+                          (const_int 14)])))
+           (sign_extend:V8SI
+             (vec_select:V8HI
+               (match_operand:V16HI 2 "nonimmediate_operand" "xm")
+               (parallel [(const_int 0)
+                          (const_int 2)
+                          (const_int 4)
+                          (const_int 6)
+                          (const_int 8)
+                          (const_int 10)
+                          (const_int 12)
+                          (const_int 14)]))))
+         (mult:V8SI
+           (sign_extend:V8SI
+             (vec_select:V8HI (match_dup 1)
+               (parallel [(const_int 1)
+                          (const_int 3)
+                          (const_int 5)
+                          (const_int 7)
+                          (const_int 9)
+                          (const_int 11)
+                          (const_int 13)
+                          (const_int 15)])))
+           (sign_extend:V8SI
+             (vec_select:V8HI (match_dup 2)
+               (parallel [(const_int 1)
+                          (const_int 3)
+                          (const_int 5)
+                          (const_int 7)
+                          (const_int 9)
+                          (const_int 11)
+                          (const_int 13)
+                          (const_int 15)]))))))]
+  "TARGET_AVX2 && ix86_binary_operator_ok (MULT, V16HImode, operands)"
+  "vpmaddwd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "*sse2_pmaddwd"
   [(set (match_operand:V4SI 0 "register_operand" "=x,x")
        (plus:V4SI
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "TI")])
 
-(define_expand "mulv4si3"
-  [(set (match_operand:V4SI 0 "register_operand" "")
-       (mult:V4SI (match_operand:V4SI 1 "register_operand" "")
-                  (match_operand:V4SI 2 "register_operand" "")))]
+;; Vector multiply-low expander, widened from V4SI-only to the
+;; VI4_AVX2 iterator so it also serves V8SI under AVX2.  When a real
+;; pmulld/vpmulld insn exists, canonicalize the commutative operands;
+;; otherwise the sse2_mulv4si3 insn_and_split below does the work.
+(define_expand "mul<mode>3"
+  [(set (match_operand:VI4_AVX2 0 "register_operand" "")
+       (mult:VI4_AVX2 (match_operand:VI4_AVX2 1 "register_operand" "")
+                      (match_operand:VI4_AVX2 2 "register_operand" "")))]
   "TARGET_SSE2"
 {
   if (TARGET_SSE4_1 || TARGET_AVX)
-    ix86_fixup_binary_operands_no_copy (MULT, V4SImode, operands);
+    ix86_fixup_binary_operands_no_copy (MULT, <MODE>mode, operands);
 })
 
-(define_insn "*sse4_1_mulv4si3"
-  [(set (match_operand:V4SI 0 "register_operand" "=x,x")
-       (mult:V4SI (match_operand:V4SI 1 "nonimmediate_operand" "%0,x")
-                  (match_operand:V4SI 2 "nonimmediate_operand" "xm,xm")))]
-  "TARGET_SSE4_1 && ix86_binary_operator_ok (MULT, V4SImode, operands)"
+(define_insn "*<sse4_1_avx2>_mul<mode>3"
+  [(set (match_operand:VI4_AVX2 0 "register_operand" "=x,x")
+       (mult:VI4_AVX2 (match_operand:VI4_AVX2 1 "nonimmediate_operand" "%0,x")
+                      (match_operand:VI4_AVX2 2 "nonimmediate_operand" "xm,xm")))]
+  "TARGET_SSE4_1 && ix86_binary_operator_ok (MULT, <MODE>mode, operands)"
   "@
    pmulld\t{%2, %0|%0, %2}
    vpmulld\t{%2, %1, %0|%0, %1, %2}"
    (set_attr "type" "sseimul")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "orig,vex")
-   (set_attr "mode" "TI")])
+   (set_attr "mode" "<sseinsnmode>")])
 
 (define_insn_and_split "*sse2_mulv4si3"
   [(set (match_operand:V4SI 0 "register_operand" "")
 
       /* Multiply low parts.  */
       emit_insn (gen_sse2_umulv2siv2di3 (t1, gen_lowpart (V4SImode, op1),
-                                        gen_lowpart (V4SImode, op2)));
+                                        gen_lowpart (V4SImode, op2)));
 
       /* Shift input vectors left 32 bits so we can multiply high parts.  */
       emit_insn (gen_lshrv2di3 (t2, op1, thirtytwo));
 })
 
 (define_insn "ashr<mode>3"
-  [(set (match_operand:VI24_128 0 "register_operand" "=x,x")
-       (ashiftrt:VI24_128
-         (match_operand:VI24_128 1 "register_operand" "0,x")
+  [(set (match_operand:VI24_AVX2 0 "register_operand" "=x,x")
+       (ashiftrt:VI24_AVX2
+         (match_operand:VI24_AVX2 1 "register_operand" "0,x")
          (match_operand:SI 2 "nonmemory_operand" "xN,xN")))]
   "TARGET_SSE2"
   "@
        (const_string "0")))
    (set_attr "prefix_data16" "1,*")
    (set_attr "prefix" "orig,vex")
-   (set_attr "mode" "TI")])
+   (set_attr "mode" "<sseinsnmode>")])
+
+;; Whole-register byte shift right used by the AVX2 psrldq builtin;
+;; operand 2 is a bit count (multiple of 8) converted below to the
+;; byte count vpsrldq actually takes.
+;; NOTE(review): vpsrldq shifts bytes within each 128-bit lane
+;; independently, while (lshiftrt:V4DI ...) denotes a logical shift of
+;; each 64-bit element by the full count -- the RTL does not match the
+;; instruction's per-lane semantics; confirm nothing besides the
+;; builtin expander relies on this pattern's RTL meaning.
+(define_insn "avx2_lshrqv4di3"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+       (lshiftrt:V4DI
+        (match_operand:V4DI 1 "register_operand" "x")
+        (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n")))]
+  "TARGET_AVX2"
+{
+  operands[2] = GEN_INT (INTVAL (operands[2]) / 8);
+  return "vpsrldq\t{%2, %1, %0|%0, %1, %2}";
+}
+  [(set_attr "type" "sseishft")
+   (set_attr "prefix" "vex")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "OI")])
 
 (define_insn "lshr<mode>3"
-  [(set (match_operand:VI248_128 0 "register_operand" "=x,x")
-       (lshiftrt:VI248_128
-         (match_operand:VI248_128 1 "register_operand" "0,x")
+  [(set (match_operand:VI248_AVX2 0 "register_operand" "=x,x")
+       (lshiftrt:VI248_AVX2
+         (match_operand:VI248_AVX2 1 "register_operand" "0,x")
          (match_operand:SI 2 "nonmemory_operand" "xN,xN")))]
   "TARGET_SSE2"
   "@
        (const_string "0")))
    (set_attr "prefix_data16" "1,*")
    (set_attr "prefix" "orig,vex")
-   (set_attr "mode" "TI")])
+   (set_attr "mode" "<sseinsnmode>")])
+
+;; Whole-register byte shift left used by the AVX2 pslldq builtin;
+;; operand 2 is a bit count (multiple of 8) converted below to bytes.
+;; NOTE(review): vpslldq shifts bytes within each 128-bit lane
+;; independently, while (ashift:V4DI ...) denotes a per-64-bit-element
+;; shift -- same RTL/hardware mismatch as avx2_lshrqv4di3; confirm the
+;; pattern is only reached from the builtin expander.
+(define_insn "avx2_lshlqv4di3"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+       (ashift:V4DI (match_operand:V4DI 1 "register_operand" "x")
+                    (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n")))]
+  "TARGET_AVX2"
+{
+  operands[2] = GEN_INT (INTVAL (operands[2]) / 8);
+  return "vpslldq\t{%2, %1, %0|%0, %1, %2}";
+}
+  [(set_attr "type" "sseishft")
+   (set_attr "prefix" "vex")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "OI")])
+
+;; AVX2 per-element shift left (vpsllw/vpslld/vpsllq) on the 256-bit
+;; integer modes.  The count is either an immediate or the low 64 bits
+;; of an xmm register ("xN"); length_immediate is set only for the
+;; immediate form.
+(define_insn "avx2_lshl<mode>3"
+  [(set (match_operand:VI248_256 0 "register_operand" "=x")
+       (ashift:VI248_256
+         (match_operand:VI248_256 1 "register_operand" "x")
+         (match_operand:SI 2 "nonmemory_operand" "xN")))]
+  "TARGET_AVX2"
+  "vpsll<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseishft")
+   (set_attr "prefix" "vex")
+   (set (attr "length_immediate")
+     (if_then_else (match_operand 2 "const_int_operand" "")
+       (const_string "1")
+       (const_string "0")))
+   (set_attr "mode" "OI")])
 
 (define_insn "ashl<mode>3"
   [(set (match_operand:VI248_128 0 "register_operand" "=x,x")
 
 (define_expand "vec_shl_<mode>"
   [(set (match_operand:VI_128 0 "register_operand" "")
-        (ashift:V1TI
+       (ashift:V1TI
         (match_operand:VI_128 1 "register_operand" "")
         (match_operand:SI 2 "const_0_to_255_mul_8_operand" "")))]
   "TARGET_SSE2"
   operands[1] = gen_lowpart (V1TImode, operands[1]);
 })
 
-(define_insn "sse2_ashlv1ti3"
-  [(set (match_operand:V1TI 0 "register_operand" "=x,x")
-       (ashift:V1TI
-        (match_operand:V1TI 1 "register_operand" "0,x")
+(define_insn "<sse2_avx2>_ashl<mode>3"
+  [(set (match_operand:VIMAX_AVX2 0 "register_operand" "=x,x")
+       (ashift:VIMAX_AVX2
+        (match_operand:VIMAX_AVX2 1 "register_operand" "0,x")
         (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n,n")))]
   "TARGET_SSE2"
 {
    (set_attr "length_immediate" "1")
    (set_attr "prefix_data16" "1,*")
    (set_attr "prefix" "orig,vex")
-   (set_attr "mode" "TI")])
+   (set_attr "mode" "<sseinsnmode>")])
 
 (define_expand "vec_shr_<mode>"
   [(set (match_operand:VI_128 0 "register_operand" "")
-        (lshiftrt:V1TI
+       (lshiftrt:V1TI
         (match_operand:VI_128 1 "register_operand" "")
         (match_operand:SI 2 "const_0_to_255_mul_8_operand" "")))]
   "TARGET_SSE2"
   operands[1] = gen_lowpart (V1TImode, operands[1]);
 })
 
+;; 256-bit unsigned max/min expander (vpmaxu*/vpminu*); canonicalizes
+;; the commutative operands before the *avx2_<code><mode>3 insn
+;; pattern is matched.
+(define_expand "avx2_<code><mode>3"
+  [(set (match_operand:VI124_256 0 "register_operand" "")
+       (umaxmin:VI124_256
+         (match_operand:VI124_256 1 "nonimmediate_operand" "")
+         (match_operand:VI124_256 2 "nonimmediate_operand" "")))]
+  "TARGET_AVX2"
+  "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
+
+;; 256-bit unsigned element-wise max/min; the operation is commutative,
+;; hence "%x" on operand 1.
+(define_insn "*avx2_<code><mode>3"
+  [(set (match_operand:VI124_256 0 "register_operand" "=x")
+       (umaxmin:VI124_256
+         (match_operand:VI124_256 1 "nonimmediate_operand" "%x")
+         (match_operand:VI124_256 2 "nonimmediate_operand" "xm")))]
+  "TARGET_AVX2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+  "vp<maxmin_int><ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "sse2_lshrv1ti3"
   [(set (match_operand:V1TI 0 "register_operand" "=x,x")
-       (lshiftrt:V1TI
+       (lshiftrt:V1TI
         (match_operand:V1TI 1 "register_operand" "0,x")
         (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n,n")))]
   "TARGET_SSE2"
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "TI")])
 
+;; 256-bit signed max/min expander (vpmaxs*/vpmins*); canonicalizes
+;; the commutative operands before insn matching.
+(define_expand "avx2_<code><mode>3"
+  [(set (match_operand:VI124_256 0 "register_operand" "")
+       (smaxmin:VI124_256
+         (match_operand:VI124_256 1 "nonimmediate_operand" "")
+         (match_operand:VI124_256 2 "nonimmediate_operand" "")))]
+  "TARGET_AVX2"
+  "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
+
+;; 256-bit signed element-wise max/min; commutative ("%x").
+(define_insn "*avx2_<code><mode>3"
+  [(set (match_operand:VI124_256 0 "register_operand" "=x")
+       (smaxmin:VI124_256
+         (match_operand:VI124_256 1 "nonimmediate_operand" "%x")
+         (match_operand:VI124_256 2 "nonimmediate_operand" "xm")))]
+  "TARGET_AVX2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+  "vp<maxmin_int><ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "*sse4_1_<code><mode>3"
   [(set (match_operand:VI14_128 0 "register_operand" "=x,x")
        (smaxmin:VI14_128
 ;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+;; 256-bit integer equality compare expander (vpcmpeqb/w/d/q);
+;; canonicalizes the commutative operands before insn matching.
+(define_expand "avx2_eq<mode>3"
+  [(set (match_operand:VI1248_256 0 "register_operand" "")
+       (eq:VI1248_256
+         (match_operand:VI1248_256 1 "nonimmediate_operand" "")
+         (match_operand:VI1248_256 2 "nonimmediate_operand" "")))]
+  "TARGET_AVX2"
+  "ix86_fixup_binary_operands_no_copy (EQ, <MODE>mode, operands);")
+
+;; vpcmpeq*: element-wise equality; equal elements yield all-ones,
+;; others all-zeros.  Commutative ("%x").
+(define_insn "*avx2_eq<mode>3"
+  [(set (match_operand:VI1248_256 0 "register_operand" "=x")
+       (eq:VI1248_256
+         (match_operand:VI1248_256 1 "nonimmediate_operand" "%x")
+         (match_operand:VI1248_256 2 "nonimmediate_operand" "xm")))]
+  "TARGET_AVX2 && ix86_binary_operator_ok (EQ, <MODE>mode, operands)"
+  "vpcmpeq<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "*sse4_1_eqv2di3"
   [(set (match_operand:V2DI 0 "register_operand" "=x,x")
        (eq:V2DI
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "TI")])
 
+;; vpcmpgt*: element-wise signed greater-than.  Not commutative, so
+;; operand 1 must be a register (only operand 2 may be memory).
+(define_insn "avx2_gt<mode>3"
+  [(set (match_operand:VI1248_256 0 "register_operand" "=x")
+       (gt:VI1248_256
+         (match_operand:VI1248_256 1 "register_operand" "x")
+         (match_operand:VI1248_256 2 "nonimmediate_operand" "xm")))]
+  "TARGET_AVX2"
+  "vpcmpgt<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "sse2_gt<mode>3"
   [(set (match_operand:VI124_128 0 "register_operand" "=x,x")
        (gt:VI124_128
 
 (define_expand "vcond<mode>"
   [(set (match_operand:VI124_128 0 "register_operand" "")
-        (if_then_else:VI124_128
-          (match_operator 3 ""
-            [(match_operand:VI124_128 4 "nonimmediate_operand" "")
-             (match_operand:VI124_128 5 "nonimmediate_operand" "")])
-          (match_operand:VI124_128 1 "general_operand" "")
-          (match_operand:VI124_128 2 "general_operand" "")))]
+       (if_then_else:VI124_128
+         (match_operator 3 ""
+           [(match_operand:VI124_128 4 "nonimmediate_operand" "")
+            (match_operand:VI124_128 5 "nonimmediate_operand" "")])
+         (match_operand:VI124_128 1 "general_operand" "")
+         (match_operand:VI124_128 2 "general_operand" "")))]
   "TARGET_SSE2"
 {
   bool ok = ix86_expand_int_vcond (operands);
 
 (define_expand "vcondv2di"
   [(set (match_operand:V2DI 0 "register_operand" "")
-        (if_then_else:V2DI
-          (match_operator 3 ""
-            [(match_operand:V2DI 4 "nonimmediate_operand" "")
-             (match_operand:V2DI 5 "nonimmediate_operand" "")])
-          (match_operand:V2DI 1 "general_operand" "")
-          (match_operand:V2DI 2 "general_operand" "")))]
+       (if_then_else:V2DI
+         (match_operator 3 ""
+           [(match_operand:V2DI 4 "nonimmediate_operand" "")
+            (match_operand:V2DI 5 "nonimmediate_operand" "")])
+         (match_operand:V2DI 1 "general_operand" "")
+         (match_operand:V2DI 2 "general_operand" "")))]
   "TARGET_SSE4_2"
 {
   bool ok = ix86_expand_int_vcond (operands);
 
 (define_expand "vcondu<mode>"
   [(set (match_operand:VI124_128 0 "register_operand" "")
-        (if_then_else:VI124_128
-          (match_operator 3 ""
-            [(match_operand:VI124_128 4 "nonimmediate_operand" "")
-             (match_operand:VI124_128 5 "nonimmediate_operand" "")])
-          (match_operand:VI124_128 1 "general_operand" "")
-          (match_operand:VI124_128 2 "general_operand" "")))]
+       (if_then_else:VI124_128
+         (match_operator 3 ""
+           [(match_operand:VI124_128 4 "nonimmediate_operand" "")
+            (match_operand:VI124_128 5 "nonimmediate_operand" "")])
+         (match_operand:VI124_128 1 "general_operand" "")
+         (match_operand:VI124_128 2 "general_operand" "")))]
   "TARGET_SSE2"
 {
   bool ok = ix86_expand_int_vcond (operands);
 
 (define_expand "vconduv2di"
   [(set (match_operand:V2DI 0 "register_operand" "")
-        (if_then_else:V2DI
-          (match_operator 3 ""
-            [(match_operand:V2DI 4 "nonimmediate_operand" "")
-             (match_operand:V2DI 5 "nonimmediate_operand" "")])
-          (match_operand:V2DI 1 "general_operand" "")
-          (match_operand:V2DI 2 "general_operand" "")))]
+       (if_then_else:V2DI
+         (match_operator 3 ""
+           [(match_operand:V2DI 4 "nonimmediate_operand" "")
+            (match_operand:V2DI 5 "nonimmediate_operand" "")])
+         (match_operand:V2DI 1 "general_operand" "")
+         (match_operand:V2DI 2 "general_operand" "")))]
   "TARGET_SSE4_2"
 {
   bool ok = ix86_expand_int_vcond (operands);
   operands[2] = force_reg (<MODE>mode, gen_rtx_CONST_VECTOR (<MODE>mode, v));
 })
 
-(define_expand "sse2_andnot<mode>3"
-  [(set (match_operand:VI_128 0 "register_operand" "")
-       (and:VI_128
-         (not:VI_128 (match_operand:VI_128 1 "register_operand" ""))
-         (match_operand:VI_128 2 "nonimmediate_operand" "")))]
+;; Integer and-not (~op1 & op2) expander, widened from the 128-bit
+;; VI_128 iterator to all integer vector modes; the <sse2_avx2> prefix
+;; selects the sse2_/avx2_ pattern name by mode.
+(define_expand "<sse2_avx2>_andnot<mode>3"
+  [(set (match_operand:VI 0 "register_operand" "")
+       (and:VI
+         (not:VI (match_operand:VI 1 "register_operand" ""))
+         (match_operand:VI 2 "nonimmediate_operand" "")))]
   "TARGET_SSE2")
 
 (define_insn "*andnot<mode>3"
   static char buf[32];
   const char *ops;
   const char *tmp
-    = (get_attr_mode (insn) == MODE_TI) ? "pandn" : "andnps";
+    = ((get_attr_mode (insn) == MODE_TI) ||
+       (get_attr_mode (insn) == MODE_OI)) ? "pandn" : "andnps";
 
   switch (which_alternative)
     {
              (const_string "V8SF")
            (ne (symbol_ref "TARGET_SSE2") (const_int 0))
              (const_string "TI")
+           (ne (symbol_ref "TARGET_AVX2") (const_int 0))
+             (const_string "OI")
           ]
           (const_string "V4SF")))])
 
   static char buf[32];
   const char *ops;
   const char *tmp
-    = (get_attr_mode (insn) == MODE_TI) ? "p<logic>" : "<logic>ps";
+    = (get_attr_mode (insn) == MODE_TI)||
+      (get_attr_mode (insn) == MODE_OI) ? "p<logic>" : "<logic>ps";
 
   switch (which_alternative)
     {
              (const_string "V8SF")
            (ne (symbol_ref "TARGET_SSE2") (const_int 0))
              (const_string "TI")
+           (ne (symbol_ref "TARGET_AVX2") (const_int 0))
+             (const_string "OI")
           ]
           (const_string "V4SF")))])
 
   DONE;
 })
 
-(define_insn "sse2_packsswb"
-  [(set (match_operand:V16QI 0 "register_operand" "=x,x")
-       (vec_concat:V16QI
-         (ss_truncate:V8QI
-           (match_operand:V8HI 1 "register_operand" "0,x"))
-         (ss_truncate:V8QI
-           (match_operand:V8HI 2 "nonimmediate_operand" "xm,xm"))))]
+(define_insn "<sse2_avx2>_packsswb"
+  [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
+       (vec_concat:VI1_AVX2
+         (ss_truncate:<ssehalfvecmode>
+           (match_operand:<sseunpackmode> 1 "register_operand" "0,x"))
+         (ss_truncate:<ssehalfvecmode>
+           (match_operand:<sseunpackmode> 2 "nonimmediate_operand" "xm,xm"))))]
   "TARGET_SSE2"
   "@
    packsswb\t{%2, %0|%0, %2}
    (set_attr "type" "sselog")
    (set_attr "prefix_data16" "1,*")
    (set_attr "prefix" "orig,vex")
-   (set_attr "mode" "TI")])
+   (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn "sse2_packssdw"
-  [(set (match_operand:V8HI 0 "register_operand" "=x,x")
-       (vec_concat:V8HI
-         (ss_truncate:V4HI
-           (match_operand:V4SI 1 "register_operand" "0,x"))
-         (ss_truncate:V4HI
-           (match_operand:V4SI 2 "nonimmediate_operand" "xm,xm"))))]
+(define_insn "<sse2_avx2>_packssdw"
+  [(set (match_operand:VI2_AVX2 0 "register_operand" "=x,x")
+       (vec_concat:VI2_AVX2
+         (ss_truncate:<ssehalfvecmode>
+           (match_operand:<sseunpackmode> 1 "register_operand" "0,x"))
+         (ss_truncate:<ssehalfvecmode>
+           (match_operand:<sseunpackmode> 2 "nonimmediate_operand" "xm,xm"))))]
   "TARGET_SSE2"
   "@
    packssdw\t{%2, %0|%0, %2}
    (set_attr "type" "sselog")
    (set_attr "prefix_data16" "1,*")
    (set_attr "prefix" "orig,vex")
-   (set_attr "mode" "TI")])
+   (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn "sse2_packuswb"
-  [(set (match_operand:V16QI 0 "register_operand" "=x,x")
-       (vec_concat:V16QI
-         (us_truncate:V8QI
-           (match_operand:V8HI 1 "register_operand" "0,x"))
-         (us_truncate:V8QI
-           (match_operand:V8HI 2 "nonimmediate_operand" "xm,xm"))))]
+(define_insn "<sse2_avx2>_packuswb"
+  [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
+       (vec_concat:VI1_AVX2
+         (us_truncate:<ssehalfvecmode>
+           (match_operand:<sseunpackmode> 1 "register_operand" "0,x"))
+         (us_truncate:<ssehalfvecmode>
+           (match_operand:<sseunpackmode> 2 "nonimmediate_operand" "xm,xm"))))]
   "TARGET_SSE2"
   "@
    packuswb\t{%2, %0|%0, %2}
    (set_attr "type" "sselog")
    (set_attr "prefix_data16" "1,*")
    (set_attr "prefix" "orig,vex")
-   (set_attr "mode" "TI")])
+   (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn "vec_interleave_highv16qi"
-  [(set (match_operand:V16QI 0 "register_operand" "=x,x")
+;; AVX2 vpunpckhbw: interleave the high eight bytes of each 128-bit
+;; lane of operands 1 and 2.  In the V64QI concatenation, operand 2's
+;; elements are numbered 32-63; lane 0 takes elements 8-15 / 40-47,
+;; lane 1 takes elements 24-31 / 56-63 -- exactly 32 selectors, as
+;; required for a V32QI vec_select.  (A stray 33rd pair
+;; "(const_int 32) (const_int 64)" made the parallel invalid: 34
+;; selectors, and index 64 is out of range for V64QI.)
+(define_insn "avx2_interleave_highv32qi"
+  [(set (match_operand:V32QI 0 "register_operand" "=x")
+       (vec_select:V32QI
+         (vec_concat:V64QI
+           (match_operand:V32QI 1 "register_operand" "x")
+           (match_operand:V32QI 2 "nonimmediate_operand" "xm"))
+         (parallel [(const_int 8)  (const_int 40)
+                    (const_int 9)  (const_int 41)
+                    (const_int 10) (const_int 42)
+                    (const_int 11) (const_int 43)
+                    (const_int 12) (const_int 44)
+                    (const_int 13) (const_int 45)
+                    (const_int 14) (const_int 46)
+                    (const_int 15) (const_int 47)
+                    (const_int 24) (const_int 56)
+                    (const_int 25) (const_int 57)
+                    (const_int 26) (const_int 58)
+                    (const_int 27) (const_int 59)
+                    (const_int 28) (const_int 60)
+                    (const_int 29) (const_int 61)
+                    (const_int 30) (const_int 62)
+                    (const_int 31) (const_int 63)])))]
+  "TARGET_AVX2"
+  "vpunpckhbw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+(define_insn "vec_interleave_highv16qi"
+  [(set (match_operand:V16QI 0 "register_operand" "=x,x")
        (vec_select:V16QI
          (vec_concat:V32QI
            (match_operand:V16QI 1 "register_operand" "0,x")
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "TI")])
 
+;; AVX2 vpunpcklbw: interleave the low eight bytes of each 128-bit
+;; lane of operands 1 and 2.  In the V64QI concatenation, operand 2's
+;; elements are numbered 32-63; lane 0 takes elements 0-7 / 32-39,
+;; lane 1 takes elements 16-23 / 48-55 -- exactly 32 selectors.
+;; (A stray "(const_int 15) (const_int 47)" pair before the lane-1
+;; group gave 34 selectors and selected high bytes of lane 0, which
+;; vpunpcklbw never touches.)
+(define_insn "avx2_interleave_lowv32qi"
+  [(set (match_operand:V32QI 0 "register_operand" "=x")
+       (vec_select:V32QI
+         (vec_concat:V64QI
+           (match_operand:V32QI 1 "register_operand" "x")
+           (match_operand:V32QI 2 "nonimmediate_operand" "xm"))
+         (parallel [(const_int 0) (const_int 32)
+                    (const_int 1) (const_int 33)
+                    (const_int 2) (const_int 34)
+                    (const_int 3) (const_int 35)
+                    (const_int 4) (const_int 36)
+                    (const_int 5) (const_int 37)
+                    (const_int 6) (const_int 38)
+                    (const_int 7) (const_int 39)
+                    (const_int 16) (const_int 48)
+                    (const_int 17) (const_int 49)
+                    (const_int 18) (const_int 50)
+                    (const_int 19) (const_int 51)
+                    (const_int 20) (const_int 52)
+                    (const_int 21) (const_int 53)
+                    (const_int 22) (const_int 54)
+                    (const_int 23) (const_int 55)])))]
+  "TARGET_AVX2"
+  "vpunpcklbw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "vec_interleave_lowv16qi"
   [(set (match_operand:V16QI 0 "register_operand" "=x,x")
        (vec_select:V16QI
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "TI")])
 
+;; AVX2 vpunpckhwd: interleave the high four words of each 128-bit
+;; lane (elements 4-7 and 12-15 of each input; operand 2's elements
+;; are numbered 16-31 in the concatenation).
+(define_insn "avx2_interleave_highv16hi"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+       (vec_select:V16HI
+         (vec_concat:V32HI
+           (match_operand:V16HI 1 "register_operand" "x")
+           (match_operand:V16HI 2 "nonimmediate_operand" "xm"))
+         (parallel [(const_int 4) (const_int 20)
+                    (const_int 5) (const_int 21)
+                    (const_int 6) (const_int 22)
+                    (const_int 7) (const_int 23)
+                    (const_int 12) (const_int 28)
+                    (const_int 13) (const_int 29)
+                    (const_int 14) (const_int 30)
+                    (const_int 15) (const_int 31)])))]
+  "TARGET_AVX2"
+  "vpunpckhwd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "vec_interleave_highv8hi"
   [(set (match_operand:V8HI 0 "register_operand" "=x,x")
        (vec_select:V8HI
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "TI")])
 
+;; AVX2 vpunpcklwd: interleave the low four words of each 128-bit
+;; lane (elements 0-3 and 8-11 of each input; operand 2's elements
+;; are numbered 16-31 in the concatenation).
+(define_insn "avx2_interleave_lowv16hi"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+       (vec_select:V16HI
+         (vec_concat:V32HI
+           (match_operand:V16HI 1 "register_operand" "x")
+           (match_operand:V16HI 2 "nonimmediate_operand" "xm"))
+         (parallel [(const_int 0) (const_int 16)
+                    (const_int 1) (const_int 17)
+                    (const_int 2) (const_int 18)
+                    (const_int 3) (const_int 19)
+                    (const_int 8) (const_int 24)
+                    (const_int 9) (const_int 25)
+                    (const_int 10) (const_int 26)
+                    (const_int 11) (const_int 27)])))]
+  "TARGET_AVX2"
+  "vpunpcklwd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "vec_interleave_lowv8hi"
   [(set (match_operand:V8HI 0 "register_operand" "=x,x")
        (vec_select:V8HI
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "TI")])
 
+;; AVX2 vpunpckhdq: interleave the high two dwords of each 128-bit
+;; lane (elements 2-3 and 6-7 of each input; operand 2's elements are
+;; numbered 8-15 in the concatenation).
+(define_insn "avx2_interleave_highv8si"
+  [(set (match_operand:V8SI 0 "register_operand" "=x")
+       (vec_select:V8SI
+         (vec_concat:V16SI
+           (match_operand:V8SI 1 "register_operand" "x")
+           (match_operand:V8SI 2 "nonimmediate_operand" "xm"))
+         (parallel [(const_int 2) (const_int 10)
+                    (const_int 3) (const_int 11)
+                    (const_int 6) (const_int 14)
+                    (const_int 7) (const_int 15)])))]
+  "TARGET_AVX2"
+  "vpunpckhdq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "vec_interleave_highv4si"
   [(set (match_operand:V4SI 0 "register_operand" "=x,x")
        (vec_select:V4SI
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "TI")])
 
+;; AVX2 vpunpckldq: interleave the low two dwords of each 128-bit
+;; lane (elements 0-1 and 4-5 of each input; operand 2's elements are
+;; numbered 8-15 in the concatenation).
+(define_insn "avx2_interleave_lowv8si"
+  [(set (match_operand:V8SI 0 "register_operand" "=x")
+       (vec_select:V8SI
+         (vec_concat:V16SI
+           (match_operand:V8SI 1 "register_operand" "x")
+           (match_operand:V8SI 2 "nonimmediate_operand" "xm"))
+         (parallel [(const_int 0) (const_int 8)
+                    (const_int 1) (const_int 9)
+                    (const_int 4) (const_int 12)
+                    (const_int 5) (const_int 13)])))]
+  "TARGET_AVX2"
+  "vpunpckldq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "vec_interleave_lowv4si"
   [(set (match_operand:V4SI 0 "register_operand" "=x,x")
        (vec_select:V4SI
     {
     case 0:
       if (GET_MODE_SIZE (<ssescalarmode>mode) < GET_MODE_SIZE (SImode))
-        return "pinsr<ssemodesuffix>\t{%3, %k2, %0|%0, %k2, %3}";
+       return "pinsr<ssemodesuffix>\t{%3, %k2, %0|%0, %k2, %3}";
       /* FALLTHRU */
     case 1:
       return "pinsr<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}";
     case 2:
       if (GET_MODE_SIZE (<ssescalarmode>mode) < GET_MODE_SIZE (SImode))
-        return "vpinsr<ssemodesuffix>\t{%3, %k2, %1, %0|%0, %1, %k2, %3}";
+       return "vpinsr<ssemodesuffix>\t{%3, %k2, %1, %0|%0, %1, %k2, %3}";
       /* FALLTHRU */
     case 3:
       return "vpinsr<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}";
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "TI")])
 
+;; Expand the 8-bit-immediate form of vpshufd into explicit element
+;; selectors.  vpshufd applies the same 2-bit selectors to both 128-bit
+;; lanes, so the high-lane selectors are the low-lane ones plus 4.
+(define_expand "avx2_pshufdv3"
+  [(match_operand:V8SI 0 "register_operand" "")
+   (match_operand:V8SI 1 "nonimmediate_operand" "")
+   (match_operand:SI 2 "const_0_to_255_operand" "")]
+  "TARGET_AVX2"
+{
+  int mask = INTVAL (operands[2]);
+  emit_insn (gen_avx2_pshufd_1 (operands[0], operands[1],
+                               GEN_INT ((mask >> 0) & 3),
+                               GEN_INT ((mask >> 2) & 3),
+                               GEN_INT ((mask >> 4) & 3),
+                               GEN_INT ((mask >> 6) & 3),
+                               GEN_INT (((mask >> 0) & 3) + 4),
+                               GEN_INT (((mask >> 2) & 3) + 4),
+                               GEN_INT (((mask >> 4) & 3) + 4),
+                               GEN_INT (((mask >> 6) & 3) + 4)));
+  DONE;
+})
+
+;; vpshufd shuffles within each 128-bit lane: result positions 0-3
+;; select among elements 0-3, positions 4-7 among elements 4-7.
+;; Reusing operands 2-5 via (match_dup ...) for the high lane would
+;; wrongly select low-lane elements, so the high-lane selectors are
+;; separate operands constrained to equal the low-lane ones plus 4;
+;; the immediate is reconstructed from the low-lane selectors alone.
+(define_insn "avx2_pshufd_1"
+  [(set (match_operand:V8SI 0 "register_operand" "=x")
+       (vec_select:V8SI
+         (match_operand:V8SI 1 "nonimmediate_operand" "xm")
+         (parallel [(match_operand 2 "const_0_to_3_operand" "")
+                    (match_operand 3 "const_0_to_3_operand" "")
+                    (match_operand 4 "const_0_to_3_operand" "")
+                    (match_operand 5 "const_0_to_3_operand" "")
+                    (match_operand 6 "const_4_to_7_operand" "")
+                    (match_operand 7 "const_4_to_7_operand" "")
+                    (match_operand 8 "const_4_to_7_operand" "")
+                    (match_operand 9 "const_4_to_7_operand" "")])))]
+  "TARGET_AVX2
+   && INTVAL (operands[2]) + 4 == INTVAL (operands[6])
+   && INTVAL (operands[3]) + 4 == INTVAL (operands[7])
+   && INTVAL (operands[4]) + 4 == INTVAL (operands[8])
+   && INTVAL (operands[5]) + 4 == INTVAL (operands[9])"
+{
+  int mask = 0;
+  mask |= INTVAL (operands[2]) << 0;
+  mask |= INTVAL (operands[3]) << 2;
+  mask |= INTVAL (operands[4]) << 4;
+  mask |= INTVAL (operands[5]) << 6;
+  operands[2] = GEN_INT (mask);
+
+  return "vpshufd\t{%2, %1, %0|%0, %1, %2}";
+}
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix" "vex")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "OI")])
+
 (define_expand "sse2_pshufd"
   [(match_operand:V4SI 0 "register_operand" "")
    (match_operand:V4SI 1 "nonimmediate_operand" "")
    (set_attr "length_immediate" "1")
    (set_attr "mode" "TI")])
 
+;; Expand the immediate form of vpshuflw.  The low four words of each
+;; 128-bit lane are shuffled by the same 2-bit selectors; the high
+;; four words of each lane pass through unchanged.  Lane 1's low words
+;; are elements 8-11, so its selectors are the lane-0 ones plus 8.
+(define_expand "avx2_pshuflwv3"
+  [(match_operand:V16HI 0 "register_operand" "")
+   (match_operand:V16HI 1 "nonimmediate_operand" "")
+   (match_operand:SI 2 "const_0_to_255_operand" "")]
+  "TARGET_AVX2"
+{
+  int mask = INTVAL (operands[2]);
+  emit_insn (gen_avx2_pshuflw_1 (operands[0], operands[1],
+                                GEN_INT ((mask >> 0) & 3),
+                                GEN_INT ((mask >> 2) & 3),
+                                GEN_INT ((mask >> 4) & 3),
+                                GEN_INT ((mask >> 6) & 3),
+                                GEN_INT (((mask >> 0) & 3) + 8),
+                                GEN_INT (((mask >> 2) & 3) + 8),
+                                GEN_INT (((mask >> 4) & 3) + 8),
+                                GEN_INT (((mask >> 6) & 3) + 8)));
+  DONE;
+})
+
+;; vpshuflw operates per 128-bit lane: result positions 8-11 select
+;; among elements 8-11, not 0-3, so they cannot be (match_dup ...)
+;; copies of operands 2-5.  They are separate operands constrained by
+;; the insn condition to equal operands 2-5 plus 8 (which also pins
+;; their range to 8-11); the immediate is rebuilt from operands 2-5.
+(define_insn "avx2_pshuflw_1"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+       (vec_select:V16HI
+         (match_operand:V16HI 1 "nonimmediate_operand" "xm")
+         (parallel [(match_operand 2 "const_0_to_3_operand" "")
+                    (match_operand 3 "const_0_to_3_operand" "")
+                    (match_operand 4 "const_0_to_3_operand" "")
+                    (match_operand 5 "const_0_to_3_operand" "")
+                    (const_int 4)
+                    (const_int 5)
+                    (const_int 6)
+                    (const_int 7)
+                    (match_operand 6 "const_int_operand" "")
+                    (match_operand 7 "const_int_operand" "")
+                    (match_operand 8 "const_int_operand" "")
+                    (match_operand 9 "const_int_operand" "")
+                    (const_int 12)
+                    (const_int 13)
+                    (const_int 14)
+                    (const_int 15)])))]
+  "TARGET_AVX2
+   && INTVAL (operands[2]) + 8 == INTVAL (operands[6])
+   && INTVAL (operands[3]) + 8 == INTVAL (operands[7])
+   && INTVAL (operands[4]) + 8 == INTVAL (operands[8])
+   && INTVAL (operands[5]) + 8 == INTVAL (operands[9])"
+{
+  int mask = 0;
+  mask |= INTVAL (operands[2]) << 0;
+  mask |= INTVAL (operands[3]) << 2;
+  mask |= INTVAL (operands[4]) << 4;
+  mask |= INTVAL (operands[5]) << 6;
+  operands[2] = GEN_INT (mask);
+
+  return "vpshuflw\t{%2, %1, %0|%0, %1, %2}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "OI")])
+
 (define_expand "sse2_pshuflw"
   [(match_operand:V8HI 0 "register_operand" "")
    (match_operand:V8HI 1 "nonimmediate_operand" "")
    (set_attr "length_immediate" "1")
    (set_attr "mode" "TI")])
 
+;; Expand the immediate form of vpshufhw.  The high four words of each
+;; 128-bit lane are shuffled by the same 2-bit selectors; the low four
+;; words of each lane pass through unchanged.  Lane 0's high words are
+;; elements 4-7 (selector + 4); lane 1's are elements 12-15
+;; (selector + 12).
+(define_expand "avx2_pshufhwv3"
+  [(match_operand:V16HI 0 "register_operand" "")
+   (match_operand:V16HI 1 "nonimmediate_operand" "")
+   (match_operand:SI 2 "const_0_to_255_operand" "")]
+  "TARGET_AVX2"
+{
+  int mask = INTVAL (operands[2]);
+  emit_insn (gen_avx2_pshufhw_1 (operands[0], operands[1],
+                                GEN_INT (((mask >> 0) & 3) + 4),
+                                GEN_INT (((mask >> 2) & 3) + 4),
+                                GEN_INT (((mask >> 4) & 3) + 4),
+                                GEN_INT (((mask >> 6) & 3) + 4),
+                                GEN_INT (((mask >> 0) & 3) + 12),
+                                GEN_INT (((mask >> 2) & 3) + 12),
+                                GEN_INT (((mask >> 4) & 3) + 12),
+                                GEN_INT (((mask >> 6) & 3) + 12)));
+  DONE;
+})
+
+;; vpshufhw operates per 128-bit lane: result positions 12-15 select
+;; among elements 12-15, not 4-7, so they cannot be (match_dup ...)
+;; copies of operands 2-5.  They are separate operands constrained by
+;; the insn condition to equal operands 2-5 plus 8 (pinning their
+;; range to 12-15); the immediate is rebuilt from operands 2-5.
+(define_insn "avx2_pshufhw_1"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+       (vec_select:V16HI
+         (match_operand:V16HI 1 "nonimmediate_operand" "xm")
+         (parallel [(const_int 0)
+                    (const_int 1)
+                    (const_int 2)
+                    (const_int 3)
+                    (match_operand 2 "const_4_to_7_operand" "")
+                    (match_operand 3 "const_4_to_7_operand" "")
+                    (match_operand 4 "const_4_to_7_operand" "")
+                    (match_operand 5 "const_4_to_7_operand" "")
+                    (const_int 8)
+                    (const_int 9)
+                    (const_int 10)
+                    (const_int 11)
+                    (match_operand 6 "const_int_operand" "")
+                    (match_operand 7 "const_int_operand" "")
+                    (match_operand 8 "const_int_operand" "")
+                    (match_operand 9 "const_int_operand" "")])))]
+  "TARGET_AVX2
+   && INTVAL (operands[2]) + 8 == INTVAL (operands[6])
+   && INTVAL (operands[3]) + 8 == INTVAL (operands[7])
+   && INTVAL (operands[4]) + 8 == INTVAL (operands[8])
+   && INTVAL (operands[5]) + 8 == INTVAL (operands[9])"
+{
+  int mask = 0;
+  mask |= (INTVAL (operands[2]) - 4) << 0;
+  mask |= (INTVAL (operands[3]) - 4) << 2;
+  mask |= (INTVAL (operands[4]) - 4) << 4;
+  mask |= (INTVAL (operands[5]) - 4) << 6;
+  operands[2] = GEN_INT (mask);
+
+  return "vpshufhw\t{%2, %1, %0|%0, %1, %2}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "length_immediate" "1")
+   (set_attr "mode" "OI")])
+
 (define_expand "sse2_pshufhw"
   [(match_operand:V8HI 0 "register_operand" "")
    (match_operand:V8HI 1 "nonimmediate_operand" "")
 ;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+;; AVX2 vpavgb: unsigned byte average with rounding,
+;; (a + b + 1) >> 1, computed in V32HI precision so the sum cannot
+;; overflow; canonicalizes the commutative operands.
+;; NOTE(review): the rounding-ones constant is tagged const_vector:V32QI
+;; yet sits inside plus:V32HI; its 32 elements match V32HI, so the mode
+;; tag looks like a typo for V32HI.  It must be corrected in lockstep
+;; with the matching *avx2_uavgv32qi3 insn pattern (which repeats the
+;; same RTL), or the expander output would no longer match the insn.
+(define_expand "avx2_uavgv32qi3"
+  [(set (match_operand:V32QI 0 "register_operand" "")
+       (truncate:V32QI
+         (lshiftrt:V32HI
+           (plus:V32HI
+             (plus:V32HI
+               (zero_extend:V32HI
+                 (match_operand:V32QI 1 "nonimmediate_operand" ""))
+               (zero_extend:V32HI
+                 (match_operand:V32QI 2 "nonimmediate_operand" "")))
+             (const_vector:V32QI [(const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)]))
+           (const_int 1))))]
+  "TARGET_AVX2"
+  "ix86_fixup_binary_operands_no_copy (PLUS, V32QImode, operands);")
+
 (define_expand "sse2_uavgv16qi3"
   [(set (match_operand:V16QI 0 "register_operand" "")
        (truncate:V16QI
   "TARGET_SSE2"
   "ix86_fixup_binary_operands_no_copy (PLUS, V16QImode, operands);")
 
+;; AVX2 vpavgb insn: matches the canonical rounding-average RTL emitted
+;; by the expander above and produces the three-operand VEX-encoded
+;; instruction.  The "%" in operand 1's constraint marks the operation
+;; as commutative for register allocation.
+(define_insn "*avx2_uavgv32qi3"
+  [(set (match_operand:V32QI 0 "register_operand" "=x")
+       (truncate:V32QI
+         (lshiftrt:V32HI
+           (plus:V32HI
+             (plus:V32HI
+               (zero_extend:V32HI
+                 (match_operand:V32QI 1 "nonimmediate_operand" "%x"))
+               (zero_extend:V32HI
+                 (match_operand:V32QI 2 "nonimmediate_operand" "xm")))
+             (const_vector:V32QI [(const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)]))
+           (const_int 1))))]
+  "TARGET_AVX2 && ix86_binary_operator_ok (PLUS, V32QImode, operands)"
+  "vpavgb\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "*sse2_uavgv16qi3"
   [(set (match_operand:V16QI 0 "register_operand" "=x,x")
        (truncate:V16QI
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "TI")])
 
+;; AVX2 vpavgw expander: unsigned rounding average of 16 packed words,
+;; same (a + b + 1) >> 1 shape as the byte variant above, widened
+;; through V16SI.  Operands are canonicalized by the expander body.
+;; NOTE(review): the rounding const_vector is in V16HI mode inside a
+;; V16SI plus -- confirm intended mode.
+(define_expand "avx2_uavgv16hi3"
+  [(set (match_operand:V16HI 0 "register_operand" "")
+       (truncate:V16HI
+         (lshiftrt:V16SI
+           (plus:V16SI
+             (plus:V16SI
+               (zero_extend:V16SI
+                 (match_operand:V16HI 1 "nonimmediate_operand" ""))
+               (zero_extend:V16SI
+                 (match_operand:V16HI 2 "nonimmediate_operand" "")))
+             (const_vector:V16HI [(const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)]))
+           (const_int 1))))]
+  "TARGET_AVX2"
+  "ix86_fixup_binary_operands_no_copy (PLUS, V16HImode, operands);")
+
 (define_expand "sse2_uavgv8hi3"
   [(set (match_operand:V8HI 0 "register_operand" "")
        (truncate:V8HI
   "TARGET_SSE2"
   "ix86_fixup_binary_operands_no_copy (PLUS, V8HImode, operands);")
 
+;; AVX2 vpavgw insn: matches the rounding-average RTL from the expander
+;; above and emits the VEX three-operand form.  Operand 1's "%x"
+;; constraint marks commutativity.
+(define_insn "*avx2_uavgv16hi3"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+       (truncate:V16HI
+         (lshiftrt:V16SI
+           (plus:V16SI
+             (plus:V16SI
+               (zero_extend:V16SI
+                 (match_operand:V16HI 1 "nonimmediate_operand" "%x"))
+               (zero_extend:V16SI
+                 (match_operand:V16HI 2 "nonimmediate_operand" "xm")))
+             (const_vector:V16HI [(const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)]))
+           (const_int 1))))]
+  "TARGET_AVX2 && ix86_binary_operator_ok (PLUS, V16HImode, operands)"
+  "vpavgw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "*sse2_uavgv8hi3"
   [(set (match_operand:V8HI 0 "register_operand" "=x,x")
        (truncate:V8HI
 
 ;; The correct representation for this is absolutely enormous, and
 ;; surely not generally useful.
-(define_insn "sse2_psadbw"
-  [(set (match_operand:V2DI 0 "register_operand" "=x,x")
-       (unspec:V2DI [(match_operand:V16QI 1 "register_operand" "0,x")
-                     (match_operand:V16QI 2 "nonimmediate_operand" "xm,xm")]
-                    UNSPEC_PSADBW))]
+(define_insn "<sse2_avx2>_psadbw"
+  [(set (match_operand:VI8_AVX2 0 "register_operand" "=x,x")
+       (unspec:VI8_AVX2 [(match_operand:<ssebytemode> 1 "register_operand" "0,x")
+                         (match_operand:<ssebytemode> 2 "nonimmediate_operand" "xm,xm")]
+                         UNSPEC_PSADBW))]
   "TARGET_SSE2"
   "@
    psadbw\t{%2, %0|%0, %2}
    (set_attr "atom_unit" "simul")
    (set_attr "prefix_data16" "1,*")
    (set_attr "prefix" "orig,vex")
-   (set_attr "mode" "TI")])
+   (set_attr "mode" "<sseinsnmode>")])
 
 (define_insn "<sse>_movmsk<ssemodesuffix><avxsizesuffix>"
   [(set (match_operand:SI 0 "register_operand" "=r")
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "<MODE>")])
 
+;; AVX2 vpmovmskb: gathers the most-significant bit of each of the 32
+;; bytes of operand 1 into a general-purpose register, modeled with
+;; UNSPEC_MOVMSK like the other movmsk patterns.
+;; NOTE(review): "mode" attribute is DI here -- check for consistency
+;; with the corresponding sse2_pmovmskb pattern's mode attribute.
+(define_insn "avx2_pmovmskb"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+       (unspec:SI [(match_operand:V32QI 1 "register_operand" "x")]
+                  UNSPEC_MOVMSK))]
+  "TARGET_AVX2"
+  "vpmovmskb\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "DI")])
+
 (define_insn "sse2_pmovmskb"
   [(set (match_operand:SI 0 "register_operand" "=r")
        (unspec:SI [(match_operand:V16QI 1 "register_operand" "x")]
 ;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
+;; AVX2 vphaddw: horizontal add of adjacent signed word pairs, written
+;; out as explicit vec_select/plus/vec_concat RTL instead of an unspec.
+;; NOTE(review): hardware vphaddw operates within each 128-bit lane,
+;; interleaving results from both sources per lane; this RTL places all
+;; eight operand-1 sums in the low half and all eight operand-2 sums in
+;; the high half -- verify the element ordering against the Intel SDM.
+(define_insn "avx2_phaddwv16hi3"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+       (vec_concat:V16HI
+         (vec_concat:V8HI
+           (vec_concat:V4HI
+             (vec_concat:V2HI
+               (plus:HI
+                 (vec_select:HI
+                   (match_operand:V16HI 1 "register_operand" "x")
+                   (parallel [(const_int 0)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 1)])))
+               (plus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 2)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 3)]))))
+             (vec_concat:V2HI
+               (plus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 4)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 5)])))
+               (plus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 6)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 7)])))))
+           (vec_concat:V4HI
+             (vec_concat:V2HI
+               (plus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 8)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 9)])))
+               (plus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 10)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 11)]))))
+             (vec_concat:V2HI
+               (plus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 12)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 13)])))
+               (plus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 14)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 15)]))))))
+         (vec_concat:V8HI
+           (vec_concat:V4HI
+             (vec_concat:V2HI
+               (plus:HI
+                 (vec_select:HI
+                   (match_operand:V16HI 2 "nonimmediate_operand" "xm")
+                   (parallel [(const_int 0)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 1)])))
+               (plus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 2)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 3)]))))
+             (vec_concat:V2HI
+               (plus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 4)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 5)])))
+               (plus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 6)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 7)])))))
+           (vec_concat:V4HI
+             (vec_concat:V2HI
+               (plus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 8)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 9)])))
+               (plus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 10)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 11)]))))
+             (vec_concat:V2HI
+               (plus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 12)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 13)])))
+               (plus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 14)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 15)]))))))))]
+  "TARGET_AVX2"
+  "vphaddw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "ssse3_phaddwv8hi3"
   [(set (match_operand:V8HI 0 "register_operand" "=x,x")
        (vec_concat:V8HI
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
    (set_attr "mode" "DI")])
 
+;; AVX2 vphaddd: horizontal add of adjacent signed doubleword pairs.
+;; NOTE(review): as with vphaddw above, the hardware operates per
+;; 128-bit lane; this RTL places all operand-1 sums in the low half --
+;; verify element ordering against the Intel SDM.
+(define_insn "avx2_phadddv8si3"
+  [(set (match_operand:V8SI 0 "register_operand" "=x")
+       (vec_concat:V8SI
+         (vec_concat:V4SI
+           (vec_concat:V2SI
+             (plus:SI
+               (vec_select:SI
+                 (match_operand:V8SI 1 "register_operand" "x")
+                 (parallel [(const_int 0)]))
+               (vec_select:SI (match_dup 1) (parallel [(const_int 1)])))
+             (plus:SI
+               (vec_select:SI (match_dup 1) (parallel [(const_int 2)]))
+               (vec_select:SI (match_dup 1) (parallel [(const_int 3)]))))
+           (vec_concat:V2SI
+             (plus:SI
+               (vec_select:SI (match_dup 1) (parallel [(const_int 4)]))
+               (vec_select:SI (match_dup 1) (parallel [(const_int 5)])))
+             (plus:SI
+               (vec_select:SI (match_dup 1) (parallel [(const_int 6)]))
+               (vec_select:SI (match_dup 1) (parallel [(const_int 7)])))))
+         (vec_concat:V4SI
+           (vec_concat:V2SI
+             (plus:SI
+               (vec_select:SI
+                 (match_operand:V8SI 2 "nonimmediate_operand" "xm")
+                 (parallel [(const_int 0)]))
+               (vec_select:SI (match_dup 2) (parallel [(const_int 1)])))
+             (plus:SI
+               (vec_select:SI (match_dup 2) (parallel [(const_int 2)]))
+               (vec_select:SI (match_dup 2) (parallel [(const_int 3)]))))
+           (vec_concat:V2SI
+             (plus:SI
+               (vec_select:SI (match_dup 2) (parallel [(const_int 4)]))
+               (vec_select:SI (match_dup 2) (parallel [(const_int 5)])))
+             (plus:SI
+               (vec_select:SI (match_dup 2) (parallel [(const_int 6)]))
+               (vec_select:SI (match_dup 2) (parallel [(const_int 7)])))))))]
+  "TARGET_AVX2"
+  "vphaddd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "ssse3_phadddv4si3"
   [(set (match_operand:V4SI 0 "register_operand" "=x,x")
        (vec_concat:V4SI
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
    (set_attr "mode" "DI")])
 
+;; AVX2 vphaddsw: horizontal add of adjacent signed word pairs with
+;; signed saturation (ss_plus).  Same structure as avx2_phaddwv16hi3
+;; with saturating adds.
+;; NOTE(review): lane ordering concern as for vphaddw above -- verify
+;; against the Intel SDM.
+(define_insn "avx2_phaddswv16hi3"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+       (vec_concat:V16HI
+         (vec_concat:V8HI
+           (vec_concat:V4HI
+             (vec_concat:V2HI
+               (ss_plus:HI
+                 (vec_select:HI
+                   (match_operand:V16HI 1 "register_operand" "x")
+                   (parallel [(const_int 0)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 1)])))
+               (ss_plus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 2)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 3)]))))
+             (vec_concat:V2HI
+               (ss_plus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 4)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 5)])))
+               (ss_plus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 6)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 7)])))))
+           (vec_concat:V4HI
+             (vec_concat:V2HI
+               (ss_plus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 8)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 9)])))
+               (ss_plus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 10)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 11)]))))
+             (vec_concat:V2HI
+               (ss_plus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 12)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 13)])))
+               (ss_plus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 14)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 15)]))))))
+         (vec_concat:V8HI
+           (vec_concat:V4HI
+             (vec_concat:V2HI
+               (ss_plus:HI
+                 (vec_select:HI
+                   (match_operand:V16HI 2 "nonimmediate_operand" "xm")
+                   (parallel [(const_int 0)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 1)])))
+               (ss_plus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 2)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 3)]))))
+             (vec_concat:V2HI
+               (ss_plus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 4)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 5)])))
+               (ss_plus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 6)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 7)])))))
+           (vec_concat:V4HI
+             (vec_concat:V2HI
+               (ss_plus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 8)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 9)])))
+               (ss_plus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 10)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 11)]))))
+             (vec_concat:V2HI
+               (ss_plus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 12)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 13)])))
+               (ss_plus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 14)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 15)]))))))))]
+  "TARGET_AVX2"
+  "vphaddsw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "ssse3_phaddswv8hi3"
   [(set (match_operand:V8HI 0 "register_operand" "=x,x")
        (vec_concat:V8HI
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
    (set_attr "mode" "DI")])
 
+;; AVX2 vphsubw: horizontal subtract of adjacent signed word pairs
+;; (even element minus odd element).
+;; NOTE(review): lane ordering concern as for vphaddw above -- verify
+;; against the Intel SDM.
+(define_insn "avx2_phsubwv16hi3"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+       (vec_concat:V16HI
+         (vec_concat:V8HI
+           (vec_concat:V4HI
+             (vec_concat:V2HI
+               (minus:HI
+                 (vec_select:HI
+                   (match_operand:V16HI 1 "register_operand" "x")
+                   (parallel [(const_int 0)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 1)])))
+               (minus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 2)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 3)]))))
+             (vec_concat:V2HI
+               (minus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 4)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 5)])))
+               (minus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 6)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 7)])))))
+           (vec_concat:V4HI
+             (vec_concat:V2HI
+               (minus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 8)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 9)])))
+               (minus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 10)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 11)]))))
+             (vec_concat:V2HI
+               (minus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 12)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 13)])))
+               (minus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 14)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 15)]))))))
+         (vec_concat:V8HI
+           (vec_concat:V4HI
+             (vec_concat:V2HI
+               (minus:HI
+                 (vec_select:HI
+                   (match_operand:V16HI 2 "nonimmediate_operand" "xm")
+                   (parallel [(const_int 0)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 1)])))
+               (minus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 2)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 3)]))))
+             (vec_concat:V2HI
+               (minus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 4)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 5)])))
+               (minus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 6)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 7)])))))
+           (vec_concat:V4HI
+             (vec_concat:V2HI
+               (minus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 8)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 9)])))
+               (minus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 10)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 11)]))))
+             (vec_concat:V2HI
+               (minus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 12)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 13)])))
+               (minus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 14)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 15)]))))))))]
+  "TARGET_AVX2"
+  "vphsubw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "ssse3_phsubwv8hi3"
   [(set (match_operand:V8HI 0 "register_operand" "=x,x")
        (vec_concat:V8HI
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
    (set_attr "mode" "DI")])
 
+;; AVX2 vphsubd: horizontal subtract of adjacent signed doubleword
+;; pairs (even element minus odd element).
+;; NOTE(review): lane ordering concern as for vphaddw above -- verify
+;; against the Intel SDM.
+(define_insn "avx2_phsubdv8si3"
+  [(set (match_operand:V8SI 0 "register_operand" "=x")
+       (vec_concat:V8SI
+         (vec_concat:V4SI
+           (vec_concat:V2SI
+             (minus:SI
+               (vec_select:SI
+                 (match_operand:V8SI 1 "register_operand" "x")
+                 (parallel [(const_int 0)]))
+               (vec_select:SI (match_dup 1) (parallel [(const_int 1)])))
+             (minus:SI
+               (vec_select:SI (match_dup 1) (parallel [(const_int 2)]))
+               (vec_select:SI (match_dup 1) (parallel [(const_int 3)]))))
+           (vec_concat:V2SI
+             (minus:SI
+               (vec_select:SI (match_dup 1) (parallel [(const_int 4)]))
+               (vec_select:SI (match_dup 1) (parallel [(const_int 5)])))
+             (minus:SI
+               (vec_select:SI (match_dup 1) (parallel [(const_int 6)]))
+               (vec_select:SI (match_dup 1) (parallel [(const_int 7)])))))
+         (vec_concat:V4SI
+           (vec_concat:V2SI
+             (minus:SI
+               (vec_select:SI
+                 (match_operand:V8SI 2 "nonimmediate_operand" "xm")
+                 (parallel [(const_int 0)]))
+               (vec_select:SI (match_dup 2) (parallel [(const_int 1)])))
+             (minus:SI
+               (vec_select:SI (match_dup 2) (parallel [(const_int 2)]))
+               (vec_select:SI (match_dup 2) (parallel [(const_int 3)]))))
+           (vec_concat:V2SI
+             (minus:SI
+               (vec_select:SI (match_dup 2) (parallel [(const_int 4)]))
+               (vec_select:SI (match_dup 2) (parallel [(const_int 5)])))
+             (minus:SI
+               (vec_select:SI (match_dup 2) (parallel [(const_int 6)]))
+               (vec_select:SI (match_dup 2) (parallel [(const_int 7)])))))))]
+  "TARGET_AVX2"
+  "vphsubd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "ssse3_phsubdv4si3"
   [(set (match_operand:V4SI 0 "register_operand" "=x,x")
        (vec_concat:V4SI
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
    (set_attr "mode" "DI")])
 
+;; AVX2 vphsubsw: horizontal subtract of adjacent signed word pairs
+;; with signed saturation (ss_minus).
+;; NOTE(review): lane ordering concern as for vphaddw above -- verify
+;; against the Intel SDM.
+(define_insn "avx2_phsubswv16hi3"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+       (vec_concat:V16HI
+         (vec_concat:V8HI
+           (vec_concat:V4HI
+             (vec_concat:V2HI
+               (ss_minus:HI
+                 (vec_select:HI
+                   (match_operand:V16HI 1 "register_operand" "x")
+                   (parallel [(const_int 0)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 1)])))
+               (ss_minus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 2)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 3)]))))
+             (vec_concat:V2HI
+               (ss_minus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 4)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 5)])))
+               (ss_minus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 6)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 7)])))))
+           (vec_concat:V4HI
+             (vec_concat:V2HI
+               (ss_minus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 8)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 9)])))
+               (ss_minus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 10)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 11)]))))
+             (vec_concat:V2HI
+               (ss_minus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 12)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 13)])))
+               (ss_minus:HI
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 14)]))
+                 (vec_select:HI (match_dup 1) (parallel [(const_int 15)]))))))
+         (vec_concat:V8HI
+           (vec_concat:V4HI
+             (vec_concat:V2HI
+               (ss_minus:HI
+                 (vec_select:HI
+                   (match_operand:V16HI 2 "nonimmediate_operand" "xm")
+                   (parallel [(const_int 0)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 1)])))
+               (ss_minus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 2)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 3)]))))
+             (vec_concat:V2HI
+               (ss_minus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 4)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 5)])))
+               (ss_minus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 6)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 7)])))))
+           (vec_concat:V4HI
+             (vec_concat:V2HI
+               (ss_minus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 8)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 9)])))
+               (ss_minus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 10)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 11)]))))
+             (vec_concat:V2HI
+               (ss_minus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 12)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 13)])))
+               (ss_minus:HI
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 14)]))
+                 (vec_select:HI (match_dup 2) (parallel [(const_int 15)]))))))))]
+  "TARGET_AVX2"
+  "vphsubsw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "ssse3_phsubswv8hi3"
   [(set (match_operand:V8HI 0 "register_operand" "=x,x")
        (vec_concat:V8HI
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
    (set_attr "mode" "DI")])
 
+;; AVX2 vpmaddubsw: multiplies the unsigned bytes of operand 1 by the
+;; corresponding signed bytes of operand 2, then adds adjacent word
+;; products with signed saturation, producing 16 signed words.  The RTL
+;; splits each source into its even-indexed and odd-indexed bytes,
+;; widens (zero-extend for operand 1, sign-extend for operand 2),
+;; multiplies, and combines the two product vectors with ss_plus.
+(define_insn "avx2_pmaddubsw256"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+       (ss_plus:V16HI
+         (mult:V16HI
+           (zero_extend:V16HI
+             (vec_select:V16QI
+               (match_operand:V32QI 1 "register_operand" "x")
+               (parallel [(const_int 0)
+                          (const_int 2)
+                          (const_int 4)
+                          (const_int 6)
+                          (const_int 8)
+                          (const_int 10)
+                          (const_int 12)
+                          (const_int 14)
+                          (const_int 16)
+                          (const_int 18)
+                          (const_int 20)
+                          (const_int 22)
+                          (const_int 24)
+                          (const_int 26)
+                          (const_int 28)
+                          (const_int 30)])))
+           (sign_extend:V16HI
+             (vec_select:V16QI
+               (match_operand:V32QI 2 "nonimmediate_operand" "xm")
+               (parallel [(const_int 0)
+                          (const_int 2)
+                          (const_int 4)
+                          (const_int 6)
+                          (const_int 8)
+                          (const_int 10)
+                          (const_int 12)
+                          (const_int 14)
+                          (const_int 16)
+                          (const_int 18)
+                          (const_int 20)
+                          (const_int 22)
+                          (const_int 24)
+                          (const_int 26)
+                          (const_int 28)
+                          (const_int 30)]))))
+         (mult:V16HI
+           (zero_extend:V16HI
+             (vec_select:V16QI (match_dup 1)
+               (parallel [(const_int 1)
+                          (const_int 3)
+                          (const_int 5)
+                          (const_int 7)
+                          (const_int 9)
+                          (const_int 11)
+                          (const_int 13)
+                          (const_int 15)
+                          (const_int 17)
+                          (const_int 19)
+                          (const_int 21)
+                          (const_int 23)
+                          (const_int 25)
+                          (const_int 27)
+                          (const_int 29)
+                          (const_int 31)])))
+           (sign_extend:V16HI
+             (vec_select:V16QI (match_dup 2)
+               (parallel [(const_int 1)
+                          (const_int 3)
+                          (const_int 5)
+                          (const_int 7)
+                          (const_int 9)
+                          (const_int 11)
+                          (const_int 13)
+                          (const_int 15)
+                          (const_int 17)
+                          (const_int 19)
+                          (const_int 21)
+                          (const_int 23)
+                          (const_int 25)
+                          (const_int 27)
+                          (const_int 29)
+                          (const_int 31)]))))))]
+  "TARGET_AVX2"
+  "vpmaddubsw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "ssse3_pmaddubsw128"
   [(set (match_operand:V8HI 0 "register_operand" "=x,x")
        (ss_plus:V8HI
   [(set_attr "type" "sseiadd")
    (set_attr "atom_unit" "simul")
    (set_attr "prefix_extra" "1")
-   (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
-   (set_attr "mode" "DI")])
+   (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
+   (set_attr "mode" "DI")])
+
+;; AVX2 vpmulhrsw expander: per-word ((a * b) >> 14 + 1) >> 1, i.e. a
+;; signed high-half multiply with rounding.  The expander canonicalizes
+;; the commutative operands before matching.
+;; NOTE(review): the "u" in the pattern name suggests unsigned, but the
+;; RTL sign-extends both inputs (vpmulhrsw is a signed operation) --
+;; confirm the name is intentional.  The rounding const_vector is in
+;; V16HI mode inside a V16SI plus -- confirm intended mode.
+(define_expand "avx2_umulhrswv16hi3"
+  [(set (match_operand:V16HI 0 "register_operand" "")
+       (truncate:V16HI
+         (lshiftrt:V16SI
+           (plus:V16SI
+             (lshiftrt:V16SI
+               (mult:V16SI
+                 (sign_extend:V16SI
+                   (match_operand:V16HI 1 "nonimmediate_operand" ""))
+                 (sign_extend:V16SI
+                   (match_operand:V16HI 2 "nonimmediate_operand" "")))
+               (const_int 14))
+             (const_vector:V16HI [(const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)]))
+           (const_int 1))))]
+  "TARGET_AVX2"
+  "ix86_fixup_binary_operands_no_copy (MULT, V16HImode, operands);")
+
+;; Matcher for the expander above; emits vpmulhrsw.  Operand 1 is
+;; commutative ("%x") and operand 2 may be memory ("xm"); the insn
+;; condition re-checks operand legality after RTL transformations.
+(define_insn "*avx2_umulhrswv16hi3"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+       (truncate:V16HI
+         (lshiftrt:V16SI
+           (plus:V16SI
+             (lshiftrt:V16SI
+               (mult:V16SI
+                 (sign_extend:V16SI
+                   (match_operand:V16HI 1 "nonimmediate_operand" "%x"))
+                 (sign_extend:V16SI
+                   (match_operand:V16HI 2 "nonimmediate_operand" "xm")))
+               (const_int 14))
+             (const_vector:V16HI [(const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)
+                                  (const_int 1) (const_int 1)]))
+           (const_int 1))))]
+  "TARGET_AVX2 && ix86_binary_operator_ok (MULT, V16HImode, operands)"
+  "vpmulhrsw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseimul")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
 
 (define_expand "ssse3_pmulhrswv8hi3"
   [(set (match_operand:V8HI 0 "register_operand" "")
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
    (set_attr "mode" "DI")])
 
-(define_insn "ssse3_pshufbv16qi3"
-  [(set (match_operand:V16QI 0 "register_operand" "=x,x")
-       (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0,x")
-                      (match_operand:V16QI 2 "nonimmediate_operand" "xm,xm")]
-                     UNSPEC_PSHUFB))]
+(define_insn "<ssse3_avx2>_pshufb<mode>3"
+  [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
+       (unspec:VI1_AVX2 [(match_operand:VI1_AVX2 1 "register_operand" "0,x")
+                         (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,xm")]
+                        UNSPEC_PSHUFB))]
   "TARGET_SSSE3"
   "@
    pshufb\t{%2, %0|%0, %2}
    (set_attr "prefix_data16" "1,*")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "orig,vex")
-   (set_attr "mode" "TI")])
+   (set_attr "mode" "<sseinsnmode>")])
 
 (define_insn "ssse3_pshufbv8qi3"
   [(set (match_operand:V8QI 0 "register_operand" "=y")
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
    (set_attr "mode" "DI")])
 
-(define_insn "ssse3_psign<mode>3"
-  [(set (match_operand:VI124_128 0 "register_operand" "=x,x")
-       (unspec:VI124_128
-         [(match_operand:VI124_128 1 "register_operand" "0,x")
-          (match_operand:VI124_128 2 "nonimmediate_operand" "xm,xm")]
+(define_insn "<ssse3_avx2>_psign<mode>3"
+  [(set (match_operand:VI124_AVX2 0 "register_operand" "=x,x")
+       (unspec:VI124_AVX2
+         [(match_operand:VI124_AVX2 1 "register_operand" "0,x")
+          (match_operand:VI124_AVX2 2 "nonimmediate_operand" "xm,xm")]
          UNSPEC_PSIGN))]
   "TARGET_SSSE3"
   "@
    (set_attr "prefix_data16" "1,*")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "orig,vex")
-   (set_attr "mode" "TI")])
+   (set_attr "mode" "<sseinsnmode>")])
 
 (define_insn "ssse3_psign<mode>3"
   [(set (match_operand:MMXMODEI 0 "register_operand" "=y")
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
    (set_attr "mode" "DI")])
 
-(define_insn "ssse3_palignrti"
-  [(set (match_operand:TI 0 "register_operand" "=x,x")
-       (unspec:TI [(match_operand:TI 1 "register_operand" "0,x")
-                   (match_operand:TI 2 "nonimmediate_operand" "xm,xm")
-                   (match_operand:SI 3 "const_0_to_255_mul_8_operand" "n,n")]
-                  UNSPEC_PALIGNR))]
+(define_insn "<ssse3_avx2>_palignr<mode>"
+  [(set (match_operand:SSESCALARMODE 0 "register_operand" "=x,x")
+       (unspec:SSESCALARMODE [(match_operand:SSESCALARMODE 1 "register_operand" "0,x")
+                              (match_operand:SSESCALARMODE 2 "nonimmediate_operand" "xm,xm")
+                              (match_operand:SI 3 "const_0_to_255_mul_8_operand" "n,n")]
+                             UNSPEC_PALIGNR))]
   "TARGET_SSSE3"
 {
   operands[3] = GEN_INT (INTVAL (operands[3]) / 8);
    (set_attr "prefix_extra" "1")
    (set_attr "length_immediate" "1")
    (set_attr "prefix" "orig,vex")
-   (set_attr "mode" "TI")])
+   (set_attr "mode" "<sseinsnmode>")])
 
 (define_insn "ssse3_palignrdi"
   [(set (match_operand:DI 0 "register_operand" "=y")
    (set_attr "mode" "DI")])
 
 (define_insn "abs<mode>2"
-  [(set (match_operand:VI124_128 0 "register_operand" "=x")
-       (abs:VI124_128
-         (match_operand:VI124_128 1 "nonimmediate_operand" "xm")))]
+  [(set (match_operand:VI124_AVX2 0 "register_operand" "=x")
+       (abs:VI124_AVX2
+         (match_operand:VI124_AVX2 1 "nonimmediate_operand" "xm")))]
   "TARGET_SSSE3"
   "%vpabs<ssemodesuffix>\t{%1, %0|%0, %1}"
   [(set_attr "type" "sselog1")
    (set_attr "prefix_data16" "1")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "maybe_vex")
-   (set_attr "mode" "TI")])
+   (set_attr "mode" "<sseinsnmode>")])
 
 (define_insn "abs<mode>2"
   [(set (match_operand:MMXMODEI 0 "register_operand" "=y")
   [(set (match_operand:MODEF 0 "memory_operand" "=m")
        (unspec:MODEF
          [(match_operand:MODEF 1 "register_operand" "x")]
-          UNSPEC_MOVNT))]
+         UNSPEC_MOVNT))]
   "TARGET_SSE4A"
   "movnt<ssemodesuffix>\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssemov")
 
 (define_insn "sse4a_extrqi"
   [(set (match_operand:V2DI 0 "register_operand" "=x")
-        (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
-                      (match_operand 2 "const_0_to_255_operand" "")
-                      (match_operand 3 "const_0_to_255_operand" "")]
-                     UNSPEC_EXTRQI))]
+       (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
+                     (match_operand 2 "const_0_to_255_operand" "")
+                     (match_operand 3 "const_0_to_255_operand" "")]
+                    UNSPEC_EXTRQI))]
   "TARGET_SSE4A"
   "extrq\t{%3, %2, %0|%0, %2, %3}"
   [(set_attr "type" "sse")
 
 (define_insn "sse4a_extrq"
   [(set (match_operand:V2DI 0 "register_operand" "=x")
-        (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
-                      (match_operand:V16QI 2 "register_operand" "x")]
-                     UNSPEC_EXTRQ))]
+       (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
+                     (match_operand:V16QI 2 "register_operand" "x")]
+                    UNSPEC_EXTRQ))]
   "TARGET_SSE4A"
   "extrq\t{%2, %0|%0, %2}"
   [(set_attr "type" "sse")
 
 (define_insn "sse4a_insertqi"
   [(set (match_operand:V2DI 0 "register_operand" "=x")
-        (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
-                     (match_operand:V2DI 2 "register_operand" "x")
-                      (match_operand 3 "const_0_to_255_operand" "")
-                      (match_operand 4 "const_0_to_255_operand" "")]
-                     UNSPEC_INSERTQI))]
+       (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
+                     (match_operand:V2DI 2 "register_operand" "x")
+                     (match_operand 3 "const_0_to_255_operand" "")
+                     (match_operand 4 "const_0_to_255_operand" "")]
+                    UNSPEC_INSERTQI))]
   "TARGET_SSE4A"
   "insertq\t{%4, %3, %2, %0|%0, %2, %3, %4}"
   [(set_attr "type" "sseins")
 
 (define_insn "sse4a_insertq"
   [(set (match_operand:V2DI 0 "register_operand" "=x")
-        (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
-                     (match_operand:V2DI 2 "register_operand" "x")]
-                    UNSPEC_INSERTQ))]
+       (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
+                     (match_operand:V2DI 2 "register_operand" "x")]
+                    UNSPEC_INSERTQ))]
   "TARGET_SSE4A"
   "insertq\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseins")
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "sse4_1_movntdqa"
-  [(set (match_operand:V2DI 0 "register_operand" "=x")
-       (unspec:V2DI [(match_operand:V2DI 1 "memory_operand" "m")]
+(define_insn "<sse4_1_avx2>_movntdqa"
+  [(set (match_operand:VI8_AVX2 0 "register_operand" "=x")
+       (unspec:VI8_AVX2 [(match_operand:VI8_AVX2 1 "memory_operand" "m")]
                     UNSPEC_MOVNTDQA))]
   "TARGET_SSE4_1"
   "%vmovntdqa\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssemov")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "maybe_vex")
-   (set_attr "mode" "TI")])
+   (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn "sse4_1_mpsadbw"
-  [(set (match_operand:V16QI 0 "register_operand" "=x,x")
-       (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0,x")
-                      (match_operand:V16QI 2 "nonimmediate_operand" "xm,xm")
-                      (match_operand:SI 3 "const_0_to_255_operand" "n,n")]
-                     UNSPEC_MPSADBW))]
+(define_insn "<sse4_1_avx2>_mpsadbw"
+  [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
+       (unspec:VI1_AVX2 [(match_operand:VI1_AVX2 1 "register_operand" "0,x")
+                         (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,xm")
+                         (match_operand:SI 3 "const_0_to_255_operand" "n,n")]
+                        UNSPEC_MPSADBW))]
   "TARGET_SSE4_1"
   "@
    mpsadbw\t{%3, %2, %0|%0, %2, %3}
    (set_attr "length_immediate" "1")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "orig,vex")
-   (set_attr "mode" "TI")])
+   (set_attr "mode" "<sseinsnmode>")])
+
+;; vpackusdw (256-bit): unsigned-saturating pack of two V8SI sources
+;; into one V16HI.
+;; NOTE(review): the RTL says low 128 bits come entirely from operand 1
+;; and high 128 bits from operand 2, but hardware vpackusdw interleaves
+;; the two sources per 128-bit lane — confirm no optimizer relies on
+;; this RTL shape (builtin-only use would make it harmless).
+(define_insn "avx2_packusdw"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+       (vec_concat:V16HI
+         (us_truncate:V8HI
+           (match_operand:V8SI 1 "register_operand" "x"))
+         (us_truncate:V8HI
+           (match_operand:V8SI 2 "nonimmediate_operand" "xm"))))]
+  "TARGET_AVX2"
+  "vpackusdw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
 
 (define_insn "sse4_1_packusdw"
   [(set (match_operand:V8HI 0 "register_operand" "=x,x")
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "TI")])
 
-(define_insn "sse4_1_pblendvb"
-  [(set (match_operand:V16QI 0 "reg_not_xmm0_operand" "=x,x")
-       (unspec:V16QI
-         [(match_operand:V16QI 1 "reg_not_xmm0_operand_maybe_avx" "0,x")
-          (match_operand:V16QI 2 "nonimm_not_xmm0_operand_maybe_avx" "xm,xm")
-          (match_operand:V16QI 3 "register_operand" "Yz,x")]
+(define_insn "<sse4_1_avx2>_pblendvb"
+  [(set (match_operand:VI1_AVX2 0 "reg_not_xmm0_operand" "=x,x")
+       (unspec:VI1_AVX2
+         [(match_operand:VI1_AVX2 1 "reg_not_xmm0_operand_maybe_avx"  "0,x")
+          (match_operand:VI1_AVX2 2 "nonimm_not_xmm0_operand_maybe_avx" "xm,xm")
+          (match_operand:VI1_AVX2 3 "register_operand" "Yz,x")]
          UNSPEC_BLENDV))]
   "TARGET_SSE4_1"
   "@
    (set_attr "prefix_extra" "1")
    (set_attr "length_immediate" "*,1")
    (set_attr "prefix" "orig,vex")
-   (set_attr "mode" "TI")])
+   (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn "sse4_1_pblendw"
-  [(set (match_operand:V8HI 0 "register_operand" "=x,x")
-       (vec_merge:V8HI
-         (match_operand:V8HI 2 "nonimmediate_operand" "xm,xm")
-         (match_operand:V8HI 1 "register_operand" "0,x")
+(define_insn "<sse4_1_avx2>_pblendw"
+  [(set (match_operand:VI2_AVX2 0 "register_operand" "=x,x")
+       (vec_merge:VI2_AVX2
+         (match_operand:VI2_AVX2 2 "nonimmediate_operand" "xm,xm")
+         (match_operand:VI2_AVX2 1 "register_operand" "0,x")
          (match_operand:SI 3 "const_0_to_255_operand" "n,n")))]
   "TARGET_SSE4_1"
   "@
    (set_attr "prefix_extra" "1")
    (set_attr "length_immediate" "1")
    (set_attr "prefix" "orig,vex")
-   (set_attr "mode" "TI")])
+   (set_attr "mode" "<sseinsnmode>")])
+
+;; vpblendd: dword blend of operands 1 and 2 selected by imm8 operand 3
+;; (vec_merge: a set bit in the mask picks from operand 2).  Iterates
+;; over VI4_AVX2 (V4SI and V8SI forms).
+(define_insn "avx2_pblendd<mode>"
+  [(set (match_operand:VI4_AVX2 0 "register_operand" "=x")
+       (vec_merge:VI4_AVX2
+         (match_operand:VI4_AVX2 2 "nonimmediate_operand" "xm")
+         (match_operand:VI4_AVX2 1 "register_operand" "x")
+         (match_operand:SI 3 "const_0_to_255_operand" "n")))]
+  "TARGET_AVX2"
+  "vpblendd\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
 
 (define_insn "sse4_1_phminposuw"
   [(set (match_operand:V8HI 0 "register_operand" "=x")
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "TI")])
 
+;; vpmovsxbw / vpmovzxbw: sign/zero extend 16 bytes to 16 words
+;; (any_extend iterator selects the <extsuffix> variant).
+(define_insn "avx2_<code>v16qiv16hi2"
+  [(set (match_operand:V16HI 0 "register_operand" "=x")
+       (any_extend:V16HI
+         (match_operand:V16QI 1 "nonimmediate_operand" "xm")))]
+  "TARGET_AVX2"
+  "vpmov<extsuffix>bw\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "sse4_1_<code>v8qiv8hi2"
   [(set (match_operand:V8HI 0 "register_operand" "=x")
        (any_extend:V8HI
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "TI")])
 
+;; vpmovsxbd / vpmovzxbd: extend the low 8 bytes of a V16QI source to
+;; 8 dwords; the vec_select picks elements 0..7 explicitly.
+(define_insn "avx2_<code>v8qiv8si2"
+  [(set (match_operand:V8SI 0 "register_operand" "=x")
+       (any_extend:V8SI
+         (vec_select:V8QI
+           (match_operand:V16QI 1 "nonimmediate_operand" "xm")
+           (parallel [(const_int 0)
+                      (const_int 1)
+                      (const_int 2)
+                      (const_int 3)
+                      (const_int 4)
+                      (const_int 5)
+                      (const_int 6)
+                      (const_int 7)]))))]
+  "TARGET_AVX2"
+  "vpmov<extsuffix>bd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "sse4_1_<code>v4qiv4si2"
   [(set (match_operand:V4SI 0 "register_operand" "=x")
        (any_extend:V4SI
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "TI")])
 
+;; vpmovsxwd / vpmovzxwd: extend 8 words to 8 dwords.
+(define_insn "avx2_<code>v8hiv8si2"
+  [(set (match_operand:V8SI 0 "register_operand" "=x")
+       (any_extend:V8SI
+           (match_operand:V8HI 1 "nonimmediate_operand" "xm")))]
+  "TARGET_AVX2"
+  "vpmov<extsuffix>wd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "sse4_1_<code>v4hiv4si2"
   [(set (match_operand:V4SI 0 "register_operand" "=x")
        (any_extend:V4SI
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "TI")])
 
+;; vpmovsxbq / vpmovzxbq: extend the low 4 bytes of a V16QI source to
+;; 4 qwords.
+(define_insn "avx2_<code>v4qiv4di2"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+       (any_extend:V4DI
+         (vec_select:V4QI
+           (match_operand:V16QI 1 "nonimmediate_operand" "xm")
+           (parallel [(const_int 0)
+                      (const_int 1)
+                      (const_int 2)
+                      (const_int 3)]))))]
+  "TARGET_AVX2"
+  "vpmov<extsuffix>bq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "sse4_1_<code>v2qiv2di2"
   [(set (match_operand:V2DI 0 "register_operand" "=x")
        (any_extend:V2DI
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "TI")])
 
+;; vpmovsxwq / vpmovzxwq: extend the low 4 words of a V8HI source to
+;; 4 qwords.
+(define_insn "avx2_<code>v4hiv4di2"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+       (any_extend:V4DI
+         (vec_select:V4HI
+           (match_operand:V8HI 1 "nonimmediate_operand" "xm")
+           (parallel [(const_int 0)
+                      (const_int 1)
+                      (const_int 2)
+                      (const_int 3)]))))]
+  "TARGET_AVX2"
+  "vpmov<extsuffix>wq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "sse4_1_<code>v2hiv2di2"
   [(set (match_operand:V2DI 0 "register_operand" "=x")
        (any_extend:V2DI
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "TI")])
 
+;; vpmovsxdq / vpmovzxdq: extend 4 dwords to 4 qwords.
+;; Fix: add the missing (set_attr "prefix" "vex") — every sibling
+;; AVX2 vpmov pattern above sets it; without it the default prefix
+;; attribute is used and insn length/encoding bookkeeping is wrong
+;; for this VEX-only instruction.
+(define_insn "avx2_<code>v4siv4di2"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+       (any_extend:V4DI
+           (match_operand:V4SI 1 "nonimmediate_operand" "xm")))]
+  "TARGET_AVX2"
+  "vpmov<extsuffix>dq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "sse4_1_<code>v2siv2di2"
   [(set (match_operand:V2DI 0 "register_operand" "=x")
        (any_extend:V2DI
 ;; do not allow the value being added to be a memory operation.
 (define_insn "xop_pmacsww"
   [(set (match_operand:V8HI 0 "register_operand" "=x")
-        (plus:V8HI
+       (plus:V8HI
         (mult:V8HI
          (match_operand:V8HI 1 "nonimmediate_operand" "%x")
          (match_operand:V8HI 2 "nonimmediate_operand" "xm"))
 
 (define_insn "xop_pmacssww"
   [(set (match_operand:V8HI 0 "register_operand" "=x")
-        (ss_plus:V8HI
+       (ss_plus:V8HI
         (mult:V8HI (match_operand:V8HI 1 "nonimmediate_operand" "%x")
                    (match_operand:V8HI 2 "nonimmediate_operand" "xm"))
         (match_operand:V8HI 3 "nonimmediate_operand" "x")))]
 
 (define_insn "xop_pmacsdd"
   [(set (match_operand:V4SI 0 "register_operand" "=x")
-        (plus:V4SI
+       (plus:V4SI
         (mult:V4SI
          (match_operand:V4SI 1 "nonimmediate_operand" "%x")
          (match_operand:V4SI 2 "nonimmediate_operand" "xm"))
 
 (define_insn "xop_pmacssdd"
   [(set (match_operand:V4SI 0 "register_operand" "=x")
-        (ss_plus:V4SI
+       (ss_plus:V4SI
         (mult:V4SI (match_operand:V4SI 1 "nonimmediate_operand" "%x")
                    (match_operand:V4SI 2 "nonimmediate_operand" "xm"))
         (match_operand:V4SI 3 "nonimmediate_operand" "x")))]
       int i;
 
       if (GET_MODE (op2) != <ssescalarmode>mode)
-        {
+       {
          op2 = gen_reg_rtx (<ssescalarmode>mode);
          convert_move (op2, operands[2], false);
        }
       int i;
 
       if (GET_MODE (op2) != <ssescalarmode>mode)
-        {
+       {
          op2 = gen_reg_rtx (<ssescalarmode>mode);
          convert_move (op2, operands[2], false);
        }
    (set_attr "prefix" "vex")
    (set_attr "mode" "OI")])
 
+;; Map each integer vector mode to the 128-bit mode holding the scalar
+;; source element for vpbroadcast (256-bit modes map to their SSE half;
+;; 128-bit modes map to themselves).
+(define_mode_attr AVXTOSSEMODE
+  [(V4DI "V2DI") (V2DI "V2DI")
+   (V8SI "V4SI") (V4SI "V4SI")
+   (V16HI "V8HI") (V8HI "V8HI")
+   (V32QI "V16QI") (V16QI "V16QI")])
+
+;; vpbroadcast{b,w,d,q}: replicate element 0 of a 128-bit source
+;; (register or memory) across all elements of the destination.
+(define_insn "avx2_pbroadcast<mode>"
+  [(set (match_operand:VI 0 "register_operand" "=x")
+       (vec_duplicate:VI
+         (vec_select:<ssescalarmode>
+           (match_operand:<AVXTOSSEMODE> 1 "nonimmediate_operand" "xm")
+           (parallel [(const_int 0)]))))]
+  "TARGET_AVX2"
+  "vpbroadcast<ssemodesuffix>\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+;; vpermd: variable dword permute.  Behavior is opaque to the optimizer
+;; (UNSPEC_VPERMSI).
+;; NOTE(review): operand 1 is register-only and operand 2 is the
+;; memory-capable source; which one is the index vector is fixed only
+;; by the builtin expander's argument order — confirm it matches the
+;; hardware's (indices in the first vex source, data in the second).
+(define_insn "avx2_permvarv8si"
+  [(set (match_operand:V8SI 0 "register_operand" "=x")
+       (unspec:V8SI
+         [(match_operand:V8SI 1 "register_operand" "x")
+          (match_operand:V8SI 2 "nonimmediate_operand" "xm")]
+         UNSPEC_VPERMSI))]
+  "TARGET_AVX2"
+  "vpermd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+;; vpermpd: permute the four doubles of operand 1 per imm8 operand 2.
+;; Fix: the constraint "xm" allows a memory source but the predicate
+;; was register_operand, which contradicts it — the constraint must
+;; never be looser than the predicate; use nonimmediate_operand so the
+;; legal reg/mem source form is actually accepted.
+(define_insn "avx2_permv4df"
+  [(set (match_operand:V4DF 0 "register_operand" "=x")
+       (unspec:V4DF
+         [(match_operand:V4DF 1 "nonimmediate_operand" "xm")
+          (match_operand:SI 2 "const_0_to_255_operand" "n")]
+         UNSPEC_VPERMDF))]
+  "TARGET_AVX2"
+  "vpermpd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+;; vpermps: variable single-precision permute (UNSPEC_VPERMSF).
+;; NOTE(review): same operand-role caveat as avx2_permvarv8si — the
+;; index/data assignment of operands 1 and 2 is fixed by the builtin
+;; expander; confirm against hardware operand order.
+(define_insn "avx2_permvarv8sf"
+  [(set (match_operand:V8SF 0 "register_operand" "=x")
+       (unspec:V8SF
+         [(match_operand:V8SF 1 "register_operand" "x")
+          (match_operand:V8SF 2 "nonimmediate_operand" "xm")]
+         UNSPEC_VPERMSF))]
+  "TARGET_AVX2"
+  "vpermps\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+;; vpermq: permute the four qwords of operand 1 per imm8 operand 2.
+;; Fix: constraint "xm" allows memory but the predicate was
+;; register_operand — a predicate/constraint contradiction; use
+;; nonimmediate_operand (same fix as avx2_permv4df, applied
+;; independently here).
+(define_insn "avx2_permv4di"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+       (unspec:V4DI
+         [(match_operand:V4DI 1 "nonimmediate_operand" "xm")
+          (match_operand:SI 2 "const_0_to_255_operand" "n")]
+         UNSPEC_VPERMDI))]
+  "TARGET_AVX2"
+  "vpermq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+;; vperm2i128: select/permute 128-bit halves of operands 1 and 2 per
+;; imm8 operand 3.
+;; Fix: operand 2's constraint "xm" allows memory but its predicate
+;; was register_operand — contradiction; use nonimmediate_operand so
+;; the memory-source form the constraint advertises is reachable.
+(define_insn "avx2_permv2ti"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+       (unspec:V4DI
+         [(match_operand:V4DI 1 "register_operand" "x")
+          (match_operand:V4DI 2 "nonimmediate_operand" "xm")
+          (match_operand:SI 3 "const_0_to_255_operand" "n")]
+         UNSPEC_VPERMTI))]
+  "TARGET_AVX2"
+  "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+;; vbroadcastsd with a register source (AVX2 lifts AVX's memory-only
+;; restriction): replicate element 0 of an xmm V2DF into all four
+;; V4DF elements.
+(define_insn "avx2_vec_dupv4df"
+  [(set (match_operand:V4DF 0 "register_operand" "=x")
+       (vec_duplicate:V4DF
+         (vec_select:DF
+           (match_operand:V2DF 1 "register_operand" "x")
+           (parallel [(const_int 0)]))))]
+  "TARGET_AVX2"
+  "vbroadcastsd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V4DF")])
+
 ;; Modes handled by AVX vec_dup patterns.
 (define_mode_iterator AVX_VEC_DUP_MODE
   [V8SI V8SF V4DI V4DF])
    (set_attr "prefix" "vex")
    (set_attr "mode" "V8SF")])
 
+;; vbroadcasti128: load a 128-bit memory operand and replicate it into
+;; both halves of a 256-bit integer register (memory source only, as
+;; the "m" constraint and memory_operand predicate require).
+(define_insn "avx2_vbroadcasti128_<mode>"
+  [(set (match_operand:VI_256 0 "register_operand" "=x")
+       (vec_concat:VI_256
+         (match_operand:<ssehalfvecmode> 1 "memory_operand" "m")
+         (match_dup 1)))]
+  "TARGET_AVX2"
+  "vbroadcasti128\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_split
   [(set (match_operand:AVX_VEC_DUP_MODE 0 "register_operand" "")
        (vec_duplicate:AVX_VEC_DUP_MODE
     }
 
   operands[1] = adjust_address_nv (op1, <ssescalarmode>mode,
-                                  elt * GET_MODE_SIZE (<ssescalarmode>mode));
+                                  elt * GET_MODE_SIZE (<ssescalarmode>mode));
 })
 
 (define_expand "avx_vpermil<mode>"
   DONE;
 })
 
+;; vinserti128 $0: replace the low 128 bits of operand 1 with operand 2,
+;; keeping elements 2-3 (the high half) of operand 1.
+(define_insn "avx2_vec_set_lo_v4di"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+       (vec_concat:V4DI
+         (match_operand:V2DI 2 "nonimmediate_operand" "xm")
+         (vec_select:V2DI
+           (match_operand:V4DI 1 "register_operand" "x")
+           (parallel [(const_int 2) (const_int 3)]))))]
+  "TARGET_AVX2"
+  "vinserti128\t{$0x0, %2, %1, %0|%0, %1, %2, 0x0}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+;; vinserti128 $1: replace the high 128 bits of operand 1 with operand 2,
+;; keeping elements 0-1 (the low half) of operand 1.
+(define_insn "avx2_vec_set_hi_v4di"
+  [(set (match_operand:V4DI 0 "register_operand" "=x")
+       (vec_concat:V4DI
+         (vec_select:V2DI
+           (match_operand:V4DI 1 "register_operand" "x")
+           (parallel [(const_int 0) (const_int 1)]))
+         (match_operand:V2DI 2 "nonimmediate_operand" "xm")))]
+  "TARGET_AVX2"
+  "vinserti128\t{$0x1, %2, %1, %0|%0, %1, %2, 0x1}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
 (define_insn "vec_set_lo_<mode>"
   [(set (match_operand:VI8F_256 0 "register_operand" "=x")
        (vec_concat:VI8F_256
    (set_attr "prefix" "vex")
    (set_attr "mode" "V8SF")])
 
-(define_expand "avx_maskload<ssemodesuffix><avxsizesuffix>"
-  [(set (match_operand:VF 0 "register_operand" "")
-       (unspec:VF
+;; Masked load, widened from float-only (VF) to V48_AVX2 so the same
+;; expander serves vmaskmovps/pd and AVX2's vpmaskmovd/q; <avx_avx2>
+;; selects the name prefix per mode.
+(define_expand "<avx_avx2>_maskload<ssemodesuffix><avxsizesuffix>"
+  [(set (match_operand:V48_AVX2 0 "register_operand" "")
+       (unspec:V48_AVX2
          [(match_operand:<sseintvecmode> 2 "register_operand" "")
-          (match_operand:VF 1 "memory_operand" "")
+          (match_operand:V48_AVX2 1 "memory_operand" "")
           (match_dup 0)]
          UNSPEC_MASKMOV))]
   "TARGET_AVX")
 
-(define_expand "avx_maskstore<ssemodesuffix><avxsizesuffix>"
-  [(set (match_operand:VF 0 "memory_operand" "")
-       (unspec:VF
+;; Masked store, widened from VF to V48_AVX2 (mirrors the maskload
+;; expander): operand 1 is the mask, operand 2 the data register.
+(define_expand "<avx_avx2>_maskstore<ssemodesuffix><avxsizesuffix>"
+  [(set (match_operand:V48_AVX2 0 "memory_operand" "")
+       (unspec:V48_AVX2
          [(match_operand:<sseintvecmode> 1 "register_operand" "")
-          (match_operand:V48_AVX2 2 "register_operand" "")
+          (match_operand:V48_AVX2 2 "register_operand" "")
           (match_dup 0)]
          UNSPEC_MASKMOV))]
   "TARGET_AVX")
 
+;; vpmaskmov{d,q}: one pattern for both the load form (dst = reg,
+;; operand 2 = mem) and the store form (dst = mem, operand 2 = reg).
+;; The REG_P == MEM_P condition accepts exactly those two shapes and
+;; rejects reg/reg and mem/mem combinations.
+(define_insn "*avx2_maskmov<ssemodesuffix><avxsizesuffix>"
+  [(set (match_operand:VI48_AVX2 0 "nonimmediate_operand" "=x,m")
+       (unspec:VI48_AVX2
+         [(match_operand:<sseintvecmode> 1 "register_operand" "x,x")
+          (match_operand:VI48_AVX2 2 "nonimmediate_operand" "m,x")
+          (match_dup 0)]
+         UNSPEC_MASKMOV))]
+  "TARGET_AVX2
+   && (REG_P (operands[0]) == MEM_P (operands[2]))"
+  "vpmaskmov<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_insn "*avx_maskmov<ssemodesuffix><avxsizesuffix>"
   [(set (match_operand:VF 0 "nonimmediate_operand" "=x,m")
        (unspec:VF
   DONE;
 })
 
+;; vextracti128: extract the 128-bit half of a V4DI selected by the
+;; 0/1 immediate in operand 2.
+(define_insn "avx2_extracti128"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+       (vec_select:V2DI
+         (match_operand:V4DI 1 "nonimmediate_operand" "xm")
+         (parallel [(match_operand:SI 2 "const_0_to_1_operand" "n")])))]
+  "TARGET_AVX2"
+  "vextracti128\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+;; vinserti128 expander: dispatch on the 0/1 immediate in operand 3 to
+;; the vec_set_lo/vec_set_hi insns above (the immediate must be a
+;; compile-time constant, enforced by const_0_to_1_operand).
+(define_expand "avx2_inserti128"
+  [(match_operand:V4DI 0 "register_operand" "")
+   (match_operand:V4DI 1 "register_operand" "")
+   (match_operand:V2DI 2 "nonimmediate_operand" "")
+   (match_operand:SI 3 "const_0_to_1_operand" "")]
+  "TARGET_AVX2"
+{
+  rtx (*insn)(rtx, rtx, rtx);
+
+  switch (INTVAL (operands[3]))
+    {
+    case 0:
+      insn = gen_avx2_vec_set_lo_v4di;
+      break;
+    case 1:
+      insn = gen_avx2_vec_set_hi_v4di;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  emit_insn (insn (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+;; vpsravd (256-bit): per-element variable arithmetic right shift,
+;; result[i] = op1[i] >> op2[i] for i = 0..7.
+;; Fix: the second vec_concat:V4SI (elements 4..7 of the result)
+;; re-selected elements 0..3 of operands 1 and 2, so the RTL described
+;; the high half as a copy of the low-half computation — not what
+;; vpsravd does.  The high half must select elements 4..7.
+(define_insn "avx2_ashrvv8si"
+  [(set (match_operand:V8SI 0 "register_operand" "=x")
+       (vec_concat:V8SI
+         (vec_concat:V4SI
+           (vec_concat:V2SI
+             (ashiftrt:SI
+               (vec_select:SI
+                 (match_operand:V8SI 1 "register_operand" "x")
+                 (parallel [(const_int 0)]))
+               (vec_select:SI
+                 (match_operand:V8SI 2 "nonimmediate_operand" "xm")
+                 (parallel [(const_int 0)])))
+             (ashiftrt:SI
+               (vec_select:SI
+                 (match_dup 1)
+                 (parallel [(const_int 1)]))
+               (vec_select:SI
+                 (match_dup 2)
+                 (parallel [(const_int 1)]))))
+           (vec_concat:V2SI
+             (ashiftrt:SI
+               (vec_select:SI
+                 (match_dup 1)
+                 (parallel [(const_int 2)]))
+               (vec_select:SI
+                 (match_dup 2)
+                 (parallel [(const_int 2)])))
+             (ashiftrt:SI
+               (vec_select:SI
+                 (match_dup 1)
+                 (parallel [(const_int 3)]))
+               (vec_select:SI
+                 (match_dup 2)
+                 (parallel [(const_int 3)])))))
+         (vec_concat:V4SI
+           (vec_concat:V2SI
+             (ashiftrt:SI
+               (vec_select:SI
+                 (match_dup 1)
+                 (parallel [(const_int 4)]))
+               (vec_select:SI
+                 (match_dup 2)
+                 (parallel [(const_int 4)])))
+             (ashiftrt:SI
+               (vec_select:SI
+                 (match_dup 1)
+                 (parallel [(const_int 5)]))
+               (vec_select:SI
+                 (match_dup 2)
+                 (parallel [(const_int 5)]))))
+           (vec_concat:V2SI
+             (ashiftrt:SI
+               (vec_select:SI
+                 (match_dup 1)
+                 (parallel [(const_int 6)]))
+               (vec_select:SI
+                 (match_dup 2)
+                 (parallel [(const_int 6)])))
+             (ashiftrt:SI
+               (vec_select:SI
+                 (match_dup 1)
+                 (parallel [(const_int 7)]))
+               (vec_select:SI
+                 (match_dup 2)
+                 (parallel [(const_int 7)])))))))]
+  "TARGET_AVX2"
+  "vpsravd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseishft")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+;; vpsravd (128-bit): per-element variable arithmetic right shift,
+;; result[i] = op1[i] >> op2[i] for i = 0..3.  VEX-encoded 128-bit
+;; form, hence mode TI.
+(define_insn "avx2_ashrvv4si"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+       (vec_concat:V4SI
+         (vec_concat:V2SI
+           (ashiftrt:SI
+             (vec_select:SI
+               (match_operand:V4SI 1 "register_operand" "x")
+               (parallel [(const_int 0)]))
+             (vec_select:SI
+               (match_operand:V4SI 2 "nonimmediate_operand" "xm")
+               (parallel [(const_int 0)])))
+           (ashiftrt:SI
+             (vec_select:SI
+               (match_dup 1)
+               (parallel [(const_int 1)]))
+             (vec_select:SI
+               (match_dup 2)
+               (parallel [(const_int 1)]))))
+         (vec_concat:V2SI
+           (ashiftrt:SI
+             (vec_select:SI
+               (match_dup 1)
+               (parallel [(const_int 2)]))
+             (vec_select:SI
+               (match_dup 2)
+               (parallel [(const_int 2)])))
+           (ashiftrt:SI
+             (vec_select:SI
+               (match_dup 1)
+               (parallel [(const_int 3)]))
+             (vec_select:SI
+               (match_dup 2)
+               (parallel [(const_int 3)]))))))]
+  "TARGET_AVX2"
+  "vpsravd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseishft")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "TI")])
+
+;; vpsllvd / vpsrlvd (256-bit): per-element variable logical shift via
+;; the <lshift> code iterator, result[i] = op1[i] <lshift> op2[i] for
+;; i = 0..7.
+;; Fix: as in avx2_ashrvv8si, the second vec_concat:V4SI re-selected
+;; elements 0..3 instead of 4..7, misdescribing the high half of the
+;; result; select elements 4..7 there.
+(define_insn "avx2_<lshift>vv8si"
+  [(set (match_operand:V8SI 0 "register_operand" "=x")
+       (vec_concat:V8SI
+         (vec_concat:V4SI
+           (vec_concat:V2SI
+             (lshift:SI
+               (vec_select:SI
+                 (match_operand:V8SI 1 "register_operand" "x")
+                 (parallel [(const_int 0)]))
+               (vec_select:SI
+                 (match_operand:V8SI 2 "nonimmediate_operand" "xm")
+                 (parallel [(const_int 0)])))
+             (lshift:SI
+               (vec_select:SI
+                 (match_dup 1)
+                 (parallel [(const_int 1)]))
+               (vec_select:SI
+                 (match_dup 2)
+                 (parallel [(const_int 1)]))))
+           (vec_concat:V2SI
+             (lshift:SI
+               (vec_select:SI
+                 (match_dup 1)
+                 (parallel [(const_int 2)]))
+               (vec_select:SI
+                 (match_dup 2)
+                 (parallel [(const_int 2)])))
+             (lshift:SI
+               (vec_select:SI
+                 (match_dup 1)
+                 (parallel [(const_int 3)]))
+               (vec_select:SI
+                 (match_dup 2)
+                 (parallel [(const_int 3)])))))
+         (vec_concat:V4SI
+           (vec_concat:V2SI
+             (lshift:SI
+               (vec_select:SI
+                 (match_dup 1)
+                 (parallel [(const_int 4)]))
+               (vec_select:SI
+                 (match_dup 2)
+                 (parallel [(const_int 4)])))
+             (lshift:SI
+               (vec_select:SI
+                 (match_dup 1)
+                 (parallel [(const_int 5)]))
+               (vec_select:SI
+                 (match_dup 2)
+                 (parallel [(const_int 5)]))))
+           (vec_concat:V2SI
+             (lshift:SI
+               (vec_select:SI
+                 (match_dup 1)
+                 (parallel [(const_int 6)]))
+               (vec_select:SI
+                 (match_dup 2)
+                 (parallel [(const_int 6)])))
+             (lshift:SI
+               (vec_select:SI
+                 (match_dup 1)
+                 (parallel [(const_int 7)]))
+               (vec_select:SI
+                 (match_dup 2)
+                 (parallel [(const_int 7)])))))))]
+  "TARGET_AVX2"
+  "vp<lshift_insn>vd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseishft")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "OI")])
+
+;; Variable per-element shift for the 4-element VI4SD_AVX2 modes: element i
+;; of operand 1 is shifted by the count in element i of operand 2
+;; (vp<lshift_insn>v<ssemodesuffix>).
+(define_insn "avx2_<lshift>v<mode>"
+  [(set (match_operand:VI4SD_AVX2 0 "register_operand" "=x")
+       (vec_concat:VI4SD_AVX2
+         (vec_concat:<ssehalfvecmode>
+           (lshift:<ssescalarmode>
+             (vec_select:<ssescalarmode>
+               (match_operand:VI4SD_AVX2 1 "register_operand" "x")
+               (parallel [(const_int 0)]))
+             (vec_select:<ssescalarmode>
+               (match_operand:VI4SD_AVX2 2 "nonimmediate_operand" "xm")
+               (parallel [(const_int 0)])))
+           (lshift:<ssescalarmode>
+             (vec_select:<ssescalarmode>
+               (match_dup 1)
+               (parallel [(const_int 1)]))
+             (vec_select:<ssescalarmode>
+               (match_dup 2)
+               (parallel [(const_int 1)]))))
+         ;; Elements 2 and 3 of the 4-element vector.
+         (vec_concat:<ssehalfvecmode>
+           (lshift:<ssescalarmode>
+             (vec_select:<ssescalarmode>
+               (match_dup 1)
+               (parallel [(const_int 2)]))
+             (vec_select:<ssescalarmode>
+               (match_dup 2)
+               (parallel [(const_int 2)])))
+           (lshift:<ssescalarmode>
+             (vec_select:<ssescalarmode>
+               (match_dup 1)
+               (parallel [(const_int 3)]))
+             (vec_select:<ssescalarmode>
+               (match_dup 2)
+               (parallel [(const_int 3)]))))))]
+  "TARGET_AVX2"
+  "vp<lshift_insn>v<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseishft")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+;; Variable per-element shift of V2DI (vp<lshift_insn>vq, 128-bit form):
+;; each quadword of operand 1 is shifted by the count in the corresponding
+;; quadword of operand 2.
+(define_insn "avx2_<lshift>vv2di"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+       (vec_concat:V2DI
+         (lshift:DI
+           (vec_select:DI
+             (match_operand:V2DI 1 "register_operand" "x")
+             (parallel [(const_int 0)]))
+           (vec_select:DI
+             (match_operand:V2DI 2 "nonimmediate_operand" "xm")
+             (parallel [(const_int 0)])))
+         (lshift:DI
+           (vec_select:DI
+             (match_dup 1)
+             (parallel [(const_int 1)]))
+           (vec_select:DI
+             (match_dup 2)
+             (parallel [(const_int 1)])))))]
+  "TARGET_AVX2"
+  "vp<lshift_insn>vq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sseishft")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "TI")])
+
 (define_insn "*vec_concat<mode>_avx"
   [(set (match_operand:V_256 0 "register_operand" "=x,x")
        (vec_concat:V_256
       return "vinsertf128\t{$0x1, %2, %t1, %0|%0, %t1, %2, 0x1}";
     case 1:
       switch (get_attr_mode (insn))
-        {
+       {
        case MODE_V8SF:
          return "vmovaps\t{%1, %x0|%x0, %1}";
        case MODE_V4DF:
   [(set_attr "type" "ssecvt")
    (set_attr "prefix" "vex")
    (set_attr "mode" "V8SF")])
+
+;; For gather* insn patterns
+(define_mode_iterator VEC_GATHER_MODE
+                     [V2DI V2DF V4DI V4DF V4SI V4SF V8SI V8SF])
+;; Maps each gather data mode to the mode of its SI-element index vector.
+;; NOTE(review): this mode_attr reuses the iterator's name, so
+;; <VEC_GATHER_MODE> expands to the per-mode index mode inside patterns
+;; that iterate over VEC_GATHER_MODE -- confirm this aliasing is intended.
+(define_mode_attr VEC_GATHER_MODE
+                     [(V2DI "V4SI") (V2DF "V4SI")
+                      (V4DI "V4SI") (V4DF "V4SI")
+                      (V4SI "V4SI") (V4SF "V4SI")
+                      (V8SI "V8SI") (V8SF "V8SI")])
+
+;; Expander for SI-indexed gathers.  Operands: 0 = destination,
+;; 1 = source/merge vector, 2 = base memory reference, 3 = index vector
+;; (SI elements), 4 = mask vector, 5 = scale (1, 2, 4 or 8).
+(define_expand "avx2_gathersi<mode>"
+  [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "")
+       (unspec:VEC_GATHER_MODE
+         [(match_operand:VEC_GATHER_MODE 1 "register_operand" "")
+          (match_operand:<ssescalarmode> 2 "memory_operand" "")
+          (match_operand:<VEC_GATHER_MODE> 3 "register_operand" "")
+          (match_operand:VEC_GATHER_MODE 4 "register_operand" "")
+          ;; Predicate name must match exactly; drop stray trailing space.
+          (match_operand:SI 5 "const1248_operand" "")]
+         UNSPEC_GATHER))]
+  "TARGET_AVX2")
+
+;; SI-indexed gather insn: base address in a single register (operand 2),
+;; scaled by the immediate in operand 5.  The mnemonic pieces
+;; <gthrfirstp>/<gthrlastp> (defined elsewhere) select the
+;; vgatherd*/vpgatherd* variant for the element type.
+(define_insn "*avx2_gathersi<mode>"
+  [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=x")
+       (unspec:VEC_GATHER_MODE
+         [(match_operand:VEC_GATHER_MODE 1 "register_operand" "0")
+          (mem:<ssescalarmode>
+            (match_operand:P 2 "register_operand" "r"))
+          (match_operand:<VEC_GATHER_MODE> 3 "register_operand" "x")
+          (match_operand:VEC_GATHER_MODE 4 "register_operand" "x")
+          (match_operand:SI 5 "const1248_operand" "n")]
+         UNSPEC_GATHER))]
+  "TARGET_AVX2"
+  "v<gthrfirstp>gatherd<gthrlastp>\t{%4, (%2, %3, %c5), %0|%0, (%2, %3, %c5), %4}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+;; Expander for DI-indexed gathers.  Operands: 0 = destination,
+;; 1 = source/merge vector, 2 = base memory reference, 3 = index vector
+;; (DI elements), 4 = mask vector, 5 = scale (1, 2, 4 or 8).
+(define_expand "avx2_gatherdi<mode>"
+  [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "")
+       (unspec:VEC_GATHER_MODE
+         [(match_operand:VEC_GATHER_MODE 1 "register_operand" "")
+          (match_operand:<ssescalarmode> 2 "memory_operand" "")
+          (match_operand:<AVXMODE48P_DI> 3 "register_operand" "")
+          (match_operand:VEC_GATHER_MODE 4 "register_operand" "")
+          ;; Predicate name must match exactly; drop stray trailing space.
+          (match_operand:SI 5 "const1248_operand" "")]
+         UNSPEC_GATHER))]
+  "TARGET_AVX2")
+
+;; DI-indexed gather insn.  NOTE(review): the expander above iterates over
+;; VEC_GATHER_MODE while this insn iterates over AVXMODE48P_DI -- confirm
+;; the two iterators cover the same set of modes, or the expander may
+;; generate RTL with no matching insn.
+(define_insn "*avx2_gatherdi<mode>"
+  [(set (match_operand:AVXMODE48P_DI 0 "register_operand" "=x")
+       (unspec:AVXMODE48P_DI
+         [(match_operand:AVXMODE48P_DI 1 "register_operand" "0")
+          (mem:<ssescalarmode>
+            (match_operand:P 2 "register_operand" "r"))
+          (match_operand:<AVXMODE48P_DI> 3 "register_operand" "x")
+          (match_operand:AVXMODE48P_DI 4 "register_operand" "x")
+          (match_operand:SI 5 "const1248_operand" "n")]
+         UNSPEC_GATHER))]
+  "TARGET_AVX2"
+  "v<gthrfirstp>gatherq<gthrlastp>\t{%4, (%2, %3, %c5), %0|%0, (%2, %3, %c5), %4}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+;; Special handling for VEX.256 with float arguments
+;; since there're still xmms as operands
+;; Operands as in avx2_gatherdi<mode>, but data/mask are 128-bit vectors
+;; (VI4F_128) while the index vector is V4DI.
+(define_expand "avx2_gatherdi<mode>256"
+  [(set (match_operand:VI4F_128 0 "register_operand" "")
+       (unspec:VI4F_128
+         [(match_operand:VI4F_128 1 "register_operand" "")
+          (match_operand:<ssescalarmode> 2 "memory_operand" "")
+          (match_operand:V4DI 3 "register_operand" "")
+          (match_operand:VI4F_128 4 "register_operand" "")
+          ;; Predicate name must match exactly; drop stray trailing space.
+          (match_operand:SI 5 "const1248_operand" "")]
+         UNSPEC_GATHER))]
+  "TARGET_AVX2")
+
+;; VEX.256 DI-indexed gather with 128-bit data: destination, merge and
+;; mask operands are xmm (VI4F_128) while the index operand is V4DI.
+(define_insn "*avx2_gatherdi<mode>256"
+  [(set (match_operand:VI4F_128 0 "register_operand" "=x")
+       (unspec:VI4F_128
+         [(match_operand:VI4F_128 1 "register_operand" "0")
+          (mem:<ssescalarmode>
+            (match_operand:P 2 "register_operand" "r"))
+          (match_operand:V4DI 3 "register_operand" "x")
+          (match_operand:VI4F_128 4 "register_operand" "x")
+          (match_operand:SI 5 "const1248_operand" "n")]
+         UNSPEC_GATHER))]
+  "TARGET_AVX2"
+  "v<gthrfirstp>gatherq<gthrlastp>\t{%4, (%2, %3, %c5), %0|%0, (%2, %3, %c5), %4}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
index cf7fdbf..29c02b8 100644 (file)
@@ -9455,6 +9455,184 @@ v4df __builtin_ia32_xorpd256 (v4df,v4df)
 v8sf __builtin_ia32_xorps256 (v8sf,v8sf)
 @end smallexample
 
+The following built-in functions are available when @option{-mavx2} is
+used. All of them generate the machine instruction that is part of the
+name.
+
+@smallexample
+v32qi __builtin_ia32_mpsadbw256 (v32qi,v32qi,v32qi,int)
+v32qi __builtin_ia32_pabsb256 (v32qi)
+v16hi __builtin_ia32_pabsw256 (v16hi)
+v8si __builtin_ia32_pabsd256 (v8si)
+v16hi __builtin_ia32_packssdw256 (v8si,v8si)
+v32qi __builtin_ia32_packsswb256 (v16hi,v16hi)
+v16hi __builtin_ia32_packusdw256 (v8si,v8si)
+v32qi __builtin_ia32_packuswb256 (v16hi,v16hi)
+v32qi __builtin_ia32_paddb256 (v32qi,v32qi)
+v16hi __builtin_ia32_paddw256 (v16hi,v16hi)
+v8si __builtin_ia32_paddd256 (v8si,v8si)
+v4di __builtin_ia32_paddq256 (v4di,v4di)
+v32qi __builtin_ia32_paddsb256 (v32qi,v32qi)
+v16hi __builtin_ia32_paddsw256 (v16hi,v16hi)
+v32qi __builtin_ia32_paddusb256 (v32qi,v32qi)
+v16hi __builtin_ia32_paddusw256 (v16hi,v16hi)
+v4di __builtin_ia32_palignr256 (v4di,v4di,int)
+v4di __builtin_ia32_andsi256 (v4di,v4di)
+v4di __builtin_ia32_andnotsi256 (v4di,v4di)
+v32qi __builtin_ia32_pavgb256 (v32qi,v32qi)
+v16hi __builtin_ia32_pavgw256 (v16hi,v16hi)
+v32qi __builtin_ia32_pblendvb256 (v32qi,v32qi,v32qi)
+v16hi __builtin_ia32_pblendw256 (v16hi,v16hi,int)
+v32qi __builtin_ia32_pcmpeqb256 (v32qi,v32qi)
+v16hi __builtin_ia32_pcmpeqw256 (v16hi,v16hi)
+v8si __builtin_ia32_pcmpeqd256 (v8si,v8si)
+v4di __builtin_ia32_pcmpeqq256 (v4di,v4di)
+v32qi __builtin_ia32_pcmpgtb256 (v32qi,v32qi)
+v16hi __builtin_ia32_pcmpgtw256 (v16hi,v16hi)
+v8si __builtin_ia32_pcmpgtd256 (v8si,v8si)
+v4di __builtin_ia32_pcmpgtq256 (v4di,v4di)
+v16hi __builtin_ia32_phaddw256 (v16hi,v16hi)
+v8si __builtin_ia32_phaddd256 (v8si,v8si)
+v16hi __builtin_ia32_phaddsw256 (v16hi,v16hi)
+v16hi __builtin_ia32_phsubw256 (v16hi,v16hi)
+v8si __builtin_ia32_phsubd256 (v8si,v8si)
+v16hi __builtin_ia32_phsubsw256 (v16hi,v16hi)
+v32qi __builtin_ia32_pmaddubsw256 (v32qi,v32qi)
+v16hi __builtin_ia32_pmaddwd256 (v16hi,v16hi)
+v32qi __builtin_ia32_pmaxsb256 (v32qi,v32qi)
+v16hi __builtin_ia32_pmaxsw256 (v16hi,v16hi)
+v8si __builtin_ia32_pmaxsd256 (v8si,v8si)
+v32qi __builtin_ia32_pmaxub256 (v32qi,v32qi)
+v16hi __builtin_ia32_pmaxuw256 (v16hi,v16hi)
+v8si __builtin_ia32_pmaxud256 (v8si,v8si)
+v32qi __builtin_ia32_pminsb256 (v32qi,v32qi)
+v16hi __builtin_ia32_pminsw256 (v16hi,v16hi)
+v8si __builtin_ia32_pminsd256 (v8si,v8si)
+v32qi __builtin_ia32_pminub256 (v32qi,v32qi)
+v16hi __builtin_ia32_pminuw256 (v16hi,v16hi)
+v8si __builtin_ia32_pminud256 (v8si,v8si)
+int __builtin_ia32_pmovmskb256 (v32qi)
+v16hi __builtin_ia32_pmovsxbw256 (v16qi)
+v8si __builtin_ia32_pmovsxbd256 (v16qi)
+v4di __builtin_ia32_pmovsxbq256 (v16qi)
+v8si __builtin_ia32_pmovsxwd256 (v8hi)
+v4di __builtin_ia32_pmovsxwq256 (v8hi)
+v4di __builtin_ia32_pmovsxdq256 (v4si)
+v16hi __builtin_ia32_pmovzxbw256 (v16qi)
+v8si __builtin_ia32_pmovzxbd256 (v16qi)
+v4di __builtin_ia32_pmovzxbq256 (v16qi)
+v8si __builtin_ia32_pmovzxwd256 (v8hi)
+v4di __builtin_ia32_pmovzxwq256 (v8hi)
+v4di __builtin_ia32_pmovzxdq256 (v4si)
+v4di __builtin_ia32_pmuldq256 (v8si,v8si)
+v16hi __builtin_ia32_pmulhrsw256 (v16hi, v16hi)
+v16hi __builtin_ia32_pmulhuw256 (v16hi,v16hi)
+v16hi __builtin_ia32_pmulhw256 (v16hi,v16hi)
+v16hi __builtin_ia32_pmullw256 (v16hi,v16hi)
+v8si __builtin_ia32_pmulld256 (v8si,v8si)
+v4di __builtin_ia32_pmuludq256 (v8si,v8si)
+v4di __builtin_ia32_por256 (v4di,v4di)
+v16hi __builtin_ia32_psadbw256 (v32qi,v32qi)
+v32qi __builtin_ia32_pshufb256 (v32qi,v32qi)
+v8si __builtin_ia32_pshufd256 (v8si,int)
+v16hi __builtin_ia32_pshufhw256 (v16hi,int)
+v16hi __builtin_ia32_pshuflw256 (v16hi,int)
+v32qi __builtin_ia32_psignb256 (v32qi,v32qi)
+v16hi __builtin_ia32_psignw256 (v16hi,v16hi)
+v8si __builtin_ia32_psignd256 (v8si,v8si)
+v4di __builtin_ia32_pslldqi256 (v4di,int)
+v16hi __builtin_ia32_psllwi256 (v16hi,int)
+v16hi __builtin_ia32_psllw256 (v16hi,v8hi)
+v8si __builtin_ia32_pslldi256 (v8si,int)
+v8si __builtin_ia32_pslld256 (v8si,v4si)
+v4di __builtin_ia32_psllqi256 (v4di,int)
+v4di __builtin_ia32_psllq256 (v4di,v2di)
+v16hi __builtin_ia32_psrawi256 (v16hi,int)
+v16hi __builtin_ia32_psraw256 (v16hi,v8hi)
+v8si __builtin_ia32_psradi256 (v8si,int)
+v8si __builtin_ia32_psrad256 (v8si,v4si)
+v4di __builtin_ia32_psrldqi256 (v4di,int)
+v16hi __builtin_ia32_psrlwi256 (v16hi,int)
+v16hi __builtin_ia32_psrlw256 (v16hi,v8hi)
+v8si __builtin_ia32_psrldi256 (v8si,int)
+v8si __builtin_ia32_psrld256 (v8si,v4si)
+v4di __builtin_ia32_psrlqi256 (v4di,int)
+v4di __builtin_ia32_psrlq256 (v4di,v2di)
+v32qi __builtin_ia32_psubb256 (v32qi,v32qi)
+v16hi __builtin_ia32_psubw256 (v16hi,v16hi)
+v8si __builtin_ia32_psubd256 (v8si,v8si)
+v4di __builtin_ia32_psubq256 (v4di,v4di)
+v32qi __builtin_ia32_psubsb256 (v32qi,v32qi)
+v16hi __builtin_ia32_psubsw256 (v16hi,v16hi)
+v32qi __builtin_ia32_psubusb256 (v32qi,v32qi)
+v16hi __builtin_ia32_psubusw256 (v16hi,v16hi)
+v32qi __builtin_ia32_punpckhbw256 (v32qi,v32qi)
+v16hi __builtin_ia32_punpckhwd256 (v16hi,v16hi)
+v8si __builtin_ia32_punpckhdq256 (v8si,v8si)
+v4di __builtin_ia32_punpckhqdq256 (v4di,v4di)
+v32qi __builtin_ia32_punpcklbw256 (v32qi,v32qi)
+v16hi __builtin_ia32_punpcklwd256 (v16hi,v16hi)
+v8si __builtin_ia32_punpckldq256 (v8si,v8si)
+v4di __builtin_ia32_punpcklqdq256 (v4di,v4di)
+v4di __builtin_ia32_pxor256 (v4di,v4di)
+v4di __builtin_ia32_movntdqa256 (pv4di)
+v4sf __builtin_ia32_vbroadcastss_ps (v4sf)
+v8sf __builtin_ia32_vbroadcastss_ps256 (v4sf)
+v4df __builtin_ia32_vbroadcastsd_pd256 (v2df)
+v4di __builtin_ia32_vbroadcastsi256 (v2di)
+v4si __builtin_ia32_pblendd128 (v4si,v4si)
+v8si __builtin_ia32_pblendd256 (v8si,v8si)
+v32qi __builtin_ia32_pbroadcastb256 (v16qi)
+v16hi __builtin_ia32_pbroadcastw256 (v8hi)
+v8si __builtin_ia32_pbroadcastd256 (v4si)
+v4di __builtin_ia32_pbroadcastq256 (v2di)
+v16qi __builtin_ia32_pbroadcastb128 (v16qi)
+v8hi __builtin_ia32_pbroadcastw128 (v8hi)
+v4si __builtin_ia32_pbroadcastd128 (v4si)
+v2di __builtin_ia32_pbroadcastq128 (v2di)
+v8si __builtin_ia32_permvarsi256 (v8si,v8si)
+v4df __builtin_ia32_permdf256 (v4df,int)
+v8sf __builtin_ia32_permvarsf256 (v8sf,v8sf)
+v4di __builtin_ia32_permdi256 (v4di,int)
+v4di __builtin_ia32_permti256 (v4di,v4di,int)
+v4di __builtin_ia32_extract128i256 (v4di,int)
+v4di __builtin_ia32_insert128i256 (v4di,v2di,int)
+v8si __builtin_ia32_maskloadd256 (pcv8si,v8si)
+v4di __builtin_ia32_maskloadq256 (pcv4di,v4di)
+v4si __builtin_ia32_maskloadd (pcv4si,v4si)
+v2di __builtin_ia32_maskloadq (pcv2di,v2di)
+void __builtin_ia32_maskstored256 (pv8si,v8si,v8si)
+void __builtin_ia32_maskstoreq256 (pv4di,v4di,v4di)
+void __builtin_ia32_maskstored (pv4si,v4si,v4si)
+void __builtin_ia32_maskstoreq (pv2di,v2di,v2di)
+v8si __builtin_ia32_psllv8si (v8si,v8si)
+v4si __builtin_ia32_psllv4si (v4si,v4si)
+v4di __builtin_ia32_psllv4di (v4di,v4di)
+v2di __builtin_ia32_psllv2di (v2di,v2di)
+v8si __builtin_ia32_psrav8si (v8si,v8si)
+v4si __builtin_ia32_psrav4si (v4si,v4si)
+v8si __builtin_ia32_psrlv8si (v8si,v8si)
+v4si __builtin_ia32_psrlv4si (v4si,v4si)
+v4di __builtin_ia32_psrlv4di (v4di,v4di)
+v2di __builtin_ia32_psrlv2di (v2di,v2di)
+v2df __builtin_ia32_gathersiv2df (v2df, pcdouble,v4si,v2df,int)
+v4df __builtin_ia32_gathersiv4df (v4df, pcdouble,v4si,v4df,int)
+v2df __builtin_ia32_gatherdiv2df (v2df, pcdouble,v2di,v2df,int)
+v4df __builtin_ia32_gatherdiv4df (v4df, pcdouble,v4di,v4df,int)
+v4sf __builtin_ia32_gathersiv4sf (v4sf, pcfloat,v4si,v4sf,int)
+v8sf __builtin_ia32_gathersiv8sf (v8sf, pcfloat,v8si,v8sf,int)
+v4sf __builtin_ia32_gatherdiv4sf (v4sf, pcfloat,v2di,v4sf,int)
+v4sf __builtin_ia32_gatherdiv4sf256 (v4sf, pcfloat,v4di,v4sf,int)
+v2di __builtin_ia32_gathersiv2di (v2di, pcint64,v4si,v2di,int)
+v4di __builtin_ia32_gathersiv4di (v4di, pcint64,v4si,v4di,int)
+v2di __builtin_ia32_gatherdiv2di (v2di, pcint64,v2di,v2di,int)
+v4di __builtin_ia32_gatherdiv4di (v4di, pcint64,v4di,v4di,int)
+v4si __builtin_ia32_gathersiv4si (v4si, pcint,v4si,v4si,int)
+v8si __builtin_ia32_gathersiv8si (v8si, pcint,v8si,v8si,int)
+v4si __builtin_ia32_gatherdiv4si (v4si, pcint,v2di,v4si,int)
+v4si __builtin_ia32_gatherdiv4si256 (v4si, pcint,v4di,v4si,int)
+@end smallexample
+
 The following built-in functions are available when @option{-maes} is
 used.  All of them generate the machine instruction that is part of the
 name.