X-Git-Url: http://git.sourceforge.jp/view?p=pf3gnuchains%2Fgcc-fork.git;a=blobdiff_plain;f=gcc%2Fconfig%2Fi386%2Fsse.md;h=1b4684dd74f31e98570322d80d7a6437e6fe4d86;hp=016eae2d371925a76b26405478b23aba2ce31e63;hb=33bca1aae0bb7e08032c41ebfe158fe261e896e0;hpb=c241ed0ebabf9d77121162a0046f7616786a5bb3 diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 016eae2d371..1b4684dd74f 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1,5 +1,5 @@ ;; GCC machine description for SSE instructions -;; Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 +;; Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 ;; Free Software Foundation, Inc. ;; ;; This file is part of GCC. @@ -18,6 +18,85 @@ ;; along with GCC; see the file COPYING3. If not see ;; . +(define_c_enum "unspec" [ + ;; SSE + UNSPEC_MOVNT + UNSPEC_MOVU + + ;; SSE3 + UNSPEC_LDDQU + + ;; SSSE3 + UNSPEC_PSHUFB + UNSPEC_PSIGN + UNSPEC_PALIGNR + + ;; For SSE4A support + UNSPEC_EXTRQI + UNSPEC_EXTRQ + UNSPEC_INSERTQI + UNSPEC_INSERTQ + + ;; For SSE4.1 support + UNSPEC_BLENDV + UNSPEC_INSERTPS + UNSPEC_DP + UNSPEC_MOVNTDQA + UNSPEC_MPSADBW + UNSPEC_PHMINPOSUW + UNSPEC_PTEST + + ;; For SSE4.2 support + UNSPEC_PCMPESTR + UNSPEC_PCMPISTR + + ;; For FMA4 support + UNSPEC_FMADDSUB + UNSPEC_XOP_UNSIGNED_CMP + UNSPEC_XOP_TRUEFALSE + UNSPEC_XOP_PERMUTE + UNSPEC_FRCZ + + ;; For AES support + UNSPEC_AESENC + UNSPEC_AESENCLAST + UNSPEC_AESDEC + UNSPEC_AESDECLAST + UNSPEC_AESIMC + UNSPEC_AESKEYGENASSIST + + ;; For PCLMUL support + UNSPEC_PCLMUL + + ;; For AVX support + UNSPEC_PCMP + UNSPEC_VPERMIL + UNSPEC_VPERMIL2 + UNSPEC_VPERMIL2F128 + UNSPEC_CAST + UNSPEC_VTESTP + UNSPEC_VCVTPH2PS + UNSPEC_VCVTPS2PH + + ;; For AVX2 support + UNSPEC_VPERMSI + UNSPEC_VPERMDF + UNSPEC_VPERMSF + UNSPEC_VPERMTI + UNSPEC_GATHER + UNSPEC_VSIBADDR +]) + +(define_c_enum "unspecv" [ + UNSPECV_LDMXCSR + UNSPECV_STMXCSR + UNSPECV_CLFLUSH + UNSPECV_MONITOR + UNSPECV_MWAIT + UNSPECV_VZEROALL + UNSPECV_VZEROUPPER +]) + ;; All vector modes including V?TImode, used in move patterns. (define_mode_iterator V16 [(V32QI "TARGET_AVX") V16QI @@ -126,10 +205,8 @@ (V4DI "TARGET_AVX2") V2DI]) (define_mode_iterator VI48_AVX2 - [V8SI V4SI V4DI V2DI]) - -(define_mode_iterator VI4SD_AVX2 - [V4SI V4DI]) + [(V8SI "TARGET_AVX2") V4SI + (V4DI "TARGET_AVX2") V2DI]) (define_mode_iterator V48_AVX2 [V4SF V2DF @@ -169,15 +246,6 @@ (V4SI "vec") (V8SI "avx2") (V2DI "vec") (V4DI "avx2")]) -;; Mapping of logic-shift operators -(define_code_iterator lshift [lshiftrt ashift]) - -;; Base name for define_insn -(define_code_attr lshift_insn [(lshiftrt "srl") (ashift "sll")]) - -;; Base name for insn mnemonic -(define_code_attr lshift [(lshiftrt "lshr") (ashift "lshl")]) - (define_mode_attr ssedoublemode [(V16HI "V16SI") (V8HI "V8SI")]) @@ -194,12 +262,14 @@ (define_mode_iterator VI12_128 [V16QI V8HI]) (define_mode_iterator VI14_128 [V16QI V4SI]) (define_mode_iterator VI124_128 [V16QI V8HI V4SI]) +(define_mode_iterator VI128_128 [V16QI V8HI V2DI]) (define_mode_iterator VI24_128 [V8HI V4SI]) (define_mode_iterator VI248_128 [V8HI V4SI V2DI]) +(define_mode_iterator VI48_128 [V4SI V2DI]) ;; Random 256bit vector integer mode combinations (define_mode_iterator VI124_256 [V32QI V16HI V8SI]) -(define_mode_iterator VI248_256 [V16HI V8SI V4DI]) +(define_mode_iterator VI48_256 [V8SI V4DI]) ;; Int-float size matches (define_mode_iterator VI4F_128 [V4SI V4SF]) @@ -242,12 +312,18 @@ (define_mode_attr sseintvecmode [(V8SF "V8SI") (V4DF "V4DI") (V4SF "V4SI") (V2DF "V2DI") - (V4DF "V4DI") (V8SF "V8SI") (V8SI "V8SI") (V4DI "V4DI") (V4SI "V4SI") (V2DI "V2DI") (V16HI "V16HI") (V8HI "V8HI") - (V32QI "V32QI") (V16QI "V16QI") - ]) + (V32QI "V32QI") (V16QI "V16QI")]) + +(define_mode_attr sseintvecmodelower + [(V8SF "v8si") (V4DF "v4di") + (V4SF "v4si") (V2DF "v2di") + (V8SI "v8si") (V4DI "v4di") + (V4SI "v4si") (V2DI "v2di") + (V16HI "v16hi") (V8HI "v8hi") + (V32QI "v32qi") (V16QI "v16qi")]) ;; Mapping of vector modes to a vector mode of double size (define_mode_attr ssedoublevecmode @@ -277,6 +353,13 @@ (V8SF "8") (V4DF "4") (V4SF "4") (V2DF "2")]) +;; SSE prefix for integer vector modes +(define_mode_attr sseintprefix + [(V2DI "p") (V2DF "") + (V4DI "p") (V4DF "") + (V4SI "p") (V4SF "") + (V8SI "p") (V8SF "")]) + ;; SSE scalar suffix for vector modes (define_mode_attr ssescalarmodesuffix [(SF "ss") (DF "sd") @@ -312,24 +395,6 @@ ;; Mix-n-match (define_mode_iterator AVX256MODE2P [V8SI V8SF V4DF]) -(define_mode_iterator AVXMODE48P_DI - [V2DI V2DF V4DI V4DF V4SF V4SI]) -(define_mode_attr AVXMODE48P_DI - [(V2DI "V2DI") (V2DF "V2DI") - (V4DI "V4DI") (V4DF "V4DI") - (V4SI "V2DI") (V4SF "V2DI") - (V8SI "V4DI") (V8SF "V4DI")]) -(define_mode_attr gthrfirstp - [(V2DI "p") (V2DF "") - (V4DI "p") (V4DF "") - (V4SI "p") (V4SF "") - (V8SI "p") (V8SF "")]) -(define_mode_attr gthrlastp - [(V2DI "q") (V2DF "pd") - (V4DI "q") (V4DF "pd") - (V4SI "d") (V4SF "ps") - (V8SI "d") (V8SF "ps")]) - (define_mode_iterator FMAMODE [SF DF V4SF V2DF V8SF V4DF]) ;; Mapping of immediate bits for blend instructions @@ -587,15 +652,15 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "")]) -(define_insn "sse2_movntsi" - [(set (match_operand:SI 0 "memory_operand" "=m") - (unspec:SI [(match_operand:SI 1 "register_operand" "r")] - UNSPEC_MOVNT))] +(define_insn "sse2_movnti" + [(set (match_operand:SWI48 0 "memory_operand" "=m") + (unspec:SWI48 [(match_operand:SWI48 1 "register_operand" "r")] + UNSPEC_MOVNT))] "TARGET_SSE2" "movnti\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_data16" "0") - (set_attr "mode" "V2DF")]) + (set_attr "mode" "")]) (define_insn "_movnt" [(set (match_operand:VF 0 "memory_operand" "=m") @@ -628,8 +693,9 @@ ;; Modes handled by storent patterns. (define_mode_iterator STORENT_MODE - [(SI "TARGET_SSE2") (SF "TARGET_SSE4A") (DF "TARGET_SSE4A") - (V2DI "TARGET_SSE2") + [(DI "TARGET_SSE2 && TARGET_64BIT") (SI "TARGET_SSE2") + (SF "TARGET_SSE4A") (DF "TARGET_SSE4A") + (V4DI "TARGET_AVX") (V2DI "TARGET_SSE2") (V8SF "TARGET_AVX") V4SF (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) @@ -1110,14 +1176,14 @@ (parallel [(const_int 0)])) (vec_select:DF (match_dup 1) (parallel [(const_int 1)]))) (plusminus:DF - (vec_select:DF (match_dup 1) (parallel [(const_int 2)])) - (vec_select:DF (match_dup 1) (parallel [(const_int 3)])))) - (vec_concat:V2DF - (plusminus:DF (vec_select:DF (match_operand:V4DF 2 "nonimmediate_operand" "xm") (parallel [(const_int 0)])) - (vec_select:DF (match_dup 2) (parallel [(const_int 1)]))) + (vec_select:DF (match_dup 2) (parallel [(const_int 1)])))) + (vec_concat:V2DF + (plusminus:DF + (vec_select:DF (match_dup 1) (parallel [(const_int 2)])) + (vec_select:DF (match_dup 1) (parallel [(const_int 3)]))) (plusminus:DF (vec_select:DF (match_dup 2) (parallel [(const_int 2)])) (vec_select:DF (match_dup 2) (parallel [(const_int 3)]))))))] @@ -2237,48 +2303,23 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "DI")]) -(define_insn "avx_cvtdq2ps256" - [(set (match_operand:V8SF 0 "register_operand" "=x") - (float:V8SF (match_operand:V8SI 1 "nonimmediate_operand" "xm")))] - "TARGET_AVX" - "vcvtdq2ps\t{%1, %0|%0, %1}" - [(set_attr "type" "ssecvt") - (set_attr "prefix" "vex") - (set_attr "mode" "V8SF")]) - -(define_insn "sse2_cvtdq2ps" - [(set (match_operand:V4SF 0 "register_operand" "=x") - (float:V4SF (match_operand:V4SI 1 "nonimmediate_operand" "xm")))] +(define_insn "float2" + [(set (match_operand:VF1 0 "register_operand" "=x") + (float:VF1 + (match_operand: 1 "nonimmediate_operand" "xm")))] "TARGET_SSE2" "%vcvtdq2ps\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") (set_attr "prefix" "maybe_vex") - (set_attr "mode" "V4SF")]) + (set_attr "mode" "")]) -(define_expand "sse2_cvtudq2ps" - [(set (match_dup 5) - (float:V4SF (match_operand:V4SI 1 "nonimmediate_operand" ""))) - (set (match_dup 6) - (lt:V4SF (match_dup 5) (match_dup 3))) - (set (match_dup 7) - (and:V4SF (match_dup 6) (match_dup 4))) - (set (match_operand:V4SF 0 "register_operand" "") - (plus:V4SF (match_dup 5) (match_dup 7)))] - "TARGET_SSE2" +(define_expand "floatuns2" + [(match_operand:VF1 0 "register_operand" "") + (match_operand: 1 "register_operand" "")] + "TARGET_SSE2 && (mode == V4SFmode || TARGET_AVX2)" { - REAL_VALUE_TYPE TWO32r; - rtx x; - int i; - - real_ldexp (&TWO32r, &dconst1, 32); - x = const_double_from_real_value (TWO32r, SFmode); - - operands[3] = force_reg (V4SFmode, CONST0_RTX (V4SFmode)); - operands[4] = force_reg (V4SFmode, - ix86_build_const_vector (V4SFmode, 1, x)); - - for (i = 5; i < 8; i++) - operands[i] = gen_reg_rtx (V4SFmode); + ix86_expand_vector_convert_uns_vsivsf (operands[0], operands[1]); + DONE; }) (define_insn "avx_cvtps2dq256" @@ -2306,7 +2347,7 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) -(define_insn "avx_cvttps2dq256" +(define_insn "fix_truncv8sfv8si2" [(set (match_operand:V8SI 0 "register_operand" "=x") (fix:V8SI (match_operand:V8SF 1 "nonimmediate_operand" "xm")))] "TARGET_AVX" @@ -2315,7 +2356,7 @@ (set_attr "prefix" "vex") (set_attr "mode" "OI")]) -(define_insn "sse2_cvttps2dq" +(define_insn "fix_truncv4sfv4si2" [(set (match_operand:V4SI 0 "register_operand" "=x") (fix:V4SI (match_operand:V4SF 1 "nonimmediate_operand" "xm")))] "TARGET_SSE2" @@ -2335,6 +2376,19 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) +(define_expand "fixuns_trunc2" + [(match_operand: 0 "register_operand" "") + (match_operand:VF1 1 "register_operand" "")] + "TARGET_SSE2" +{ + rtx tmp[3]; + tmp[0] = ix86_expand_adjust_ufix_to_sfix_si (operands[1], &tmp[2]); + tmp[1] = gen_reg_rtx (mode); + emit_insn (gen_fix_trunc2 (tmp[1], tmp[0])); + emit_insn (gen_xor3 (operands[0], tmp[1], tmp[2])); + DONE; +}) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel double-precision floating point conversion operations @@ -2508,7 +2562,7 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "DI")]) -(define_insn "avx_cvtdq2pd256" +(define_insn "floatv4siv4df2" [(set (match_operand:V4DF 0 "register_operand" "=x") (float:V4DF (match_operand:V4SI 1 "nonimmediate_operand" "xm")))] "TARGET_AVX" @@ -2537,7 +2591,7 @@ (match_operand:V4SI 1 "nonimmediate_operand" "xm") (parallel [(const_int 0) (const_int 1)]))))] "TARGET_SSE2" - "%vcvtdq2pd\t{%1, %0|%0, %1}" + "%vcvtdq2pd\t{%1, %0|%0, %q1}" [(set_attr "type" "ssecvt") (set_attr "prefix" "maybe_vex") (set_attr "mode" "V2DF")]) @@ -2552,6 +2606,27 @@ (set_attr "prefix" "vex") (set_attr "mode" "OI")]) +(define_expand "avx_cvtpd2dq256_2" + [(set (match_operand:V8SI 0 "register_operand" "") + (vec_concat:V8SI + (unspec:V4SI [(match_operand:V4DF 1 "nonimmediate_operand" "")] + UNSPEC_FIX_NOTRUNC) + (match_dup 2)))] + "TARGET_AVX" + "operands[2] = CONST0_RTX (V4SImode);") + +(define_insn "*avx_cvtpd2dq256_2" + [(set (match_operand:V8SI 0 "register_operand" "=x") + (vec_concat:V8SI + (unspec:V4SI [(match_operand:V4DF 1 "nonimmediate_operand" "xm")] + UNSPEC_FIX_NOTRUNC) + (match_operand:V4SI 2 "const0_operand" "")))] + "TARGET_AVX" + "vcvtpd2dq{y}\t{%1, %x0|%x0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_expand "sse2_cvtpd2dq" [(set (match_operand:V4SI 0 "register_operand" "") (vec_concat:V4SI @@ -2583,7 +2658,7 @@ (set_attr "athlon_decode" "vector") (set_attr "bdver1_decode" "double")]) -(define_insn "avx_cvttpd2dq256" +(define_insn "fix_truncv4dfv4si2" [(set (match_operand:V4SI 0 "register_operand" "=x") (fix:V4SI (match_operand:V4DF 1 "nonimmediate_operand" "xm")))] "TARGET_AVX" @@ -2592,6 +2667,25 @@ (set_attr "prefix" "vex") (set_attr "mode" "OI")]) +(define_expand "avx_cvttpd2dq256_2" + [(set (match_operand:V8SI 0 "register_operand" "") + (vec_concat:V8SI + (fix:V4SI (match_operand:V4DF 1 "nonimmediate_operand" "")) + (match_dup 2)))] + "TARGET_AVX" + "operands[2] = CONST0_RTX (V4SImode);") + +(define_insn "*avx_cvttpd2dq256_2" + [(set (match_operand:V8SI 0 "register_operand" "=x") + (vec_concat:V8SI + (fix:V4SI (match_operand:V4DF 1 "nonimmediate_operand" "xm")) + (match_operand:V4SI 2 "const0_operand" "")))] + "TARGET_AVX" + "vcvttpd2dq{y}\t{%1, %x0|%x0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_expand "sse2_cvttpd2dq" [(set (match_operand:V4SI 0 "register_operand" "") (vec_concat:V4SI @@ -2732,7 +2826,7 @@ (match_operand:V4SF 1 "nonimmediate_operand" "xm") (parallel [(const_int 0) (const_int 1)]))))] "TARGET_SSE2" - "%vcvtps2pd\t{%1, %0|%0, %1}" + "%vcvtps2pd\t{%1, %0|%0, %q1}" [(set_attr "type" "ssecvt") (set_attr "amdfam10_decode" "direct") (set_attr "athlon_decode" "double") @@ -2968,7 +3062,7 @@ for (i = 2; i < 5; i++) tmp[i] = gen_reg_rtx (V4DFmode); emit_insn (gen_vec_extract_hi_v8si (tmp[5], operands[1])); - emit_insn (gen_avx_cvtdq2pd256 (tmp[2], tmp[5])); + emit_insn (gen_floatv4siv4df2 (tmp[2], tmp[5])); emit_insn (gen_rtx_SET (VOIDmode, tmp[3], gen_rtx_LT (V4DFmode, tmp[2], tmp[0]))); emit_insn (gen_andv4df3 (tmp[4], tmp[3], tmp[1])); @@ -3024,14 +3118,42 @@ (match_operand:V2DF 2 "nonimmediate_operand" "")] "TARGET_SSE2" { + rtx tmp0, tmp1; + + if (TARGET_AVX && !TARGET_PREFER_AVX128) + { + tmp0 = gen_reg_rtx (V4DFmode); + tmp1 = force_reg (V2DFmode, operands[1]); + + emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2])); + emit_insn (gen_avx_cvtpd2ps256 (operands[0], tmp0)); + } + else + { + tmp0 = gen_reg_rtx (V4SFmode); + tmp1 = gen_reg_rtx (V4SFmode); + + emit_insn (gen_sse2_cvtpd2ps (tmp0, operands[1])); + emit_insn (gen_sse2_cvtpd2ps (tmp1, operands[2])); + emit_insn (gen_sse_movlhps (operands[0], tmp0, tmp1)); + } + DONE; +}) + +(define_expand "vec_pack_sfix_trunc_v4df" + [(match_operand:V8SI 0 "register_operand" "") + (match_operand:V4DF 1 "nonimmediate_operand" "") + (match_operand:V4DF 2 "nonimmediate_operand" "")] + "TARGET_AVX" +{ rtx r1, r2; - r1 = gen_reg_rtx (V4SFmode); - r2 = gen_reg_rtx (V4SFmode); + r1 = gen_reg_rtx (V4SImode); + r2 = gen_reg_rtx (V4SImode); - emit_insn (gen_sse2_cvtpd2ps (r1, operands[1])); - emit_insn (gen_sse2_cvtpd2ps (r2, operands[2])); - emit_insn (gen_sse_movlhps (operands[0], r1, r2)); + emit_insn (gen_fix_truncv4dfv4si2 (r1, operands[1])); + emit_insn (gen_fix_truncv4dfv4si2 (r2, operands[2])); + emit_insn (gen_avx_vec_concatv8si (operands[0], r1, r2)); DONE; }) @@ -3041,16 +3163,78 @@ (match_operand:V2DF 2 "nonimmediate_operand" "")] "TARGET_SSE2" { + rtx tmp0, tmp1; + + if (TARGET_AVX && !TARGET_PREFER_AVX128) + { + tmp0 = gen_reg_rtx (V4DFmode); + tmp1 = force_reg (V2DFmode, operands[1]); + + emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2])); + emit_insn (gen_fix_truncv4dfv4si2 (operands[0], tmp0)); + } + else + { + tmp0 = gen_reg_rtx (V4SImode); + tmp1 = gen_reg_rtx (V4SImode); + + emit_insn (gen_sse2_cvttpd2dq (tmp0, operands[1])); + emit_insn (gen_sse2_cvttpd2dq (tmp1, operands[2])); + emit_insn + (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]), + gen_lowpart (V2DImode, tmp0), + gen_lowpart (V2DImode, tmp1))); + } + DONE; +}) + +(define_mode_attr ssepackfltmode + [(V4DF "V8SI") (V2DF "V4SI")]) + +(define_expand "vec_pack_ufix_trunc_" + [(match_operand: 0 "register_operand" "") + (match_operand:VF2 1 "register_operand" "") + (match_operand:VF2 2 "register_operand" "")] + "TARGET_SSE2" +{ + rtx tmp[7]; + tmp[0] = ix86_expand_adjust_ufix_to_sfix_si (operands[1], &tmp[2]); + tmp[1] = ix86_expand_adjust_ufix_to_sfix_si (operands[2], &tmp[3]); + tmp[4] = gen_reg_rtx (mode); + emit_insn (gen_vec_pack_sfix_trunc_ (tmp[4], tmp[0], tmp[1])); + if (mode == V4SImode || TARGET_AVX2) + { + tmp[5] = gen_reg_rtx (mode); + ix86_expand_vec_extract_even_odd (tmp[5], tmp[2], tmp[3], 0); + } + else + { + tmp[5] = gen_reg_rtx (V8SFmode); + ix86_expand_vec_extract_even_odd (tmp[5], gen_lowpart (V8SFmode, tmp[2]), + gen_lowpart (V8SFmode, tmp[3]), 0); + tmp[5] = gen_lowpart (V8SImode, tmp[5]); + } + tmp[6] = expand_simple_binop (mode, XOR, tmp[4], tmp[5], + operands[0], 0, OPTAB_DIRECT); + if (tmp[6] != operands[0]) + emit_move_insn (operands[0], tmp[6]); + DONE; +}) + +(define_expand "vec_pack_sfix_v4df" + [(match_operand:V8SI 0 "register_operand" "") + (match_operand:V4DF 1 "nonimmediate_operand" "") + (match_operand:V4DF 2 "nonimmediate_operand" "")] + "TARGET_AVX" +{ rtx r1, r2; r1 = gen_reg_rtx (V4SImode); r2 = gen_reg_rtx (V4SImode); - emit_insn (gen_sse2_cvttpd2dq (r1, operands[1])); - emit_insn (gen_sse2_cvttpd2dq (r2, operands[2])); - emit_insn (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]), - gen_lowpart (V2DImode, r1), - gen_lowpart (V2DImode, r2))); + emit_insn (gen_avx_cvtpd2dq256 (r1, operands[1])); + emit_insn (gen_avx_cvtpd2dq256 (r2, operands[2])); + emit_insn (gen_avx_vec_concatv8si (operands[0], r1, r2)); DONE; }) @@ -3060,16 +3244,28 @@ (match_operand:V2DF 2 "nonimmediate_operand" "")] "TARGET_SSE2" { - rtx r1, r2; + rtx tmp0, tmp1; - r1 = gen_reg_rtx (V4SImode); - r2 = gen_reg_rtx (V4SImode); + if (TARGET_AVX && !TARGET_PREFER_AVX128) + { + tmp0 = gen_reg_rtx (V4DFmode); + tmp1 = force_reg (V2DFmode, operands[1]); - emit_insn (gen_sse2_cvtpd2dq (r1, operands[1])); - emit_insn (gen_sse2_cvtpd2dq (r2, operands[2])); - emit_insn (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]), - gen_lowpart (V2DImode, r1), - gen_lowpart (V2DImode, r2))); + emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2])); + emit_insn (gen_avx_cvtpd2dq256 (operands[0], tmp0)); + } + else + { + tmp0 = gen_reg_rtx (V4SImode); + tmp1 = gen_reg_rtx (V4SImode); + + emit_insn (gen_sse2_cvtpd2dq (tmp0, operands[1])); + emit_insn (gen_sse2_cvtpd2dq (tmp1, operands[2])); + emit_insn + (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]), + gen_lowpart (V2DImode, tmp0), + gen_lowpart (V2DImode, tmp1))); + } DONE; }) @@ -3601,19 +3797,9 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "SF")]) -(define_expand "vec_dupv4sf" - [(set (match_operand:V4SF 0 "register_operand" "") - (vec_duplicate:V4SF - (match_operand:SF 1 "nonimmediate_operand" "")))] - "TARGET_SSE" -{ - if (!TARGET_AVX) - operands[1] = force_reg (SFmode, operands[1]); -}) - -(define_insn "avx2_vec_dupv4sf" - [(set (match_operand:V4SF 0 "register_operand" "=x") - (vec_duplicate:V4SF +(define_insn "avx2_vec_dup" + [(set (match_operand:VF1 0 "register_operand" "=x") + (vec_duplicate:VF1 (vec_select:SF (match_operand:V4SF 1 "register_operand" "x") (parallel [(const_int 0)]))))] @@ -3621,42 +3807,22 @@ "vbroadcastss\t{%1, %0|%0, %1}" [(set_attr "type" "sselog1") (set_attr "prefix" "vex") - (set_attr "mode" "V4SF")]) + (set_attr "mode" "")]) -(define_insn "*vec_dupv4sf_avx" - [(set (match_operand:V4SF 0 "register_operand" "=x,x") +(define_insn "vec_dupv4sf" + [(set (match_operand:V4SF 0 "register_operand" "=x,x,x") (vec_duplicate:V4SF - (match_operand:SF 1 "nonimmediate_operand" "x,m")))] - "TARGET_AVX" + (match_operand:SF 1 "nonimmediate_operand" "x,m,0")))] + "TARGET_SSE" "@ vshufps\t{$0, %1, %1, %0|%0, %1, %1, 0} - vbroadcastss\t{%1, %0|%0, %1}" - [(set_attr "type" "sselog1,ssemov") - (set_attr "length_immediate" "1,0") - (set_attr "prefix_extra" "0,1") - (set_attr "prefix" "vex") - (set_attr "mode" "V4SF")]) - -(define_insn "avx2_vec_dupv8sf" - [(set (match_operand:V8SF 0 "register_operand" "=x") - (vec_duplicate:V8SF - (vec_select:SF - (match_operand:V4SF 1 "register_operand" "x") - (parallel [(const_int 0)]))))] - "TARGET_AVX2" - "vbroadcastss\t{%1, %0|%0, %1}" - [(set_attr "type" "sselog1") - (set_attr "prefix" "vex") - (set_attr "mode" "V8SF")]) - -(define_insn "*vec_dupv4sf" - [(set (match_operand:V4SF 0 "register_operand" "=x") - (vec_duplicate:V4SF - (match_operand:SF 1 "register_operand" "0")))] - "TARGET_SSE" - "shufps\t{$0, %0, %0|%0, %0, 0}" - [(set_attr "type" "sselog1") - (set_attr "length_immediate" "1") + vbroadcastss\t{%1, %0|%0, %1} + shufps\t{$0, %0, %0|%0, %0, 0}" + [(set_attr "isa" "avx,avx,noavx") + (set_attr "type" "sselog1,ssemov,sselog1") + (set_attr "length_immediate" "1,0,1") + (set_attr "prefix_extra" "0,1,*") + (set_attr "prefix" "vex,vex,orig") (set_attr "mode" "V4SF")]) ;; Although insertps takes register source, we prefer @@ -3729,13 +3895,13 @@ ;; see comment above inline_secondary_memory_needed function in i386.c (define_insn "vec_set_0" [(set (match_operand:VI4F_128 0 "nonimmediate_operand" - "=x,x,x ,x,x,x,x ,x ,m,m ,m") + "=x,x,x ,x,x,x,x ,x ,m ,m ,m") (vec_merge:VI4F_128 (vec_duplicate:VI4F_128 (match_operand: 2 "general_operand" - " x,m,*r,m,x,x,*rm,*rm,x,fF,*r")) + " x,m,*r,m,x,x,*rm,*rm,!x,!*re,!*fF")) (match_operand:VI4F_128 1 "vector_move_operand" - " C,C,C ,C,0,x,0 ,x ,0,0 ,0") + " C,C,C ,C,0,x,0 ,x ,0 ,0 ,0") (const_int 1)))] "TARGET_SSE" "@ @@ -3755,9 +3921,9 @@ (cond [(eq_attr "alternative" "0,6,7") (const_string "sselog") (eq_attr "alternative" "9") - (const_string "fmov") - (eq_attr "alternative" "10") (const_string "imov") + (eq_attr "alternative" "10") + (const_string "fmov") ] (const_string "ssemov"))) (set_attr "prefix_extra" "*,*,*,*,*,*,1,1,*,*,*") @@ -3874,6 +4040,62 @@ DONE; }) +(define_insn_and_split "*sse4_1_extractps" + [(set (match_operand:SF 0 "nonimmediate_operand" "=rm,x,x") + (vec_select:SF + (match_operand:V4SF 1 "register_operand" "x,0,x") + (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n,n,n")])))] + "TARGET_SSE4_1" + "@ + %vextractps\t{%2, %1, %0|%0, %1, %2} + # + #" + "&& reload_completed && SSE_REG_P (operands[0])" + [(const_int 0)] +{ + rtx dest = gen_rtx_REG (V4SFmode, REGNO (operands[0])); + switch (INTVAL (operands[2])) + { + case 1: + case 3: + emit_insn (gen_sse_shufps_v4sf (dest, operands[1], operands[1], + operands[2], operands[2], + GEN_INT (INTVAL (operands[2]) + 4), + GEN_INT (INTVAL (operands[2]) + 4))); + break; + case 2: + emit_insn (gen_vec_interleave_highv4sf (dest, operands[1], operands[1])); + break; + default: + /* 0 should be handled by the *vec_extractv4sf_0 pattern above. */ + gcc_unreachable (); + } + DONE; +} + [(set_attr "isa" "*,noavx,avx") + (set_attr "type" "sselog,*,*") + (set_attr "prefix_data16" "1,*,*") + (set_attr "prefix_extra" "1,*,*") + (set_attr "length_immediate" "1,*,*") + (set_attr "prefix" "maybe_vex,*,*") + (set_attr "mode" "V4SF,*,*")]) + +(define_insn_and_split "*vec_extract_v4sf_mem" + [(set (match_operand:SF 0 "register_operand" "=x,*r,f") + (vec_select:SF + (match_operand:V4SF 1 "memory_operand" "o,o,o") + (parallel [(match_operand 2 "const_0_to_3_operand" "n,n,n")])))] + "TARGET_SSE" + "#" + "&& reload_completed" + [(const_int 0)] +{ + int i = INTVAL (operands[2]); + + emit_move_insn (operands[0], adjust_address (operands[1], SFmode, i*4)); + DONE; +}) + (define_expand "avx_vextractf128" [(match_operand: 0 "nonimmediate_operand" "") (match_operand:V_256 1 "register_operand" "") @@ -4052,76 +4274,20 @@ (set_attr "prefix" "vex") (set_attr "mode" "OI")]) -(define_insn_and_split "*sse4_1_extractps" - [(set (match_operand:SF 0 "nonimmediate_operand" "=rm,x,x") - (vec_select:SF - (match_operand:V4SF 1 "register_operand" "x,0,x") - (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n,n,n")])))] - "TARGET_SSE4_1" - "@ - %vextractps\t{%2, %1, %0|%0, %1, %2} - # - #" - "&& reload_completed && SSE_REG_P (operands[0])" - [(const_int 0)] -{ - rtx dest = gen_rtx_REG (V4SFmode, REGNO (operands[0])); - switch (INTVAL (operands[2])) - { - case 1: - case 3: - emit_insn (gen_sse_shufps_v4sf (dest, operands[1], operands[1], - operands[2], operands[2], - GEN_INT (INTVAL (operands[2]) + 4), - GEN_INT (INTVAL (operands[2]) + 4))); - break; - case 2: - emit_insn (gen_vec_interleave_highv4sf (dest, operands[1], operands[1])); - break; - default: - /* 0 should be handled by the *vec_extractv4sf_0 pattern above. */ - gcc_unreachable (); - } - DONE; -} - [(set_attr "isa" "*,noavx,avx") - (set_attr "type" "sselog,*,*") - (set_attr "prefix_data16" "1,*,*") - (set_attr "prefix_extra" "1,*,*") - (set_attr "length_immediate" "1,*,*") - (set_attr "prefix" "maybe_vex,*,*") - (set_attr "mode" "V4SF,*,*")]) - -(define_insn_and_split "*vec_extract_v4sf_mem" - [(set (match_operand:SF 0 "register_operand" "=x*rf") - (vec_select:SF - (match_operand:V4SF 1 "memory_operand" "o") - (parallel [(match_operand 2 "const_0_to_3_operand" "n")])))] - "TARGET_SSE" - "#" - "&& reload_completed" - [(const_int 0)] -{ - int i = INTVAL (operands[2]); - - emit_move_insn (operands[0], adjust_address (operands[1], SFmode, i*4)); - DONE; -}) - -;; Modes handled by vec_extract patterns. -(define_mode_iterator VEC_EXTRACT_MODE - [(V32QI "TARGET_AVX") V16QI - (V16HI "TARGET_AVX") V8HI - (V8SI "TARGET_AVX") V4SI - (V4DI "TARGET_AVX") V2DI - (V8SF "TARGET_AVX") V4SF - (V4DF "TARGET_AVX") V2DF]) - -(define_expand "vec_extract" - [(match_operand: 0 "register_operand" "") - (match_operand:VEC_EXTRACT_MODE 1 "register_operand" "") - (match_operand 2 "const_int_operand" "")] - "TARGET_SSE" +;; Modes handled by vec_extract patterns. +(define_mode_iterator VEC_EXTRACT_MODE + [(V32QI "TARGET_AVX") V16QI + (V16HI "TARGET_AVX") V8HI + (V8SI "TARGET_AVX") V4SI + (V4DI "TARGET_AVX") V2DI + (V8SF "TARGET_AVX") V4SF + (V4DF "TARGET_AVX") V2DF]) + +(define_expand "vec_extract" + [(match_operand: 0 "register_operand" "") + (match_operand:VEC_EXTRACT_MODE 1 "register_operand" "") + (match_operand 2 "const_int_operand" "")] + "TARGET_SSE" { ix86_expand_vector_extract (false, operands[0], operands[1], INTVAL (operands[2])); @@ -4239,15 +4405,14 @@ [(set (match_operand:V4DF 0 "register_operand" "=x,x") (vec_select:V4DF (vec_concat:V8DF - (match_operand:V4DF 1 "nonimmediate_operand" "xm,x") - (match_operand:V4DF 2 "nonimmediate_operand" " 1,xm")) + (match_operand:V4DF 1 "nonimmediate_operand" " x,m") + (match_operand:V4DF 2 "nonimmediate_operand" "xm,1")) (parallel [(const_int 0) (const_int 4) (const_int 2) (const_int 6)])))] - "TARGET_AVX - && (!MEM_P (operands[1]) || rtx_equal_p (operands[1], operands[2]))" + "TARGET_AVX" "@ - vmovddup\t{%1, %0|%0, %1} - vunpcklpd\t{%2, %1, %0|%0, %1, %2}" + vunpcklpd\t{%2, %1, %0|%0, %1, %2} + vmovddup\t{%1, %0|%0, %1}" [(set_attr "type" "sselog") (set_attr "prefix" "vex") (set_attr "mode" "V4DF")]) @@ -4403,35 +4568,6 @@ DONE; }) -;; Modes handled by vec_extract_even/odd pattern. -(define_mode_iterator VEC_EXTRACT_EVENODD_MODE - [(V32QI "TARGET_AVX2") (V16QI "TARGET_SSE2") - (V16HI "TARGET_AVX2") (V8HI "TARGET_SSE2") - (V8SI "TARGET_AVX2") (V4SI "TARGET_SSE2") - (V4DI "TARGET_AVX2") (V2DI "TARGET_SSE2") - (V8SF "TARGET_AVX") V4SF - (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) - -(define_expand "vec_extract_even" - [(match_operand:VEC_EXTRACT_EVENODD_MODE 0 "register_operand" "") - (match_operand:VEC_EXTRACT_EVENODD_MODE 1 "register_operand" "") - (match_operand:VEC_EXTRACT_EVENODD_MODE 2 "register_operand" "")] - "TARGET_SSE" -{ - ix86_expand_vec_extract_even_odd (operands[0], operands[1], operands[2], 0); - DONE; -}) - -(define_expand "vec_extract_odd" - [(match_operand:VEC_EXTRACT_EVENODD_MODE 0 "register_operand" "") - (match_operand:VEC_EXTRACT_EVENODD_MODE 1 "register_operand" "") - (match_operand:VEC_EXTRACT_EVENODD_MODE 2 "register_operand" "")] - "TARGET_SSE" -{ - ix86_expand_vec_extract_even_odd (operands[0], operands[1], operands[2], 1); - DONE; -}) - ;; punpcklqdq and punpckhqdq are shorter than shufpd. (define_insn "avx2_interleave_highv4di" [(set (match_operand:V4DI 0 "register_operand" "=x") @@ -4754,7 +4890,7 @@ (vec_select:DF (match_dup 0) (parallel [(const_int 1)]))))] "TARGET_SSE2 && reload_completed" [(set (match_dup 0) (match_dup 1))] - "operands[0] = adjust_address (operands[0], DFmode, 8);") + "operands[0] = adjust_address (operands[0], DFmode, 0);") (define_insn "sse2_movsd" [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,x,m,x,x,x,o") @@ -4789,69 +4925,43 @@ (set_attr "prefix" "orig,vex,orig,vex,maybe_vex,orig,orig,vex,maybe_vex") (set_attr "mode" "DF,DF,V1DF,V1DF,V1DF,V2DF,V1DF,V1DF,V1DF")]) -(define_expand "vec_dupv2df" - [(set (match_operand:V2DF 0 "register_operand" "") +(define_insn "vec_dupv2df" + [(set (match_operand:V2DF 0 "register_operand" "=x,x") (vec_duplicate:V2DF - (match_operand:DF 1 "nonimmediate_operand" "")))] + (match_operand:DF 1 "nonimmediate_operand" " 0,xm")))] "TARGET_SSE2" -{ - if (!TARGET_SSE3) - operands[1] = force_reg (DFmode, operands[1]); -}) - -(define_insn "*vec_dupv2df_sse3" - [(set (match_operand:V2DF 0 "register_operand" "=x") - (vec_duplicate:V2DF - (match_operand:DF 1 "nonimmediate_operand" "xm")))] - "TARGET_SSE3" - "%vmovddup\t{%1, %0|%0, %1}" - [(set_attr "type" "sselog1") - (set_attr "prefix" "maybe_vex") - (set_attr "mode" "DF")]) - -(define_insn "*vec_dupv2df" - [(set (match_operand:V2DF 0 "register_operand" "=x") - (vec_duplicate:V2DF - (match_operand:DF 1 "register_operand" "0")))] - "TARGET_SSE2" - "unpcklpd\t%0, %0" - [(set_attr "type" "sselog1") + "@ + unpcklpd\t%0, %0 + %vmovddup\t{%1, %0|%0, %1}" + [(set_attr "isa" "noavx,sse3") + (set_attr "type" "sselog1") + (set_attr "prefix" "orig,maybe_vex") (set_attr "mode" "V2DF")]) -(define_insn "*vec_concatv2df_sse3" - [(set (match_operand:V2DF 0 "register_operand" "=x") - (vec_concat:V2DF - (match_operand:DF 1 "nonimmediate_operand" "xm") - (match_dup 1)))] - "TARGET_SSE3" - "%vmovddup\t{%1, %0|%0, %1}" - [(set_attr "type" "sselog1") - (set_attr "prefix" "maybe_vex") - (set_attr "mode" "DF")]) - (define_insn "*vec_concatv2df" - [(set (match_operand:V2DF 0 "register_operand" "=x,x,x,x,x,x,x") + [(set (match_operand:V2DF 0 "register_operand" "=x,x,x,x,x,x,x,x") (vec_concat:V2DF - (match_operand:DF 1 "nonimmediate_operand" " 0,x,0,x,m,0,0") - (match_operand:DF 2 "vector_move_operand" " x,x,m,m,C,x,m")))] + (match_operand:DF 1 "nonimmediate_operand" " 0,x,m,0,x,m,0,0") + (match_operand:DF 2 "vector_move_operand" " x,x,1,m,m,C,x,m")))] "TARGET_SSE" "@ unpcklpd\t{%2, %0|%0, %2} vunpcklpd\t{%2, %1, %0|%0, %1, %2} + %vmovddup\t{%1, %0|%0, %1} movhpd\t{%2, %0|%0, %2} vmovhpd\t{%2, %1, %0|%0, %1, %2} %vmovsd\t{%1, %0|%0, %1} movlhps\t{%2, %0|%0, %2} movhps\t{%2, %0|%0, %2}" - [(set_attr "isa" "sse2_noavx,avx,sse2_noavx,avx,sse2,noavx,noavx") + [(set_attr "isa" "sse2_noavx,avx,sse3,sse2_noavx,avx,sse2,noavx,noavx") (set (attr "type") (if_then_else - (eq_attr "alternative" "0,1") + (eq_attr "alternative" "0,1,2") (const_string "sselog") (const_string "ssemov"))) - (set_attr "prefix_data16" "*,*,1,*,*,*,*") - (set_attr "prefix" "orig,vex,orig,vex,maybe_vex,orig,orig") - (set_attr "mode" "V2DF,V2DF,V1DF,V1DF,DF,V4SF,V2SF")]) + (set_attr "prefix_data16" "*,*,*,1,*,*,*,*") + (set_attr "prefix" "orig,vex,maybe_vex,orig,vex,maybe_vex,orig,orig") + (set_attr "mode" "V2DF,V2DF,DF,V1DF,V1DF,DF,V4SF,V2SF")]) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; @@ -4956,7 +5066,24 @@ gen_lowpart (mulmode, t[3])))); /* Extract the even bytes and merge them back together. */ - ix86_expand_vec_extract_even_odd (operands[0], t[5], t[4], 0); + if (mode == V16QImode) + ix86_expand_vec_extract_even_odd (operands[0], t[5], t[4], 0); + else + { + /* Since avx2_interleave_{low,high}v32qi used above aren't cross-lane, + this can't be normal even extraction, but one where additionally + the second and third quarter are swapped. That is even one insn + shorter than even extraction. */ + rtvec v = rtvec_alloc (32); + for (i = 0; i < 32; ++i) + RTVEC_ELT (v, i) + = GEN_INT (i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0)); + t[0] = operands[0]; + t[1] = t[5]; + t[2] = t[4]; + t[3] = gen_rtx_CONST_VECTOR (mode, v); + ix86_expand_vec_perm_const (t); + } set_unique_reg_note (get_last_insn (), REG_EQUAL, gen_rtx_MULT (mode, operands[1], operands[2])); @@ -5507,83 +5634,97 @@ DONE; }) -(define_expand "vec_widen_smult_hi_v8hi" - [(match_operand:V4SI 0 "register_operand" "") - (match_operand:V8HI 1 "register_operand" "") - (match_operand:V8HI 2 "register_operand" "")] +(define_expand "vec_widen_mult_hi_" + [(match_operand: 0 "register_operand" "") + (any_extend: + (match_operand:VI2_AVX2 1 "register_operand" "")) + (match_operand:VI2_AVX2 2 "register_operand" "")] "TARGET_SSE2" { rtx op1, op2, t1, t2, dest; op1 = operands[1]; op2 = operands[2]; - t1 = gen_reg_rtx (V8HImode); - t2 = gen_reg_rtx (V8HImode); - dest = gen_lowpart (V8HImode, operands[0]); + t1 = gen_reg_rtx (mode); + t2 = gen_reg_rtx (mode); + dest = gen_lowpart (mode, operands[0]); - emit_insn (gen_mulv8hi3 (t1, op1, op2)); - emit_insn (gen_smulv8hi3_highpart (t2, op1, op2)); - emit_insn (gen_vec_interleave_highv8hi (dest, t1, t2)); + emit_insn (gen_mul3 (t1, op1, op2)); + emit_insn (gen_mul3_highpart (t2, op1, op2)); + emit_insn (gen_vec_interleave_high (dest, t1, t2)); DONE; }) -(define_expand "vec_widen_smult_lo_v8hi" - [(match_operand:V4SI 0 "register_operand" "") - (match_operand:V8HI 1 "register_operand" "") - (match_operand:V8HI 2 "register_operand" "")] +(define_expand "vec_widen_mult_lo_" + [(match_operand: 0 "register_operand" "") + (any_extend: + (match_operand:VI2_AVX2 1 "register_operand" "")) + (match_operand:VI2_AVX2 2 "register_operand" "")] "TARGET_SSE2" { rtx op1, op2, t1, t2, dest; op1 = operands[1]; op2 = operands[2]; - t1 = gen_reg_rtx (V8HImode); - t2 = gen_reg_rtx (V8HImode); - dest = gen_lowpart (V8HImode, operands[0]); + t1 = gen_reg_rtx (mode); + t2 = gen_reg_rtx (mode); + dest = gen_lowpart (mode, operands[0]); - emit_insn (gen_mulv8hi3 (t1, op1, op2)); - emit_insn (gen_smulv8hi3_highpart (t2, op1, op2)); - emit_insn (gen_vec_interleave_lowv8hi (dest, t1, t2)); + emit_insn (gen_mul3 (t1, op1, op2)); + emit_insn (gen_mul3_highpart (t2, op1, op2)); + emit_insn (gen_vec_interleave_low (dest, t1, t2)); DONE; }) -(define_expand "vec_widen_umult_hi_v8hi" - [(match_operand:V4SI 0 "register_operand" "") - (match_operand:V8HI 1 "register_operand" "") - (match_operand:V8HI 2 "register_operand" "")] - "TARGET_SSE2" +(define_expand "vec_widen_mult_hi_v8si" + [(match_operand:V4DI 0 "register_operand" "") + (any_extend:V4DI (match_operand:V8SI 1 "nonimmediate_operand" "")) + (match_operand:V8SI 2 "nonimmediate_operand" "")] + "TARGET_AVX2" { - rtx op1, op2, t1, t2, dest; - - op1 = operands[1]; - op2 = operands[2]; - t1 = gen_reg_rtx (V8HImode); - t2 = gen_reg_rtx (V8HImode); - dest = gen_lowpart (V8HImode, operands[0]); + rtx t1, t2, t3, t4; - emit_insn (gen_mulv8hi3 (t1, op1, op2)); - emit_insn (gen_umulv8hi3_highpart (t2, op1, op2)); - emit_insn (gen_vec_interleave_highv8hi (dest, t1, t2)); + t1 = gen_reg_rtx (V4DImode); + t2 = gen_reg_rtx (V4DImode); + t3 = gen_reg_rtx (V8SImode); + t4 = gen_reg_rtx (V8SImode); + emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, operands[1]), + const0_rtx, const2_rtx, + const1_rtx, GEN_INT (3))); + emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, operands[2]), + const0_rtx, const2_rtx, + const1_rtx, GEN_INT (3))); + emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), + GEN_INT (2 + (2 << 2) + (3 << 4) + (3 << 6)))); + emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), + GEN_INT (2 + (2 << 2) + (3 << 4) + (3 << 6)))); + emit_insn (gen_avx2_mulv4siv4di3 (operands[0], t3, t4)); DONE; }) -(define_expand "vec_widen_umult_lo_v8hi" - [(match_operand:V4SI 0 "register_operand" "") - (match_operand:V8HI 1 "register_operand" "") - (match_operand:V8HI 2 "register_operand" "")] - "TARGET_SSE2" +(define_expand "vec_widen_mult_lo_v8si" + [(match_operand:V4DI 0 "register_operand" "") + (any_extend:V4DI (match_operand:V8SI 1 "nonimmediate_operand" "")) + (match_operand:V8SI 2 "nonimmediate_operand" "")] + "TARGET_AVX2" { - rtx op1, op2, t1, t2, dest; - - op1 = operands[1]; - op2 = operands[2]; - t1 = gen_reg_rtx (V8HImode); - t2 = gen_reg_rtx (V8HImode); - dest = gen_lowpart (V8HImode, operands[0]); + rtx t1, t2, t3, t4; - emit_insn (gen_mulv8hi3 (t1, op1, op2)); - emit_insn (gen_umulv8hi3_highpart (t2, op1, op2)); - emit_insn (gen_vec_interleave_lowv8hi (dest, t1, t2)); + t1 = gen_reg_rtx (V4DImode); + t2 = gen_reg_rtx (V4DImode); + t3 = gen_reg_rtx (V8SImode); + t4 = gen_reg_rtx (V8SImode); + emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, operands[1]), + const0_rtx, const2_rtx, + const1_rtx, GEN_INT (3))); + emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, operands[2]), + const0_rtx, const2_rtx, + const1_rtx, GEN_INT (3))); + emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), + GEN_INT (0 + (0 << 2) + (1 << 4) + (1 << 6)))); + emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), + GEN_INT (0 + (0 << 2) + (1 << 4) + (1 << 6)))); + emit_insn (gen_avx2_mulv4siv4di3 (operands[0], t3, t4)); DONE; }) @@ -5591,24 +5732,28 @@ [(match_operand:V2DI 0 "register_operand" "") (match_operand:V4SI 1 "register_operand" "") (match_operand:V4SI 2 "register_operand" "")] - "TARGET_XOP" + "TARGET_SSE4_1" { - rtx t1, t2; + rtx op1, op2, t1, t2; + op1 = operands[1]; + op2 = operands[2]; t1 = gen_reg_rtx (V4SImode); t2 = gen_reg_rtx (V4SImode); - emit_insn (gen_sse2_pshufd_1 (t1, operands[1], - GEN_INT (0), - GEN_INT (2), - GEN_INT (1), - GEN_INT (3))); - emit_insn (gen_sse2_pshufd_1 (t2, operands[2], - GEN_INT (0), - GEN_INT (2), - GEN_INT (1), - GEN_INT (3))); - emit_insn (gen_xop_mulv2div2di3_high (operands[0], t1, t2)); + if (TARGET_XOP) + { + emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2), + GEN_INT (1), GEN_INT (3))); + emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2), + GEN_INT (1), GEN_INT (3))); + emit_insn (gen_xop_mulv2div2di3_high (operands[0], t1, t2)); + DONE; + } + + emit_insn (gen_vec_interleave_highv4si (t1, op1, op1)); + emit_insn (gen_vec_interleave_highv4si (t2, op2, op2)); + emit_insn (gen_sse4_1_mulv2siv2di3 (operands[0], t1, t2)); DONE; }) @@ -5616,24 +5761,28 @@ [(match_operand:V2DI 0 "register_operand" "") (match_operand:V4SI 1 "register_operand" "") (match_operand:V4SI 2 "register_operand" "")] - "TARGET_XOP" + "TARGET_SSE4_1" { - rtx t1, t2; + rtx op1, op2, t1, t2; + op1 = operands[1]; + op2 = operands[2]; t1 = gen_reg_rtx (V4SImode); t2 = gen_reg_rtx (V4SImode); - emit_insn (gen_sse2_pshufd_1 (t1, operands[1], - GEN_INT (0), - GEN_INT (2), - GEN_INT (1), - GEN_INT (3))); - emit_insn (gen_sse2_pshufd_1 (t2, operands[2], - GEN_INT (0), - GEN_INT (2), - GEN_INT (1), - GEN_INT (3))); - emit_insn (gen_xop_mulv2div2di3_low (operands[0], t1, t2)); + if (TARGET_XOP) + { + emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2), + GEN_INT (1), GEN_INT (3))); + emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2), + GEN_INT (1), GEN_INT (3))); + emit_insn (gen_xop_mulv2div2di3_low (operands[0], t1, t2)); + DONE; + } + + emit_insn (gen_vec_interleave_lowv4si (t1, op1, op1)); + emit_insn (gen_vec_interleave_lowv4si (t2, op2, op2)); + emit_insn (gen_sse4_1_mulv2siv2di3 (operands[0], t1, t2)); DONE; }) @@ -5675,30 +5824,35 @@ DONE; }) -(define_expand "sdot_prodv8hi" - [(match_operand:V4SI 0 "register_operand" "") - (match_operand:V8HI 1 "register_operand" "") - (match_operand:V8HI 2 "register_operand" "") - (match_operand:V4SI 3 "register_operand" "")] +(define_expand "sdot_prod" + [(match_operand: 0 "register_operand" "") + (match_operand:VI2_AVX2 1 "register_operand" "") + (match_operand:VI2_AVX2 2 "register_operand" "") + (match_operand: 3 "register_operand" "")] "TARGET_SSE2" { - rtx t = gen_reg_rtx (V4SImode); - emit_insn (gen_sse2_pmaddwd (t, operands[1], operands[2])); - emit_insn (gen_addv4si3 (operands[0], operands[3], t)); + rtx t = gen_reg_rtx (mode); + emit_insn (gen__pmaddwd (t, operands[1], operands[2])); + emit_insn (gen_rtx_SET (VOIDmode, operands[0], + gen_rtx_PLUS (mode, + operands[3], t))); DONE; }) -(define_expand "udot_prodv4si" +(define_code_attr sse2_sse4_1 + [(zero_extend "sse2") (sign_extend "sse4_1")]) + +(define_expand "dot_prodv4si" [(match_operand:V2DI 0 "register_operand" "") - (match_operand:V4SI 1 "register_operand" "") + (any_extend:V2DI (match_operand:V4SI 1 "register_operand" "")) (match_operand:V4SI 2 "register_operand" "") (match_operand:V2DI 3 "register_operand" "")] - "TARGET_SSE2" + " == ZERO_EXTEND ? TARGET_SSE2 : TARGET_SSE4_1" { rtx t1, t2, t3, t4; t1 = gen_reg_rtx (V2DImode); - emit_insn (gen_sse2_umulv2siv2di3 (t1, operands[1], operands[2])); + emit_insn (gen__mulv2siv2di3 (t1, operands[1], operands[2])); emit_insn (gen_addv2di3 (t1, t1, operands[3])); t2 = gen_reg_rtx (V4SImode); @@ -5711,12 +5865,41 @@ GEN_INT (32))); t4 = gen_reg_rtx (V2DImode); - emit_insn (gen_sse2_umulv2siv2di3 (t4, t2, t3)); + emit_insn (gen__mulv2siv2di3 (t4, t2, t3)); emit_insn (gen_addv2di3 (operands[0], t1, t4)); DONE; }) +(define_expand "dot_prodv8si" + [(match_operand:V4DI 0 "register_operand" "") + (any_extend:V4DI (match_operand:V8SI 1 "register_operand" "")) + (match_operand:V8SI 2 "register_operand" "") + (match_operand:V4DI 3 "register_operand" "")] + "TARGET_AVX2" +{ + rtx t1, t2, t3, t4; + + t1 = gen_reg_rtx (V4DImode); + emit_insn (gen_avx2_mulv4siv4di3 (t1, operands[1], operands[2])); + emit_insn (gen_addv4di3 (t1, t1, operands[3])); + + t2 = gen_reg_rtx (V8SImode); + t3 = gen_reg_rtx (V8SImode); + emit_insn (gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, t2), + gen_lowpart (V2TImode, operands[1]), + GEN_INT (32))); + emit_insn (gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, t3), + gen_lowpart (V2TImode, operands[2]), + GEN_INT (32))); + + t4 = gen_reg_rtx (V4DImode); + emit_insn (gen_avx2_mulv4siv4di3 (t4, t2, t3)); + + emit_insn (gen_addv4di3 (operands[0], t1, t4)); + DONE; +}) + (define_insn "ashr3" [(set (match_operand:VI24_AVX2 0 "register_operand" "=x,x") (ashiftrt:VI24_AVX2 @@ -5736,49 +5919,15 @@ (set_attr "prefix" "orig,vex") (set_attr "mode" "")]) -(define_insn "lshr3" - [(set (match_operand:VI248_AVX2 0 "register_operand" "=x,x") - (lshiftrt:VI248_AVX2 - (match_operand:VI248_AVX2 1 "register_operand" "0,x") - (match_operand:SI 2 "nonmemory_operand" "xN,xN")))] - "TARGET_SSE2" - "@ - psrl\t{%2, %0|%0, %2} - vpsrl\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "isa" "noavx,avx") - (set_attr "type" "sseishft") - (set (attr "length_immediate") - (if_then_else (match_operand 2 "const_int_operand" "") - (const_string "1") - (const_string "0"))) - (set_attr "prefix_data16" "1,*") - (set_attr "prefix" "orig,vex") - (set_attr "mode" "")]) - -(define_insn "avx2_lshl3" - [(set (match_operand:VI248_256 0 "register_operand" "=x") - (ashift:VI248_256 - (match_operand:VI248_256 1 "register_operand" "x") - (match_operand:SI 2 "nonmemory_operand" "xN")))] - "TARGET_AVX2" - "vpsll\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "type" "sseishft") - (set_attr "prefix" "vex") - (set (attr "length_immediate") - (if_then_else (match_operand 2 "const_int_operand" "") - (const_string "1") - (const_string "0"))) - (set_attr "mode" "OI")]) - -(define_insn "ashl3" +(define_insn "3" [(set (match_operand:VI248_AVX2 0 "register_operand" "=x,x") - (ashift:VI248_AVX2 + (any_lshift:VI248_AVX2 (match_operand:VI248_AVX2 1 "register_operand" "0,x") (match_operand:SI 2 "nonmemory_operand" "xN,xN")))] "TARGET_SSE2" "@ - psll\t{%2, %0|%0, %2} - vpsll\t{%2, %1, %0|%0, %1, %2}" + p\t{%2, %0|%0, %2} + vp\t{%2, %1, %0|%0, %1, %2}" [(set_attr "isa" "noavx,avx") (set_attr "type" "sseishft") (set (attr "length_immediate") @@ -5887,8 +6036,9 @@ (define_expand "3" [(set (match_operand:VI8_AVX2 0 "register_operand" "") - (maxmin:VI8_AVX2 (match_operand:VI8_AVX2 1 "register_operand" "") - (match_operand:VI8_AVX2 2 "register_operand" "")))] + (maxmin:VI8_AVX2 + (match_operand:VI8_AVX2 1 "register_operand" "") + (match_operand:VI8_AVX2 2 "register_operand" "")))] "TARGET_SSE4_2" { enum rtx_code code; @@ -5921,8 +6071,9 @@ (define_expand "3" [(set (match_operand:VI124_128 0 "register_operand" "") - (smaxmin:VI124_128 (match_operand:VI124_128 1 "nonimmediate_operand" "") - (match_operand:VI124_128 2 "nonimmediate_operand" "")))] + (smaxmin:VI124_128 + (match_operand:VI124_128 1 "nonimmediate_operand" "") + (match_operand:VI124_128 2 "nonimmediate_operand" "")))] "TARGET_SSE2" { if (TARGET_SSE4_1 || mode == V8HImode) @@ -5990,8 +6141,9 @@ (define_expand "3" [(set (match_operand:VI124_128 0 "register_operand" "") - (umaxmin:VI124_128 (match_operand:VI124_128 1 "nonimmediate_operand" "") - (match_operand:VI124_128 2 "nonimmediate_operand" "")))] + (umaxmin:VI124_128 + (match_operand:VI124_128 1 "nonimmediate_operand" "") + (match_operand:VI124_128 2 "nonimmediate_operand" "")))] "TARGET_SSE2" { if (TARGET_SSE4_1 || mode == V16QImode) @@ -6188,9 +6340,9 @@ (if_then_else:V_256 (match_operator 3 "" [(match_operand:VI_256 4 "nonimmediate_operand" "") - (match_operand:VI_256 5 "nonimmediate_operand" "")]) - (match_operand:V_256 1 "general_operand" "") - (match_operand:V_256 2 "general_operand" "")))] + (match_operand:VI_256 5 "general_operand" "")]) + (match_operand:V_256 1 "" "") + (match_operand:V_256 2 "" "")))] "TARGET_AVX2 && (GET_MODE_NUNITS (mode) == GET_MODE_NUNITS (mode))" @@ -6205,9 +6357,9 @@ (if_then_else:V_128 (match_operator 3 "" [(match_operand:VI124_128 4 "nonimmediate_operand" "") - (match_operand:VI124_128 5 "nonimmediate_operand" "")]) - (match_operand:V_128 1 "general_operand" "") - (match_operand:V_128 2 "general_operand" "")))] + (match_operand:VI124_128 5 "general_operand" "")]) + (match_operand:V_128 1 "" "") + (match_operand:V_128 2 "" "")))] "TARGET_SSE2 && (GET_MODE_NUNITS (mode) == GET_MODE_NUNITS (mode))" @@ -6222,9 +6374,9 @@ (if_then_else:VI8F_128 (match_operator 3 "" [(match_operand:V2DI 4 "nonimmediate_operand" "") - (match_operand:V2DI 5 "nonimmediate_operand" "")]) - (match_operand:VI8F_128 1 "general_operand" "") - (match_operand:VI8F_128 2 "general_operand" "")))] + (match_operand:V2DI 5 "general_operand" "")]) + (match_operand:VI8F_128 1 "" "") + (match_operand:VI8F_128 2 "" "")))] "TARGET_SSE4_2" { bool ok = ix86_expand_int_vcond (operands); @@ -7481,58 +7633,36 @@ (set_attr "prefix" "maybe_vex,orig,vex,maybe_vex,orig,orig") (set_attr "mode" "V2SF,TI,TI,TI,V4SF,V2SF")]) -(define_insn "*vec_dupv4si_avx" - [(set (match_operand:V4SI 0 "register_operand" "=x,x") - (vec_duplicate:V4SI - (match_operand:SI 1 "nonimmediate_operand" " x,m")))] - "TARGET_AVX" - "@ - vpshufd\t{$0, %1, %0|%0, %1, 0} - vbroadcastss\t{%1, %0|%0, %1}" - [(set_attr "type" "sselog1,ssemov") - (set_attr "length_immediate" "1,0") - (set_attr "prefix_extra" "0,1") - (set_attr "prefix" "vex") - (set_attr "mode" "TI,V4SF")]) - (define_insn "*vec_dupv4si" - [(set (match_operand:V4SI 0 "register_operand" "=x,x") + [(set (match_operand:V4SI 0 "register_operand" "=x,x,x") (vec_duplicate:V4SI - (match_operand:SI 1 "register_operand" " x,0")))] + (match_operand:SI 1 "nonimmediate_operand" " x,m,0")))] "TARGET_SSE" "@ - pshufd\t{$0, %1, %0|%0, %1, 0} + %vpshufd\t{$0, %1, %0|%0, %1, 0} + vbroadcastss\t{%1, %0|%0, %1} shufps\t{$0, %0, %0|%0, %0, 0}" - [(set_attr "isa" "sse2,*") - (set_attr "type" "sselog1") - (set_attr "length_immediate" "1") - (set_attr "mode" "TI,V4SF")]) - -(define_insn "*vec_dupv2di_sse3" - [(set (match_operand:V2DI 0 "register_operand" "=x,x,x") - (vec_duplicate:V2DI - (match_operand:DI 1 "nonimmediate_operand" " 0,x,m")))] - "TARGET_SSE3" - "@ - punpcklqdq\t%0, %0 - vpunpcklqdq\t{%d1, %0|%0, %d1} - %vmovddup\t{%1, %0|%0, %1}" - [(set_attr "isa" "noavx,avx,*") - (set_attr "type" "sselog1") - (set_attr "prefix" "orig,vex,maybe_vex") - (set_attr "mode" "TI,TI,DF")]) + [(set_attr "isa" "sse2,avx,noavx") + (set_attr "type" "sselog1,ssemov,sselog1") + (set_attr "length_immediate" "1,0,1") + (set_attr "prefix_extra" "0,1,*") + (set_attr "prefix" "maybe_vex,vex,orig") + (set_attr "mode" "TI,V4SF,V4SF")]) (define_insn "*vec_dupv2di" - [(set (match_operand:V2DI 0 "register_operand" "=x,x") + [(set (match_operand:V2DI 0 "register_operand" "=x,x,x,x") (vec_duplicate:V2DI - (match_operand:DI 1 "register_operand" " 0,0")))] + (match_operand:DI 1 "nonimmediate_operand" " 0,x,m,0")))] "TARGET_SSE" "@ punpcklqdq\t%0, %0 + vpunpcklqdq\t{%d1, %0|%0, %d1} + %vmovddup\t{%1, %0|%0, %1} movlhps\t%0, %0" - [(set_attr "isa" "sse2,*") - (set_attr "type" "sselog1,ssemov") - (set_attr "mode" "TI,V4SF")]) + [(set_attr "isa" "sse2_noavx,avx,sse3,noavx") + (set_attr "type" "sselog1,sselog1,sselog1,ssemov") + (set_attr "prefix" "orig,vex,maybe_vex,orig") + (set_attr "mode" "TI,TI,DF,V4SF")]) (define_insn "*vec_concatv2si_sse4_1" [(set (match_operand:V2SI 0 "register_operand" "=x, x,x,x, x, *y,*y") @@ -7987,25 +8117,6 @@ (set_attr "prefix" "maybe_vex") (set_attr "memory" "store")]) -(define_expand "sse_sfence" - [(set (match_dup 0) - (unspec:BLK [(match_dup 0)] UNSPEC_SFENCE))] - "TARGET_SSE || TARGET_3DNOW_A" -{ - operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)); - MEM_VOLATILE_P (operands[0]) = 1; -}) - -(define_insn "*sse_sfence" - [(set (match_operand:BLK 0 "" "") - (unspec:BLK [(match_dup 0)] UNSPEC_SFENCE))] - "TARGET_SSE || TARGET_3DNOW_A" - "sfence" - [(set_attr "type" "sse") - (set_attr "length_address" "0") - (set_attr "atom_sse_attr" "fence") - (set_attr "memory" "unknown")]) - (define_insn "sse2_clflush" [(unspec_volatile [(match_operand 0 "address_operand" "p")] UNSPECV_CLFLUSH)] @@ -8015,43 +8126,6 @@ (set_attr "atom_sse_attr" "fence") (set_attr "memory" "unknown")]) -(define_expand "sse2_mfence" - [(set (match_dup 0) - (unspec:BLK [(match_dup 0)] UNSPEC_MFENCE))] - "TARGET_SSE2" -{ - operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)); - MEM_VOLATILE_P (operands[0]) = 1; -}) - -(define_insn "*sse2_mfence" - [(set (match_operand:BLK 0 "" "") - (unspec:BLK [(match_dup 0)] UNSPEC_MFENCE))] - "TARGET_64BIT || TARGET_SSE2" - "mfence" - [(set_attr "type" "sse") - (set_attr "length_address" "0") - (set_attr "atom_sse_attr" "fence") - (set_attr "memory" "unknown")]) - -(define_expand "sse2_lfence" - [(set (match_dup 0) - (unspec:BLK [(match_dup 0)] UNSPEC_LFENCE))] - "TARGET_SSE2" -{ - operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode)); - MEM_VOLATILE_P (operands[0]) = 1; -}) - -(define_insn "*sse2_lfence" - [(set (match_operand:BLK 0 "" "") - (unspec:BLK [(match_dup 0)] UNSPEC_LFENCE))] - "TARGET_SSE2" - "lfence" - [(set_attr "type" "sse") - (set_attr "length_address" "0") - (set_attr "atom_sse_attr" "lfence") - (set_attr "memory" "unknown")]) (define_insn "sse3_mwait" [(unspec_volatile [(match_operand:SI 0 "register_operand" "a") @@ -9672,7 +9746,7 @@ (const_int 6) (const_int 7)]))))] "TARGET_SSE4_1" - "%vpmovbw\t{%1, %0|%0, %1}" + "%vpmovbw\t{%1, %0|%0, %q1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "maybe_vex") @@ -9692,7 +9766,7 @@ (const_int 6) (const_int 7)]))))] "TARGET_AVX2" - "vpmovbd\t{%1, %0|%0, %1}" + "vpmovbd\t{%1, %0|%0, %q1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "vex") @@ -9708,7 +9782,7 @@ (const_int 2) (const_int 3)]))))] "TARGET_SSE4_1" - "%vpmovbd\t{%1, %0|%0, %1}" + "%vpmovbd\t{%1, %0|%0, %k1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "maybe_vex") @@ -9735,7 +9809,7 @@ (const_int 2) (const_int 3)]))))] "TARGET_SSE4_1" - "%vpmovwd\t{%1, %0|%0, %1}" + "%vpmovwd\t{%1, %0|%0, %q1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "maybe_vex") @@ -9751,7 +9825,7 @@ (const_int 2) (const_int 3)]))))] "TARGET_AVX2" - "vpmovbq\t{%1, %0|%0, %1}" + "vpmovbq\t{%1, %0|%0, %k1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "vex") @@ -9765,7 +9839,7 @@ (parallel [(const_int 0) (const_int 1)]))))] "TARGET_SSE4_1" - "%vpmovbq\t{%1, %0|%0, %1}" + "%vpmovbq\t{%1, %0|%0, %w1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "maybe_vex") @@ -9781,7 +9855,7 @@ (const_int 2) (const_int 3)]))))] "TARGET_AVX2" - "vpmovwq\t{%1, %0|%0, %1}" + "vpmovwq\t{%1, %0|%0, %q1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "vex") @@ -9795,7 +9869,7 @@ (parallel [(const_int 0) (const_int 1)]))))] "TARGET_SSE4_1" - "%vpmovwq\t{%1, %0|%0, %1}" + "%vpmovwq\t{%1, %0|%0, %k1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "maybe_vex") @@ -9819,7 +9893,7 @@ (parallel [(const_int 0) (const_int 1)]))))] "TARGET_SSE4_1" - "%vpmovdq\t{%1, %0|%0, %1}" + "%vpmovdq\t{%1, %0|%0, %q1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "maybe_vex") @@ -9884,6 +9958,60 @@ (set_attr "prefix" "maybe_vex") (set_attr "mode" "")]) +(define_expand "_round_sfix" + [(match_operand: 0 "register_operand" "") + (match_operand:VF1 1 "nonimmediate_operand" "") + (match_operand:SI 2 "const_0_to_15_operand" "")] + "TARGET_ROUND" +{ + rtx tmp = gen_reg_rtx (mode); + + emit_insn + (gen__round (tmp, operands[1], + operands[2])); + emit_insn + (gen_fix_trunc2 (operands[0], tmp)); + DONE; +}) + +(define_expand "_round_vec_pack_sfix" + [(match_operand: 0 "register_operand" "") + (match_operand:VF2 1 "nonimmediate_operand" "") + (match_operand:VF2 2 "nonimmediate_operand" "") + (match_operand:SI 3 "const_0_to_15_operand" "")] + "TARGET_ROUND" +{ + rtx tmp0, tmp1; + + if (mode == V2DFmode + && TARGET_AVX && !TARGET_PREFER_AVX128) + { + rtx tmp2 = gen_reg_rtx (V4DFmode); + + tmp0 = gen_reg_rtx (V4DFmode); + tmp1 = force_reg (V2DFmode, operands[1]); + + emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2])); + emit_insn (gen_avx_roundpd256 (tmp2, tmp0, operands[3])); + emit_insn (gen_fix_truncv4dfv4si2 (operands[0], tmp2)); + } + else + { + tmp0 = gen_reg_rtx (mode); + tmp1 = gen_reg_rtx (mode); + + emit_insn + (gen__round (tmp0, operands[1], + operands[3])); + emit_insn + (gen__round (tmp1, operands[2], + operands[3])); + emit_insn + (gen_vec_pack_sfix_trunc_ (operands[0], tmp0, tmp1)); + } + DONE; +}) + (define_insn "sse4_1_round" [(set (match_operand:VF_128 0 "register_operand" "=x,x") (vec_merge:VF_128 @@ -9908,7 +10036,7 @@ (define_expand "round2" [(set (match_dup 4) (plus:VF - (match_operand:VF 1 "nonimmediate_operand" "") + (match_operand:VF 1 "register_operand" "") (match_dup 3))) (set (match_operand:VF 0 "register_operand" "") (unspec:VF @@ -9939,6 +10067,54 @@ operands[5] = GEN_INT (ROUND_TRUNC); }) +(define_expand "round2_sfix" + [(match_operand: 0 "register_operand" "") + (match_operand:VF1 1 "register_operand" "")] + "TARGET_ROUND && !flag_trapping_math" +{ + rtx tmp = gen_reg_rtx (mode); + + emit_insn (gen_round2 (tmp, operands[1])); + + emit_insn + (gen_fix_trunc2 (operands[0], tmp)); + DONE; +}) + +(define_expand "round2_vec_pack_sfix" + [(match_operand: 0 "register_operand" "") + (match_operand:VF2 1 "register_operand" "") + (match_operand:VF2 2 "register_operand" "")] + "TARGET_ROUND && !flag_trapping_math" +{ + rtx tmp0, tmp1; + + if (mode == V2DFmode + && TARGET_AVX && !TARGET_PREFER_AVX128) + { + rtx tmp2 = gen_reg_rtx (V4DFmode); + + tmp0 = gen_reg_rtx (V4DFmode); + tmp1 = force_reg (V2DFmode, operands[1]); + + emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2])); + emit_insn (gen_roundv4df2 (tmp2, tmp0)); + emit_insn (gen_fix_truncv4dfv4si2 (operands[0], tmp2)); + } + else + { + tmp0 = gen_reg_rtx (mode); + tmp1 = gen_reg_rtx (mode); + + emit_insn (gen_round2 (tmp0, operands[1])); + emit_insn (gen_round2 (tmp1, operands[2])); + + emit_insn + (gen_vec_pack_sfix_trunc_ (operands[0], tmp0, tmp1)); + } + DONE; +}) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Intel SSE4.2 string/text processing instructions @@ -11188,42 +11364,110 @@ (set_attr "mode" "TI")]) ;; XOP packed shift instructions. -;; FIXME: add V2DI back in (define_expand "vlshr3" - [(match_operand:VI124_128 0 "register_operand" "") - (match_operand:VI124_128 1 "register_operand" "") - (match_operand:VI124_128 2 "register_operand" "")] + [(set (match_operand:VI12_128 0 "register_operand" "") + (lshiftrt:VI12_128 + (match_operand:VI12_128 1 "register_operand" "") + (match_operand:VI12_128 2 "nonimmediate_operand" "")))] "TARGET_XOP" { rtx neg = gen_reg_rtx (mode); emit_insn (gen_neg2 (neg, operands[2])); - emit_insn (gen_xop_lshl3 (operands[0], operands[1], neg)); + emit_insn (gen_xop_shl3 (operands[0], operands[1], neg)); DONE; }) +(define_expand "vlshr3" + [(set (match_operand:VI48_128 0 "register_operand" "") + (lshiftrt:VI48_128 + (match_operand:VI48_128 1 "register_operand" "") + (match_operand:VI48_128 2 "nonimmediate_operand" "")))] + "TARGET_AVX2 || TARGET_XOP" +{ + if (!TARGET_AVX2) + { + rtx neg = gen_reg_rtx (mode); + emit_insn (gen_neg2 (neg, operands[2])); + emit_insn (gen_xop_shl3 (operands[0], operands[1], neg)); + DONE; + } +}) + +(define_expand "vlshr3" + [(set (match_operand:VI48_256 0 "register_operand" "") + (lshiftrt:VI48_256 + (match_operand:VI48_256 1 "register_operand" "") + (match_operand:VI48_256 2 "nonimmediate_operand" "")))] + "TARGET_AVX2") + (define_expand "vashr3" - [(match_operand:VI124_128 0 "register_operand" "") - (match_operand:VI124_128 1 "register_operand" "") - (match_operand:VI124_128 2 "register_operand" "")] + [(set (match_operand:VI128_128 0 "register_operand" "") + (ashiftrt:VI128_128 + (match_operand:VI128_128 1 "register_operand" "") + (match_operand:VI128_128 2 "nonimmediate_operand" "")))] "TARGET_XOP" { rtx neg = gen_reg_rtx (mode); emit_insn (gen_neg2 (neg, operands[2])); - emit_insn (gen_xop_ashl3 (operands[0], operands[1], neg)); + emit_insn (gen_xop_sha3 (operands[0], operands[1], neg)); DONE; }) +(define_expand "vashrv4si3" + [(set (match_operand:V4SI 0 "register_operand" "") + (ashiftrt:V4SI (match_operand:V4SI 1 "register_operand" "") + (match_operand:V4SI 2 "nonimmediate_operand" "")))] + "TARGET_AVX2 || TARGET_XOP" +{ + if (!TARGET_AVX2) + { + rtx neg = gen_reg_rtx (V4SImode); + emit_insn (gen_negv4si2 (neg, operands[2])); + emit_insn (gen_xop_shav4si3 (operands[0], operands[1], neg)); + DONE; + } +}) + +(define_expand "vashrv8si3" + [(set (match_operand:V8SI 0 "register_operand" "") + (ashiftrt:V8SI (match_operand:V8SI 1 "register_operand" "") + (match_operand:V8SI 2 "nonimmediate_operand" "")))] + "TARGET_AVX2") + (define_expand "vashl3" - [(match_operand:VI124_128 0 "register_operand" "") - (match_operand:VI124_128 1 "register_operand" "") - (match_operand:VI124_128 2 "register_operand" "")] + [(set (match_operand:VI12_128 0 "register_operand" "") + (ashift:VI12_128 + (match_operand:VI12_128 1 "register_operand" "") + (match_operand:VI12_128 2 "nonimmediate_operand" "")))] "TARGET_XOP" { - emit_insn (gen_xop_ashl3 (operands[0], operands[1], operands[2])); + emit_insn (gen_xop_sha3 (operands[0], operands[1], operands[2])); DONE; }) -(define_insn "xop_ashl3" +(define_expand "vashl3" + [(set (match_operand:VI48_128 0 "register_operand" "") + (ashift:VI48_128 + (match_operand:VI48_128 1 "register_operand" "") + (match_operand:VI48_128 2 "nonimmediate_operand" "")))] + "TARGET_AVX2 || TARGET_XOP" +{ + if (!TARGET_AVX2) + { + operands[2] = force_reg (mode, operands[2]); + emit_insn (gen_xop_sha3 (operands[0], operands[1], operands[2])); + DONE; + } +}) + +(define_expand "vashl3" + [(set (match_operand:VI48_256 0 "register_operand" "") + (ashift:VI48_256 + (match_operand:VI48_256 1 "register_operand" "") + (match_operand:VI48_256 2 "nonimmediate_operand" "")))] + "TARGET_AVX2") + +(define_insn "xop_sha3" [(set (match_operand:VI_128 0 "register_operand" "=x,x") (if_then_else:VI_128 (ge:VI_128 @@ -11242,7 +11486,7 @@ (set_attr "prefix_extra" "2") (set_attr "mode" "TI")]) -(define_insn "xop_lshl3" +(define_insn "xop_shl3" [(set (match_operand:VI_128 0 "register_operand" "=x,x") (if_then_else:VI_128 (ge:VI_128 @@ -11261,104 +11505,90 @@ (set_attr "prefix_extra" "2") (set_attr "mode" "TI")]) -;; SSE2 doesn't have some shift varients, so define versions for XOP +;; SSE2 doesn't have some shift variants, so define versions for XOP (define_expand "ashlv16qi3" - [(match_operand:V16QI 0 "register_operand" "") - (match_operand:V16QI 1 "register_operand" "") - (match_operand:SI 2 "nonmemory_operand" "")] + [(set (match_operand:V16QI 0 "register_operand" "") + (ashift:V16QI + (match_operand:V16QI 1 "register_operand" "") + (match_operand:SI 2 "nonmemory_operand" "")))] "TARGET_XOP" { - rtvec vs = rtvec_alloc (16); - rtx par = gen_rtx_PARALLEL (V16QImode, vs); rtx reg = gen_reg_rtx (V16QImode); + rtx par; int i; - for (i = 0; i < 16; i++) - RTVEC_ELT (vs, i) = operands[2]; - emit_insn (gen_vec_initv16qi (reg, par)); - emit_insn (gen_xop_ashlv16qi3 (operands[0], operands[1], reg)); - DONE; -}) - -(define_expand "lshlv16qi3" - [(match_operand:V16QI 0 "register_operand" "") - (match_operand:V16QI 1 "register_operand" "") - (match_operand:SI 2 "nonmemory_operand" "")] - "TARGET_XOP" -{ - rtvec vs = rtvec_alloc (16); - rtx par = gen_rtx_PARALLEL (V16QImode, vs); - rtx reg = gen_reg_rtx (V16QImode); - int i; + par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16)); for (i = 0; i < 16; i++) - RTVEC_ELT (vs, i) = operands[2]; + XVECEXP (par, 0, i) = operands[2]; emit_insn (gen_vec_initv16qi (reg, par)); - emit_insn (gen_xop_lshlv16qi3 (operands[0], operands[1], reg)); + emit_insn (gen_xop_shav16qi3 (operands[0], operands[1], reg)); DONE; }) -(define_expand "ashrv16qi3" - [(match_operand:V16QI 0 "register_operand" "") - (match_operand:V16QI 1 "register_operand" "") - (match_operand:SI 2 "nonmemory_operand" "")] +(define_expand "v16qi3" + [(set (match_operand:V16QI 0 "register_operand" "") + (any_shiftrt:V16QI + (match_operand:V16QI 1 "register_operand" "") + (match_operand:SI 2 "nonmemory_operand" "")))] "TARGET_XOP" { - rtvec vs = rtvec_alloc (16); - rtx par = gen_rtx_PARALLEL (V16QImode, vs); rtx reg = gen_reg_rtx (V16QImode); + rtx par; + bool negate = false; + rtx (*shift_insn)(rtx, rtx, rtx); int i; - rtx ele = ((CONST_INT_P (operands[2])) - ? GEN_INT (- INTVAL (operands[2])) - : operands[2]); + if (CONST_INT_P (operands[2])) + operands[2] = GEN_INT (-INTVAL (operands[2])); + else + negate = true; + + par = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16)); for (i = 0; i < 16; i++) - RTVEC_ELT (vs, i) = ele; + XVECEXP (par, 0, i) = operands[2]; emit_insn (gen_vec_initv16qi (reg, par)); - if (!CONST_INT_P (operands[2])) - { - rtx neg = gen_reg_rtx (V16QImode); - emit_insn (gen_negv16qi2 (neg, reg)); - emit_insn (gen_xop_ashlv16qi3 (operands[0], operands[1], neg)); - } + if (negate) + emit_insn (gen_negv16qi2 (reg, reg)); + + if ( == LSHIFTRT) + shift_insn = gen_xop_shlv16qi3; else - emit_insn (gen_xop_ashlv16qi3 (operands[0], operands[1], reg)); + shift_insn = gen_xop_shav16qi3; + emit_insn (shift_insn (operands[0], operands[1], reg)); DONE; }) (define_expand "ashrv2di3" - [(match_operand:V2DI 0 "register_operand" "") - (match_operand:V2DI 1 "register_operand" "") - (match_operand:DI 2 "nonmemory_operand" "")] + [(set (match_operand:V2DI 0 "register_operand" "") + (ashiftrt:V2DI + (match_operand:V2DI 1 "register_operand" "") + (match_operand:DI 2 "nonmemory_operand" "")))] "TARGET_XOP" { - rtvec vs = rtvec_alloc (2); - rtx par = gen_rtx_PARALLEL (V2DImode, vs); rtx reg = gen_reg_rtx (V2DImode); - rtx ele; + rtx par; + bool negate = false; + int i; if (CONST_INT_P (operands[2])) - ele = GEN_INT (- INTVAL (operands[2])); - else if (GET_MODE (operands[2]) != DImode) - { - rtx move = gen_reg_rtx (DImode); - ele = gen_reg_rtx (DImode); - convert_move (move, operands[2], false); - emit_insn (gen_negdi2 (ele, move)); - } + operands[2] = GEN_INT (-INTVAL (operands[2])); else - { - ele = gen_reg_rtx (DImode); - emit_insn (gen_negdi2 (ele, operands[2])); - } + negate = true; + + par = gen_rtx_PARALLEL (V2DImode, rtvec_alloc (2)); + for (i = 0; i < 2; i++) + XVECEXP (par, 0, i) = operands[2]; - RTVEC_ELT (vs, 0) = ele; - RTVEC_ELT (vs, 1) = ele; emit_insn (gen_vec_initv2di (reg, par)); - emit_insn (gen_xop_ashlv2di3 (operands[0], operands[1], reg)); + + if (negate) + emit_insn (gen_negv2di2 (reg, reg)); + + emit_insn (gen_xop_shav2di3 (operands[0], operands[1], reg)); DONE; }) @@ -11650,11 +11880,11 @@ (define_insn "avx2_permvarv8si" [(set (match_operand:V8SI 0 "register_operand" "=x") (unspec:V8SI - [(match_operand:V8SI 1 "register_operand" "x") - (match_operand:V8SI 2 "nonimmediate_operand" "xm")] + [(match_operand:V8SI 1 "nonimmediate_operand" "xm") + (match_operand:V8SI 2 "register_operand" "x")] UNSPEC_VPERMSI))] "TARGET_AVX2" - "vpermd\t{%2, %1, %0|%0, %1, %2}" + "vpermd\t{%1, %2, %0|%0, %2, %1}" [(set_attr "type" "sselog") (set_attr "prefix" "vex") (set_attr "mode" "OI")]) @@ -11675,11 +11905,11 @@ (define_insn "avx2_permvarv8sf" [(set (match_operand:V8SF 0 "register_operand" "=x") (unspec:V8SF - [(match_operand:V8SF 1 "register_operand" "x") - (match_operand:V8SF 2 "nonimmediate_operand" "xm")] + [(match_operand:V8SF 1 "nonimmediate_operand" "xm") + (match_operand:V8SI 2 "register_operand" "x")] UNSPEC_VPERMSF))] "TARGET_AVX2" - "vpermps\t{%2, %1, %0|%0, %1, %2}" + "vpermps\t{%1, %2, %0|%0, %2, %1}" [(set_attr "type" "sselog") (set_attr "prefix" "vex") (set_attr "mode" "OI")]) @@ -12013,6 +12243,10 @@ && avx_vperm2f128_parallel (operands[3], mode)" { int mask = avx_vperm2f128_parallel (operands[3], mode) - 1; + if (mask == 0x12) + return "vinsert\t{$0, %x2, %1, %0|%0, %1, %x2, 0}"; + if (mask == 0x20) + return "vinsert\t{$1, %x2, %1, %0|%0, %1, %x2, 1}"; operands[3] = GEN_INT (mask); return "vperm2\t{%3, %2, %1, %0|%0, %1, %2, %3}"; } @@ -12219,53 +12453,32 @@ (set_attr "prefix" "vex") (set_attr "mode" "OI")]) -(define_expand "_maskload" - [(set (match_operand:V48_AVX2 0 "register_operand" "") +(define_insn "_maskload" + [(set (match_operand:V48_AVX2 0 "register_operand" "=x") (unspec:V48_AVX2 - [(match_operand: 2 "register_operand" "") - (match_operand:V48_AVX2 1 "memory_operand" "") - (match_dup 0)] - UNSPEC_MASKMOV))] - "TARGET_AVX") - -(define_expand "_maskstore" - [(set (match_operand:V48_AVX2 0 "memory_operand" "") - (unspec:V48_AVX2 - [(match_operand: 1 "register_operand" "") - (match_operand:V48_AVX2 2 "register_operand" "") - (match_dup 0)] - UNSPEC_MASKMOV))] - "TARGET_AVX") - -(define_insn "*avx2_maskmov" - [(set (match_operand:VI48_AVX2 0 "nonimmediate_operand" "=x,m") - (unspec:VI48_AVX2 - [(match_operand: 1 "register_operand" "x,x") - (match_operand:VI48_AVX2 2 "nonimmediate_operand" "m,x") - (match_dup 0)] + [(match_operand: 2 "register_operand" "x") + (match_operand:V48_AVX2 1 "memory_operand" "m")] UNSPEC_MASKMOV))] - "TARGET_AVX2 - && (REG_P (operands[0]) == MEM_P (operands[2]))" - "vpmaskmov\t{%2, %1, %0|%0, %1, %2}" + "TARGET_AVX" + "vmaskmov\t{%1, %2, %0|%0, %2, %1}" [(set_attr "type" "sselog1") (set_attr "prefix_extra" "1") (set_attr "prefix" "vex") (set_attr "mode" "")]) -(define_insn "*avx_maskmov" - [(set (match_operand:VF 0 "nonimmediate_operand" "=x,m") - (unspec:VF - [(match_operand: 1 "register_operand" "x,x") - (match_operand:VF 2 "nonimmediate_operand" "m,x") +(define_insn "_maskstore" + [(set (match_operand:V48_AVX2 0 "memory_operand" "=m") + (unspec:V48_AVX2 + [(match_operand: 1 "register_operand" "x") + (match_operand:V48_AVX2 2 "register_operand" "x") (match_dup 0)] UNSPEC_MASKMOV))] - "TARGET_AVX - && (REG_P (operands[0]) == MEM_P (operands[2]))" - "vmaskmov\t{%2, %1, %0|%0, %1, %2}" + "TARGET_AVX" + "vmaskmov\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sselog1") (set_attr "prefix_extra" "1") (set_attr "prefix" "vex") - (set_attr "mode" "")]) + (set_attr "mode" "")]) (define_insn_and_split "avx__" [(set (match_operand:AVX256MODE2P 0 "nonimmediate_operand" "=x,m") @@ -12345,248 +12558,27 @@ DONE; }) -(define_insn "avx2_ashrvv8si" - [(set (match_operand:V8SI 0 "register_operand" "=x") - (vec_concat:V8SI - (vec_concat:V4SI - (vec_concat:V2SI - (ashiftrt:SI - (vec_select:SI - (match_operand:V8SI 1 "register_operand" "x") - (parallel [(const_int 0)])) - (vec_select:SI - (match_operand:V8SI 2 "nonimmediate_operand" "xm") - (parallel [(const_int 0)]))) - (ashiftrt:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 1)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 1)])))) - (vec_concat:V2SI - (ashiftrt:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 2)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 2)]))) - (ashiftrt:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 3)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 3)]))))) - (vec_concat:V4SI - (vec_concat:V2SI - (ashiftrt:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 0)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 0)]))) - (ashiftrt:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 1)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 1)])))) - (vec_concat:V2SI - (ashiftrt:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 2)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 2)]))) - (ashiftrt:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 3)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 3)])))))))] - "TARGET_AVX2" - "vpsravd\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "type" "sseishft") - (set_attr "prefix" "vex") - (set_attr "mode" "OI")]) - -(define_insn "avx2_ashrvv4si" - [(set (match_operand:V4SI 0 "register_operand" "=x") - (vec_concat:V4SI - (vec_concat:V2SI - (ashiftrt:SI - (vec_select:SI - (match_operand:V4SI 1 "register_operand" "x") - (parallel [(const_int 0)])) - (vec_select:SI - (match_operand:V4SI 2 "nonimmediate_operand" "xm") - (parallel [(const_int 0)]))) - (ashiftrt:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 1)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 1)])))) - (vec_concat:V2SI - (ashiftrt:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 2)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 2)]))) - (ashiftrt:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 3)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 3)]))))))] +(define_insn "avx2_ashrv" + [(set (match_operand:VI4_AVX2 0 "register_operand" "=x") + (ashiftrt:VI4_AVX2 + (match_operand:VI4_AVX2 1 "register_operand" "x") + (match_operand:VI4_AVX2 2 "nonimmediate_operand" "xm")))] "TARGET_AVX2" "vpsravd\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sseishft") (set_attr "prefix" "vex") - (set_attr "mode" "TI")]) - -(define_insn "avx2_vv8si" - [(set (match_operand:V8SI 0 "register_operand" "=x") - (vec_concat:V8SI - (vec_concat:V4SI - (vec_concat:V2SI - (lshift:SI - (vec_select:SI - (match_operand:V8SI 1 "register_operand" "x") - (parallel [(const_int 0)])) - (vec_select:SI - (match_operand:V8SI 2 "nonimmediate_operand" "xm") - (parallel [(const_int 0)]))) - (lshift:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 1)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 1)])))) - (vec_concat:V2SI - (lshift:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 2)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 2)]))) - (lshift:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 3)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 3)]))))) - (vec_concat:V4SI - (vec_concat:V2SI - (lshift:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 0)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 0)]))) - (lshift:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 1)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 1)])))) - (vec_concat:V2SI - (lshift:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 2)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 2)]))) - (lshift:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 3)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 3)])))))))] - "TARGET_AVX2" - "vpvd\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "type" "sseishft") - (set_attr "prefix" "vex") - (set_attr "mode" "OI")]) - -(define_insn "avx2_v" - [(set (match_operand:VI4SD_AVX2 0 "register_operand" "=x") - (vec_concat:VI4SD_AVX2 - (vec_concat: - (lshift: - (vec_select: - (match_operand:VI4SD_AVX2 1 "register_operand" "x") - (parallel [(const_int 0)])) - (vec_select: - (match_operand:VI4SD_AVX2 2 "nonimmediate_operand" "xm") - (parallel [(const_int 0)]))) - (lshift: - (vec_select: - (match_dup 1) - (parallel [(const_int 1)])) - (vec_select: - (match_dup 2) - (parallel [(const_int 1)])))) - (vec_concat: - (lshift: - (vec_select: - (match_dup 1) - (parallel [(const_int 2)])) - (vec_select: - (match_dup 2) - (parallel [(const_int 2)]))) - (lshift: - (vec_select: - (match_dup 1) - (parallel [(const_int 3)])) - (vec_select: - (match_dup 2) - (parallel [(const_int 3)]))))))] - "TARGET_AVX2" - "vpv\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "type" "sseishft") - (set_attr "prefix" "vex") (set_attr "mode" "")]) -(define_insn "avx2_vv2di" - [(set (match_operand:V2DI 0 "register_operand" "=x") - (vec_concat:V2DI - (lshift:DI - (vec_select:DI - (match_operand:V2DI 1 "register_operand" "x") - (parallel [(const_int 0)])) - (vec_select:DI - (match_operand:V2DI 2 "nonimmediate_operand" "xm") - (parallel [(const_int 0)]))) - (lshift:DI - (vec_select:DI - (match_dup 1) - (parallel [(const_int 1)])) - (vec_select:DI - (match_dup 2) - (parallel [(const_int 1)])))))] +(define_insn "avx2_v" + [(set (match_operand:VI48_AVX2 0 "register_operand" "=x") + (any_lshift:VI48_AVX2 + (match_operand:VI48_AVX2 1 "register_operand" "x") + (match_operand:VI48_AVX2 2 "nonimmediate_operand" "xm")))] "TARGET_AVX2" - "vpvq\t{%2, %1, %0|%0, %1, %2}" + "vpv\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sseishft") (set_attr "prefix" "vex") - (set_attr "mode" "TI")]) + (set_attr "mode" "")]) (define_insn "avx_vec_concat" [(set (match_operand:V_256 0 "register_operand" "=x,x") @@ -12700,38 +12692,79 @@ ;; For gather* insn patterns (define_mode_iterator VEC_GATHER_MODE [V2DI V2DF V4DI V4DF V4SI V4SF V8SI V8SF]) -(define_mode_attr VEC_GATHER_MODE +(define_mode_attr VEC_GATHER_IDXSI [(V2DI "V4SI") (V2DF "V4SI") (V4DI "V4SI") (V4DF "V4SI") (V4SI "V4SI") (V4SF "V4SI") (V8SI "V8SI") (V8SF "V8SI")]) +(define_mode_attr VEC_GATHER_IDXDI + [(V2DI "V2DI") (V2DF "V2DI") + (V4DI "V4DI") (V4DF "V4DI") + (V4SI "V2DI") (V4SF "V2DI") + (V8SI "V4DI") (V8SF "V4DI")]) +(define_mode_attr VEC_GATHER_SRCDI + [(V2DI "V2DI") (V2DF "V2DF") + (V4DI "V4DI") (V4DF "V4DF") + (V4SI "V4SI") (V4SF "V4SF") + (V8SI "V4SI") (V8SF "V4SF")]) (define_expand "avx2_gathersi" [(parallel [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "") (unspec:VEC_GATHER_MODE [(match_operand:VEC_GATHER_MODE 1 "register_operand" "") - (match_operand 2 "register_operand" "") + (mem: + (match_par_dup 7 + [(match_operand 2 "vsib_address_operand" "") + (match_operand: + 3 "register_operand" "") + (match_operand:SI 5 "const1248_operand " "")])) (mem:BLK (scratch)) - (match_operand: 3 "register_operand" "") - (match_operand:VEC_GATHER_MODE 4 "register_operand" "") - (match_operand:SI 5 "const1248_operand " "")] + (match_operand:VEC_GATHER_MODE 4 "register_operand" "")] UNSPEC_GATHER)) (clobber (match_scratch:VEC_GATHER_MODE 6 ""))])] - "TARGET_AVX2") + "TARGET_AVX2" +{ + operands[7] + = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[2], operands[3], + operands[5]), UNSPEC_VSIBADDR); +}) (define_insn "*avx2_gathersi" [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=&x") (unspec:VEC_GATHER_MODE [(match_operand:VEC_GATHER_MODE 2 "register_operand" "0") - (match_operand:P 3 "register_operand" "r") + (match_operator: 7 "vsib_mem_operator" + [(unspec:P + [(match_operand:P 3 "vsib_address_operand" "p") + (match_operand: 4 "register_operand" "x") + (match_operand:SI 6 "const1248_operand" "n")] + UNSPEC_VSIBADDR)]) (mem:BLK (scratch)) - (match_operand: 4 "register_operand" "x") - (match_operand:VEC_GATHER_MODE 5 "register_operand" "1") - (match_operand:SI 6 "const1248_operand" "n")] + (match_operand:VEC_GATHER_MODE 5 "register_operand" "1")] UNSPEC_GATHER)) (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))] "TARGET_AVX2" - "vgatherd\t{%1, (%3, %4, %p6), %0|%0, (%3, %4, %p6), %1}" + "vgatherd\t{%1, %7, %0|%0, %7, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "*avx2_gathersi_2" + [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=&x") + (unspec:VEC_GATHER_MODE + [(pc) + (match_operator: 6 "vsib_mem_operator" + [(unspec:P + [(match_operand:P 2 "vsib_address_operand" "p") + (match_operand: 3 "register_operand" "x") + (match_operand:SI 5 "const1248_operand" "n")] + UNSPEC_VSIBADDR)]) + (mem:BLK (scratch)) + (match_operand:VEC_GATHER_MODE 4 "register_operand" "1")] + UNSPEC_GATHER)) + (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))] + "TARGET_AVX2" + "vgatherd\t{%1, %6, %0|%0, %6, %1}" [(set_attr "type" "ssemov") (set_attr "prefix" "vex") (set_attr "mode" "")]) @@ -12739,61 +12772,111 @@ (define_expand "avx2_gatherdi" [(parallel [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "") (unspec:VEC_GATHER_MODE - [(match_operand:VEC_GATHER_MODE 1 "register_operand" "") - (match_operand 2 "register_operand" "") + [(match_operand: 1 "register_operand" "") + (mem: + (match_par_dup 7 + [(match_operand 2 "vsib_address_operand" "") + (match_operand: + 3 "register_operand" "") + (match_operand:SI 5 "const1248_operand " "")])) (mem:BLK (scratch)) - (match_operand: 3 "register_operand" "") - (match_operand:VEC_GATHER_MODE 4 "register_operand" "") - (match_operand:SI 5 "const1248_operand " "")] + (match_operand: + 4 "register_operand" "")] UNSPEC_GATHER)) (clobber (match_scratch:VEC_GATHER_MODE 6 ""))])] - "TARGET_AVX2") + "TARGET_AVX2" +{ + operands[7] + = gen_rtx_UNSPEC (Pmode, gen_rtvec (3, operands[2], operands[3], + operands[5]), UNSPEC_VSIBADDR); +}) (define_insn "*avx2_gatherdi" - [(set (match_operand:AVXMODE48P_DI 0 "register_operand" "=&x") - (unspec:AVXMODE48P_DI - [(match_operand:AVXMODE48P_DI 2 "register_operand" "0") - (match_operand:P 3 "register_operand" "r") + [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=&x") + (unspec:VEC_GATHER_MODE + [(match_operand: 2 "register_operand" "0") + (match_operator: 7 "vsib_mem_operator" + [(unspec:P + [(match_operand:P 3 "vsib_address_operand" "p") + (match_operand: 4 "register_operand" "x") + (match_operand:SI 6 "const1248_operand" "n")] + UNSPEC_VSIBADDR)]) (mem:BLK (scratch)) - (match_operand: 4 "register_operand" "x") - (match_operand:AVXMODE48P_DI 5 "register_operand" "1") - (match_operand:SI 6 "const1248_operand" "n")] + (match_operand: 5 "register_operand" "1")] UNSPEC_GATHER)) - (clobber (match_scratch:AVXMODE48P_DI 1 "=&x"))] + (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))] "TARGET_AVX2" - "vgatherq\t{%1, (%3, %4, %p6), %0|%0, (%3, %4, %p6), %1}" + "vgatherq\t{%5, %7, %2|%2, %7, %5}" [(set_attr "type" "ssemov") (set_attr "prefix" "vex") (set_attr "mode" "")]) -;; Special handling for VEX.256 with float arguments -;; since there're still xmms as operands -(define_expand "avx2_gatherdi256" - [(parallel [(set (match_operand:VI4F_128 0 "register_operand" "") - (unspec:VI4F_128 - [(match_operand:VI4F_128 1 "register_operand" "") - (match_operand 2 "register_operand" "") - (mem:BLK (scratch)) - (match_operand:V4DI 3 "register_operand" "") - (match_operand:VI4F_128 4 "register_operand" "") - (match_operand:SI 5 "const1248_operand " "")] - UNSPEC_GATHER)) - (clobber (match_scratch:VI4F_128 6 ""))])] - "TARGET_AVX2") - -(define_insn "*avx2_gatherdi256" - [(set (match_operand:VI4F_128 0 "register_operand" "=x") - (unspec:VI4F_128 - [(match_operand:VI4F_128 2 "register_operand" "0") - (match_operand:P 3 "register_operand" "r") +(define_insn "*avx2_gatherdi_2" + [(set (match_operand:VEC_GATHER_MODE 0 "register_operand" "=&x") + (unspec:VEC_GATHER_MODE + [(pc) + (match_operator: 6 "vsib_mem_operator" + [(unspec:P + [(match_operand:P 2 "vsib_address_operand" "p") + (match_operand: 3 "register_operand" "x") + (match_operand:SI 5 "const1248_operand" "n")] + UNSPEC_VSIBADDR)]) (mem:BLK (scratch)) - (match_operand:V4DI 4 "register_operand" "x") - (match_operand:VI4F_128 5 "register_operand" "1") - (match_operand:SI 6 "const1248_operand" "n")] - UNSPEC_GATHER)) - (clobber (match_scratch:VI4F_128 1 "=&x"))] + (match_operand: 4 "register_operand" "1")] + UNSPEC_GATHER)) + (clobber (match_scratch:VEC_GATHER_MODE 1 "=&x"))] + "TARGET_AVX2" +{ + if (mode != mode) + return "vgatherq\t{%4, %6, %x0|%x0, %6, %4}"; + return "vgatherq\t{%4, %6, %0|%0, %6, %4}"; +} + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "*avx2_gatherdi_3" + [(set (match_operand: 0 "register_operand" "=&x") + (vec_select: + (unspec:VI4F_256 + [(match_operand: 2 "register_operand" "0") + (match_operator: 7 "vsib_mem_operator" + [(unspec:P + [(match_operand:P 3 "vsib_address_operand" "p") + (match_operand: 4 "register_operand" "x") + (match_operand:SI 6 "const1248_operand" "n")] + UNSPEC_VSIBADDR)]) + (mem:BLK (scratch)) + (match_operand: 5 "register_operand" "1")] + UNSPEC_GATHER) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)]))) + (clobber (match_scratch:VI4F_256 1 "=&x"))] + "TARGET_AVX2" + "vgatherq\t{%5, %7, %0|%0, %7, %5}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "*avx2_gatherdi_4" + [(set (match_operand: 0 "register_operand" "=&x") + (vec_select: + (unspec:VI4F_256 + [(pc) + (match_operator: 6 "vsib_mem_operator" + [(unspec:P + [(match_operand:P 2 "vsib_address_operand" "p") + (match_operand: 3 "register_operand" "x") + (match_operand:SI 5 "const1248_operand" "n")] + UNSPEC_VSIBADDR)]) + (mem:BLK (scratch)) + (match_operand: 4 "register_operand" "1")] + UNSPEC_GATHER) + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)]))) + (clobber (match_scratch:VI4F_256 1 "=&x"))] "TARGET_AVX2" - "vgatherq\t{%1, (%3, %4, %p6), %0|%0, (%3, %4, %p6), %1}" + "vgatherq\t{%4, %6, %0|%0, %6, %4}" [(set_attr "type" "ssemov") (set_attr "prefix" "vex") (set_attr "mode" "")])