X-Git-Url: http://git.sourceforge.jp/view?a=blobdiff_plain;f=gcc%2Fconfig%2Fi386%2Fsse.md;h=492cf21cd0653854c36213a9971755e808e068f4;hb=ba2bc388c57dd5555464a08aa1af3659b5ab2fc0;hp=d04902b67598fafe655404731c2ae484c67936b3;hpb=4030506f06061684b0a0f3a315094c5abbbf45d2;p=pf3gnuchains%2Fgcc-fork.git

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index d04902b6759..492cf21cd06 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1,5 +1,5 @@
 ;; GCC machine description for SSE instructions
-;; Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011
+;; Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
 ;; Free Software Foundation, Inc.
 ;;
 ;; This file is part of GCC.
@@ -18,6 +18,86 @@
 ;; along with GCC; see the file COPYING3.  If not see
 ;; <http://www.gnu.org/licenses/>.
 
+(define_c_enum "unspec" [
+  ;; SSE
+  UNSPEC_MOVNT
+  UNSPEC_LOADU
+  UNSPEC_STOREU
+
+  ;; SSE3
+  UNSPEC_LDDQU
+
+  ;; SSSE3
+  UNSPEC_PSHUFB
+  UNSPEC_PSIGN
+  UNSPEC_PALIGNR
+
+  ;; For SSE4A support
+  UNSPEC_EXTRQI
+  UNSPEC_EXTRQ
+  UNSPEC_INSERTQI
+  UNSPEC_INSERTQ
+
+  ;; For SSE4.1 support
+  UNSPEC_BLENDV
+  UNSPEC_INSERTPS
+  UNSPEC_DP
+  UNSPEC_MOVNTDQA
+  UNSPEC_MPSADBW
+  UNSPEC_PHMINPOSUW
+  UNSPEC_PTEST
+
+  ;; For SSE4.2 support
+  UNSPEC_PCMPESTR
+  UNSPEC_PCMPISTR
+
+  ;; For FMA4 support
+  UNSPEC_FMADDSUB
+  UNSPEC_XOP_UNSIGNED_CMP
+  UNSPEC_XOP_TRUEFALSE
+  UNSPEC_XOP_PERMUTE
+  UNSPEC_FRCZ
+
+  ;; For AES support
+  UNSPEC_AESENC
+  UNSPEC_AESENCLAST
+  UNSPEC_AESDEC
+  UNSPEC_AESDECLAST
+  UNSPEC_AESIMC
+  UNSPEC_AESKEYGENASSIST
+
+  ;; For PCLMUL support
+  UNSPEC_PCLMUL
+
+  ;; For AVX support
+  UNSPEC_PCMP
+  UNSPEC_VPERMIL
+  UNSPEC_VPERMIL2
+  UNSPEC_VPERMIL2F128
+  UNSPEC_CAST
+  UNSPEC_VTESTP
+  UNSPEC_VCVTPH2PS
+  UNSPEC_VCVTPS2PH
+
+  ;; For AVX2 support
+  UNSPEC_VPERMSI
+  UNSPEC_VPERMDF
+  UNSPEC_VPERMSF
+  UNSPEC_VPERMTI
+  UNSPEC_GATHER
+  UNSPEC_VSIBADDR
+])

+(define_c_enum "unspecv" [
+  UNSPECV_LDMXCSR
+  UNSPECV_STMXCSR
+  UNSPECV_CLFLUSH
+  UNSPECV_MONITOR
+  UNSPECV_MWAIT
+  UNSPECV_VZEROALL
+  UNSPECV_VZEROUPPER
+])
+
 ;; All vector modes including V?TImode, used in move patterns.
 (define_mode_iterator V16
   [(V32QI "TARGET_AVX") V16QI
@@ -316,8 +396,6 @@
 ;; Mix-n-match
 (define_mode_iterator AVX256MODE2P [V8SI V8SF V4DF])
 
-(define_mode_iterator FMAMODE [SF DF V4SF V2DF V8SF V4DF])
-
 ;; Mapping of immediate bits for blend instructions
 (define_mode_attr blendbits
   [(V8SF "255") (V4SF "15") (V4DF "15") (V2DF "3")])
@@ -503,44 +581,51 @@
   DONE;
 })
 
-(define_expand "<sse>_movu<ssemodesuffix><avxsizesuffix>"
-  [(set (match_operand:VF 0 "nonimmediate_operand" "")
+(define_insn "<sse>_loadu<ssemodesuffix><avxsizesuffix>"
+  [(set (match_operand:VF 0 "register_operand" "=x")
 	(unspec:VF
-	  [(match_operand:VF 1 "nonimmediate_operand" "")]
-	  UNSPEC_MOVU))]
+	  [(match_operand:VF 1 "memory_operand" "m")]
+	  UNSPEC_LOADU))]
   "TARGET_SSE"
-{
-  if (MEM_P (operands[0]) && MEM_P (operands[1]))
-    operands[1] = force_reg (<MODE>mode, operands[1]);
-})
+  "%vmovu<ssemodesuffix>\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "movu" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<MODE>")])
 
-(define_insn "*<sse>_movu<ssemodesuffix><avxsizesuffix>"
-  [(set (match_operand:VF 0 "nonimmediate_operand" "=x,m")
+(define_insn "<sse>_storeu<ssemodesuffix><avxsizesuffix>"
+  [(set (match_operand:VF 0 "memory_operand" "=m")
 	(unspec:VF
-	  [(match_operand:VF 1 "nonimmediate_operand" "xm,x")]
-	  UNSPEC_MOVU))]
-  "TARGET_SSE && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+	  [(match_operand:VF 1 "register_operand" "x")]
+	  UNSPEC_STOREU))]
+  "TARGET_SSE"
   "%vmovu<ssemodesuffix>\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssemov")
    (set_attr "movu" "1")
   (set_attr "prefix" "maybe_vex")
   (set_attr "mode" "<MODE>")])
 
-(define_expand "<sse2>_movdqu<avxsizesuffix>"
-  [(set (match_operand:VI1 0 "nonimmediate_operand" "")
-	(unspec:VI1 [(match_operand:VI1 1 "nonimmediate_operand" "")]
-		    UNSPEC_MOVU))]
+(define_insn "<sse2>_loaddqu<avxsizesuffix>"
+  [(set (match_operand:VI1 0 "register_operand" "=x")
+	(unspec:VI1 [(match_operand:VI1 1 "memory_operand" "m")]
+		    UNSPEC_LOADU))]
   "TARGET_SSE2"
-{
-  if (MEM_P (operands[0]) && MEM_P (operands[1]))
-    operands[1] = force_reg (<MODE>mode, operands[1]);
-})
+  "%vmovdqu\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "movu" "1")
+   (set (attr "prefix_data16")
+     (if_then_else
+       (match_test "TARGET_AVX")
+       (const_string "*")
+       (const_string "1")))
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn "*<sse2>_movdqu<avxsizesuffix>"
-  [(set (match_operand:VI1 0 "nonimmediate_operand" "=x,m")
-	(unspec:VI1 [(match_operand:VI1 1 "nonimmediate_operand" "xm,x")]
-		    UNSPEC_MOVU))]
-  "TARGET_SSE2 && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+(define_insn "<sse2>_storedqu<avxsizesuffix>"
+  [(set (match_operand:VI1 0 "memory_operand" "=m")
+	(unspec:VI1 [(match_operand:VI1 1 "register_operand" "x")]
+		    UNSPEC_STOREU))]
+  "TARGET_SSE2"
   "%vmovdqu\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssemov")
   (set_attr "movu" "1")
@@ -573,15 +658,15 @@
   (set_attr "prefix" "maybe_vex")
   (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn "sse2_movntsi"
-  [(set (match_operand:SI 0 "memory_operand" "=m")
-	(unspec:SI [(match_operand:SI 1 "register_operand" "r")]
-		   UNSPEC_MOVNT))]
+(define_insn "sse2_movnti<mode>"
+  [(set (match_operand:SWI48 0 "memory_operand" "=m")
+	(unspec:SWI48 [(match_operand:SWI48 1 "register_operand" "r")]
+		      UNSPEC_MOVNT))]
   "TARGET_SSE2"
   "movnti\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssemov")
   (set_attr "prefix_data16" "0")
-   (set_attr "mode" "V2DF")])
+   (set_attr "mode" "<MODE>")])
 
 (define_insn "<sse>_movnt<mode>"
   [(set (match_operand:VF 0 "memory_operand" "=m")
@@ -614,8 +699,9 @@
 ;; Modes handled by storent patterns.
 (define_mode_iterator STORENT_MODE
-  [(SI "TARGET_SSE2") (SF "TARGET_SSE4A") (DF "TARGET_SSE4A")
-   (V2DI "TARGET_SSE2")
+  [(DI "TARGET_SSE2 && TARGET_64BIT") (SI "TARGET_SSE2")
+   (SF "TARGET_SSE4A") (DF "TARGET_SSE4A")
+   (V4DI "TARGET_AVX") (V2DI "TARGET_SSE2")
    (V8SF "TARGET_AVX") V4SF
    (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")])
@@ -1096,14 +1182,14 @@
 	       (parallel [(const_int 0)]))
 	     (vec_select:DF (match_dup 1) (parallel [(const_int 1)])))
 	   (plusminus:DF
-	     (vec_select:DF (match_dup 1) (parallel [(const_int 2)]))
-	     (vec_select:DF (match_dup 1) (parallel [(const_int 3)]))))
-	 (vec_concat:V2DF
-	   (plusminus:DF
 	     (vec_select:DF
 	       (match_operand:V4DF 2 "nonimmediate_operand" "xm")
 	       (parallel [(const_int 0)]))
-	     (vec_select:DF (match_dup 2) (parallel [(const_int 1)])))
+	     (vec_select:DF (match_dup 2) (parallel [(const_int 1)]))))
+	 (vec_concat:V2DF
+	   (plusminus:DF
+	     (vec_select:DF (match_dup 1) (parallel [(const_int 2)]))
+	     (vec_select:DF (match_dup 1) (parallel [(const_int 3)])))
 	   (plusminus:DF
 	     (vec_select:DF (match_dup 2) (parallel [(const_int 2)]))
 	     (vec_select:DF (match_dup 2) (parallel [(const_int 3)]))))))]
@@ -1628,63 +1714,51 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
-;; FMA4 floating point multiply/accumulate instructions.  This
-;; includes the scalar version of the instructions as well as the
-;; vector.
+;; FMA floating point multiply/accumulate instructions.  These include
+;; scalar versions of the instructions as well as vector versions.
 ;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
-;; In order to match (*a * *b) + *c, particularly when vectorizing, allow
-;; combine to generate a multiply/add with two memory references.  We then
-;; split this insn, into loading up the destination register with one of the
-;; memory operations.  If we don't manage to split the insn, reload will
-;; generate the appropriate moves.  The reason this is needed, is that combine
-;; has already folded one of the memory references into both the multiply and
-;; add insns, and it can't generate a new pseudo.  I.e.:
-;;	(set (reg1) (mem (addr1)))
-;;	(set (reg2) (mult (reg1) (mem (addr2))))
-;;	(set (reg3) (plus (reg2) (mem (addr3))))
-;;
-;; ??? This is historic, pre-dating the gimple fma transformation.
-;; We could now properly represent that only one memory operand is
-;; allowed and not be penalized during optimization.
-
-;; Intrinsic FMA operations.
+;; The standard names for scalar FMA are only available with SSE math enabled.
+(define_mode_iterator FMAMODEM [(SF "TARGET_SSE_MATH")
+				(DF "TARGET_SSE_MATH")
+				V4SF V2DF V8SF V4DF])
 
-;; The standard names for fma is only available with SSE math enabled.
(define_expand "fma4" - [(set (match_operand:FMAMODE 0 "register_operand") - (fma:FMAMODE - (match_operand:FMAMODE 1 "nonimmediate_operand") - (match_operand:FMAMODE 2 "nonimmediate_operand") - (match_operand:FMAMODE 3 "nonimmediate_operand")))] - "(TARGET_FMA || TARGET_FMA4) && TARGET_SSE_MATH") + [(set (match_operand:FMAMODEM 0 "register_operand") + (fma:FMAMODEM + (match_operand:FMAMODEM 1 "nonimmediate_operand") + (match_operand:FMAMODEM 2 "nonimmediate_operand") + (match_operand:FMAMODEM 3 "nonimmediate_operand")))] + "TARGET_FMA || TARGET_FMA4") (define_expand "fms4" - [(set (match_operand:FMAMODE 0 "register_operand") - (fma:FMAMODE - (match_operand:FMAMODE 1 "nonimmediate_operand") - (match_operand:FMAMODE 2 "nonimmediate_operand") - (neg:FMAMODE (match_operand:FMAMODE 3 "nonimmediate_operand"))))] - "(TARGET_FMA || TARGET_FMA4) && TARGET_SSE_MATH") + [(set (match_operand:FMAMODEM 0 "register_operand") + (fma:FMAMODEM + (match_operand:FMAMODEM 1 "nonimmediate_operand") + (match_operand:FMAMODEM 2 "nonimmediate_operand") + (neg:FMAMODEM (match_operand:FMAMODEM 3 "nonimmediate_operand"))))] + "TARGET_FMA || TARGET_FMA4") (define_expand "fnma4" - [(set (match_operand:FMAMODE 0 "register_operand") - (fma:FMAMODE - (neg:FMAMODE (match_operand:FMAMODE 1 "nonimmediate_operand")) - (match_operand:FMAMODE 2 "nonimmediate_operand") - (match_operand:FMAMODE 3 "nonimmediate_operand")))] - "(TARGET_FMA || TARGET_FMA4) && TARGET_SSE_MATH") + [(set (match_operand:FMAMODEM 0 "register_operand") + (fma:FMAMODEM + (neg:FMAMODEM (match_operand:FMAMODEM 1 "nonimmediate_operand")) + (match_operand:FMAMODEM 2 "nonimmediate_operand") + (match_operand:FMAMODEM 3 "nonimmediate_operand")))] + "TARGET_FMA || TARGET_FMA4") (define_expand "fnms4" - [(set (match_operand:FMAMODE 0 "register_operand") - (fma:FMAMODE - (neg:FMAMODE (match_operand:FMAMODE 1 "nonimmediate_operand")) - (match_operand:FMAMODE 2 "nonimmediate_operand") - (neg:FMAMODE (match_operand:FMAMODE 3 "nonimmediate_operand"))))] - "(TARGET_FMA || TARGET_FMA4) && TARGET_SSE_MATH") + [(set (match_operand:FMAMODEM 0 "register_operand") + (fma:FMAMODEM + (neg:FMAMODEM (match_operand:FMAMODEM 1 "nonimmediate_operand")) + (match_operand:FMAMODEM 2 "nonimmediate_operand") + (neg:FMAMODEM (match_operand:FMAMODEM 3 "nonimmediate_operand"))))] + "TARGET_FMA || TARGET_FMA4") + +;; The builtins for intrinsics are not constrained by SSE math enabled. +(define_mode_iterator FMAMODE [SF DF V4SF V2DF V8SF V4DF]) -;; The builtin for fma4intrin.h is not constrained by SSE math enabled. 
(define_expand "fma4i_fmadd_" [(set (match_operand:FMAMODE 0 "register_operand") (fma:FMAMODE @@ -1693,70 +1767,137 @@ (match_operand:FMAMODE 3 "nonimmediate_operand")))] "TARGET_FMA || TARGET_FMA4") -(define_insn "*fma4i_fmadd_" - [(set (match_operand:FMAMODE 0 "register_operand" "=x,x") +(define_insn "*fma_fmadd_" + [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x,x,x") (fma:FMAMODE - (match_operand:FMAMODE 1 "nonimmediate_operand" "%x,x") - (match_operand:FMAMODE 2 "nonimmediate_operand" " x,m") - (match_operand:FMAMODE 3 "nonimmediate_operand" "xm,x")))] - "TARGET_FMA4" - "vfmadd\t{%3, %2, %1, %0|%0, %1, %2, %3}" - [(set_attr "type" "ssemuladd") + (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0,x, x,x") + (match_operand:FMAMODE 2 "nonimmediate_operand" "xm, x,xm,x,m") + (match_operand:FMAMODE 3 "nonimmediate_operand" " x,xm,0,xm,x")))] + "TARGET_FMA || TARGET_FMA4" + "@ + vfmadd132\t{%2, %3, %0|%0, %3, %2} + vfmadd213\t{%3, %2, %0|%0, %2, %3} + vfmadd231\t{%2, %1, %0|%0, %1, %2} + vfmadd\t{%3, %2, %1, %0|%0, %1, %2, %3} + vfmadd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "isa" "fma,fma,fma,fma4,fma4") + (set_attr "type" "ssemuladd") (set_attr "mode" "")]) -(define_insn "*fma4i_fmsub_" - [(set (match_operand:FMAMODE 0 "register_operand" "=x,x") +(define_insn "*fma_fmsub_" + [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x,x,x") (fma:FMAMODE - (match_operand:FMAMODE 1 "nonimmediate_operand" "%x,x") - (match_operand:FMAMODE 2 "nonimmediate_operand" " x,m") + (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0,x, x,x") + (match_operand:FMAMODE 2 "nonimmediate_operand" "xm, x,xm,x,m") (neg:FMAMODE - (match_operand:FMAMODE 3 "nonimmediate_operand" "xm,x"))))] - "TARGET_FMA4" - "vfmsub\t{%3, %2, %1, %0|%0, %1, %2, %3}" - [(set_attr "type" "ssemuladd") + (match_operand:FMAMODE 3 "nonimmediate_operand" " x,xm,0,xm,x"))))] + "TARGET_FMA || TARGET_FMA4" + "@ + vfmsub132\t{%2, %3, %0|%0, %3, %2} + vfmsub213\t{%3, %2, %0|%0, %2, %3} + vfmsub231\t{%2, %1, %0|%0, %1, %2} + vfmsub\t{%3, %2, %1, %0|%0, %1, %2, %3} + vfmsub\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "isa" "fma,fma,fma,fma4,fma4") + (set_attr "type" "ssemuladd") (set_attr "mode" "")]) -(define_insn "*fma4i_fnmadd_" - [(set (match_operand:FMAMODE 0 "register_operand" "=x,x") +(define_insn "*fma_fnmadd_" + [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x,x,x") (fma:FMAMODE (neg:FMAMODE - (match_operand:FMAMODE 1 "nonimmediate_operand" "%x,x")) - (match_operand:FMAMODE 2 "nonimmediate_operand" " x,m") - (match_operand:FMAMODE 3 "nonimmediate_operand" "xm,x")))] - "TARGET_FMA4" - "vfnmadd\t{%3, %2, %1, %0|%0, %1, %2, %3}" - [(set_attr "type" "ssemuladd") + (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0,x, x,x")) + (match_operand:FMAMODE 2 "nonimmediate_operand" "xm, x,xm,x,m") + (match_operand:FMAMODE 3 "nonimmediate_operand" " x,xm,0,xm,x")))] + "TARGET_FMA || TARGET_FMA4" + "@ + vfnmadd132\t{%2, %3, %0|%0, %3, %2} + vfnmadd213\t{%3, %2, %0|%0, %2, %3} + vfnmadd231\t{%2, %1, %0|%0, %1, %2} + vfnmadd\t{%3, %2, %1, %0|%0, %1, %2, %3} + vfnmadd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "isa" "fma,fma,fma,fma4,fma4") + (set_attr "type" "ssemuladd") (set_attr "mode" "")]) -(define_insn "*fma4i_fnmsub_" - [(set (match_operand:FMAMODE 0 "register_operand" "=x,x") +(define_insn "*fma_fnmsub_" + [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x,x,x") (fma:FMAMODE (neg:FMAMODE - (match_operand:FMAMODE 1 "nonimmediate_operand" "%x,x")) - (match_operand:FMAMODE 2 "nonimmediate_operand" " 
x,m") + (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0,x, x,x")) + (match_operand:FMAMODE 2 "nonimmediate_operand" "xm, x,xm,x,m") (neg:FMAMODE - (match_operand:FMAMODE 3 "nonimmediate_operand" "xm,x"))))] - "TARGET_FMA4" - "vfnmsub\t{%3, %2, %1, %0|%0, %1, %2, %3}" - [(set_attr "type" "ssemuladd") + (match_operand:FMAMODE 3 "nonimmediate_operand" " x,xm,0,xm,x"))))] + "TARGET_FMA || TARGET_FMA4" + "@ + vfnmsub132\t{%2, %3, %0|%0, %3, %2} + vfnmsub213\t{%3, %2, %0|%0, %2, %3} + vfnmsub231\t{%2, %1, %0|%0, %1, %2} + vfnmsub\t{%3, %2, %1, %0|%0, %1, %2, %3} + vfnmsub\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "isa" "fma,fma,fma,fma4,fma4") + (set_attr "type" "ssemuladd") (set_attr "mode" "")]) -;; Scalar versions of the above. Unlike ADDSS et al, these write the -;; entire destination register, with the high-order elements zeroed. +;; FMA parallel floating point multiply addsub and subadd operations. -(define_expand "fma4i_vmfmadd_" - [(set (match_operand:VF_128 0 "register_operand") - (vec_merge:VF_128 - (fma:VF_128 - (match_operand:VF_128 1 "nonimmediate_operand") - (match_operand:VF_128 2 "nonimmediate_operand") - (match_operand:VF_128 3 "nonimmediate_operand")) - (match_dup 4) - (const_int 1)))] - "TARGET_FMA4" -{ - operands[4] = CONST0_RTX (mode); -}) +;; It would be possible to represent these without the UNSPEC as +;; +;; (vec_merge +;; (fma op1 op2 op3) +;; (fma op1 op2 (neg op3)) +;; (merge-const)) +;; +;; But this doesn't seem useful in practice. + +(define_expand "fmaddsub_" + [(set (match_operand:VF 0 "register_operand") + (unspec:VF + [(match_operand:VF 1 "nonimmediate_operand") + (match_operand:VF 2 "nonimmediate_operand") + (match_operand:VF 3 "nonimmediate_operand")] + UNSPEC_FMADDSUB))] + "TARGET_FMA || TARGET_FMA4") + +(define_insn "*fma_fmaddsub_" + [(set (match_operand:VF 0 "register_operand" "=x,x,x,x,x") + (unspec:VF + [(match_operand:VF 1 "nonimmediate_operand" "%0, 0,x, x,x") + (match_operand:VF 2 "nonimmediate_operand" "xm, x,xm,x,m") + (match_operand:VF 3 "nonimmediate_operand" " x,xm,0,xm,x")] + UNSPEC_FMADDSUB))] + "TARGET_FMA || TARGET_FMA4" + "@ + vfmaddsub132\t{%2, %3, %0|%0, %3, %2} + vfmaddsub213\t{%3, %2, %0|%0, %2, %3} + vfmaddsub231\t{%2, %1, %0|%0, %1, %2} + vfmaddsub\t{%3, %2, %1, %0|%0, %1, %2, %3} + vfmaddsub\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "isa" "fma,fma,fma,fma4,fma4") + (set_attr "type" "ssemuladd") + (set_attr "mode" "")]) + +(define_insn "*fma_fmsubadd_" + [(set (match_operand:VF 0 "register_operand" "=x,x,x,x,x") + (unspec:VF + [(match_operand:VF 1 "nonimmediate_operand" "%0, 0,x, x,x") + (match_operand:VF 2 "nonimmediate_operand" "xm, x,xm,x,m") + (neg:VF + (match_operand:VF 3 "nonimmediate_operand" " x,xm,0,xm,x"))] + UNSPEC_FMADDSUB))] + "TARGET_FMA || TARGET_FMA4" + "@ + vfmsubadd132\t{%2, %3, %0|%0, %3, %2} + vfmsubadd213\t{%3, %2, %0|%0, %2, %3} + vfmsubadd231\t{%2, %1, %0|%0, %1, %2} + vfmsubadd\t{%3, %2, %1, %0|%0, %1, %2, %3} + vfmsubadd\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "isa" "fma,fma,fma,fma4,fma4") + (set_attr "type" "ssemuladd") + (set_attr "mode" "")]) + +;; FMA3 floating point scalar intrinsics. These merge result with +;; high-order elements from the destination register. 
(define_expand "fmai_vmfmadd_" [(set (match_operand:VF_128 0 "register_operand") @@ -1765,82 +1906,95 @@ (match_operand:VF_128 1 "nonimmediate_operand") (match_operand:VF_128 2 "nonimmediate_operand") (match_operand:VF_128 3 "nonimmediate_operand")) - (match_dup 0) + (match_dup 1) (const_int 1)))] "TARGET_FMA") (define_insn "*fmai_fmadd_" - [(set (match_operand:VF_128 0 "register_operand" "=x,x,x") + [(set (match_operand:VF_128 0 "register_operand" "=x,x") (vec_merge:VF_128 (fma:VF_128 - (match_operand:VF_128 1 "nonimmediate_operand" "%0, 0,x") - (match_operand:VF_128 2 "nonimmediate_operand" "xm, x,xm") - (match_operand:VF_128 3 "nonimmediate_operand" " x,xm,0")) - (match_dup 0) + (match_operand:VF_128 1 "nonimmediate_operand" " 0, 0") + (match_operand:VF_128 2 "nonimmediate_operand" "xm, x") + (match_operand:VF_128 3 "nonimmediate_operand" " x,xm")) + (match_dup 1) (const_int 1)))] "TARGET_FMA" "@ vfmadd132\t{%2, %3, %0|%0, %3, %2} - vfmadd213\t{%3, %2, %0|%0, %2, %3} - vfmadd231\t{%2, %1, %0|%0, %1, %2}" + vfmadd213\t{%3, %2, %0|%0, %2, %3}" [(set_attr "type" "ssemuladd") (set_attr "mode" "")]) (define_insn "*fmai_fmsub_" - [(set (match_operand:VF_128 0 "register_operand" "=x,x,x") + [(set (match_operand:VF_128 0 "register_operand" "=x,x") (vec_merge:VF_128 (fma:VF_128 - (match_operand:VF_128 1 "nonimmediate_operand" "%0, 0,x") - (match_operand:VF_128 2 "nonimmediate_operand" "xm, x,xm") + (match_operand:VF_128 1 "nonimmediate_operand" " 0, 0") + (match_operand:VF_128 2 "nonimmediate_operand" "xm, x") (neg:VF_128 - (match_operand:VF_128 3 "nonimmediate_operand" " x,xm,0"))) - (match_dup 0) + (match_operand:VF_128 3 "nonimmediate_operand" " x,xm"))) + (match_dup 1) (const_int 1)))] "TARGET_FMA" "@ vfmsub132\t{%2, %3, %0|%0, %3, %2} - vfmsub213\t{%3, %2, %0|%0, %2, %3} - vfmsub231\t{%2, %1, %0|%0, %1, %2}" + vfmsub213\t{%3, %2, %0|%0, %2, %3}" [(set_attr "type" "ssemuladd") (set_attr "mode" "")]) (define_insn "*fmai_fnmadd_" - [(set (match_operand:VF_128 0 "register_operand" "=x,x,x") + [(set (match_operand:VF_128 0 "register_operand" "=x,x") (vec_merge:VF_128 (fma:VF_128 (neg:VF_128 - (match_operand:VF_128 1 "nonimmediate_operand" "%0, 0,x")) - (match_operand:VF_128 2 "nonimmediate_operand" "xm, x,xm") - (match_operand:VF_128 3 "nonimmediate_operand" " x,xm,0")) - (match_dup 0) + (match_operand:VF_128 2 "nonimmediate_operand" "xm, x")) + (match_operand:VF_128 1 "nonimmediate_operand" " 0, 0") + (match_operand:VF_128 3 "nonimmediate_operand" " x,xm")) + (match_dup 1) (const_int 1)))] "TARGET_FMA" "@ vfnmadd132\t{%2, %3, %0|%0, %3, %2} - vfnmadd213\t{%3, %2, %0|%0, %2, %3} - vfnmadd231\t{%2, %1, %0|%0, %1, %2}" + vfnmadd213\t{%3, %2, %0|%0, %2, %3}" [(set_attr "type" "ssemuladd") (set_attr "mode" "")]) (define_insn "*fmai_fnmsub_" - [(set (match_operand:VF_128 0 "register_operand" "=x,x,x") + [(set (match_operand:VF_128 0 "register_operand" "=x,x") (vec_merge:VF_128 (fma:VF_128 (neg:VF_128 - (match_operand:VF_128 1 "nonimmediate_operand" "%0, 0,x")) - (match_operand:VF_128 2 "nonimmediate_operand" "xm, x,xm") + (match_operand:VF_128 2 "nonimmediate_operand" "xm, x")) + (match_operand:VF_128 1 "nonimmediate_operand" " 0, 0") (neg:VF_128 - (match_operand:VF_128 3 "nonimmediate_operand" " x,xm,0"))) - (match_dup 0) + (match_operand:VF_128 3 "nonimmediate_operand" " x,xm"))) + (match_dup 1) (const_int 1)))] "TARGET_FMA" "@ vfnmsub132\t{%2, %3, %0|%0, %3, %2} - vfnmsub213\t{%3, %2, %0|%0, %2, %3} - vfnmsub231\t{%2, %1, %0|%0, %1, %2}" + vfnmsub213\t{%3, %2, %0|%0, %2, %3}" [(set_attr "type" 
"ssemuladd") (set_attr "mode" "")]) +;; FMA4 floating point scalar intrinsics. These write the +;; entire destination register, with the high-order elements zeroed. + +(define_expand "fma4i_vmfmadd_" + [(set (match_operand:VF_128 0 "register_operand") + (vec_merge:VF_128 + (fma:VF_128 + (match_operand:VF_128 1 "nonimmediate_operand") + (match_operand:VF_128 2 "nonimmediate_operand") + (match_operand:VF_128 3 "nonimmediate_operand")) + (match_dup 4) + (const_int 1)))] + "TARGET_FMA4" +{ + operands[4] = CONST0_RTX (mode); +}) + (define_insn "*fma4i_vmfmadd_" [(set (match_operand:VF_128 0 "register_operand" "=x,x") (vec_merge:VF_128 @@ -1903,152 +2057,6 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; -;; FMA4 Parallel floating point multiply addsub and subadd operations. -;; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -;; It would be possible to represent these without the UNSPEC as -;; -;; (vec_merge -;; (fma op1 op2 op3) -;; (fma op1 op2 (neg op3)) -;; (merge-const)) -;; -;; But this doesn't seem useful in practice. - -(define_expand "fmaddsub_" - [(set (match_operand:VF 0 "register_operand") - (unspec:VF - [(match_operand:VF 1 "nonimmediate_operand") - (match_operand:VF 2 "nonimmediate_operand") - (match_operand:VF 3 "nonimmediate_operand")] - UNSPEC_FMADDSUB))] - "TARGET_FMA || TARGET_FMA4") - -(define_insn "*fma4_fmaddsub_" - [(set (match_operand:VF 0 "register_operand" "=x,x") - (unspec:VF - [(match_operand:VF 1 "nonimmediate_operand" "%x,x") - (match_operand:VF 2 "nonimmediate_operand" " x,m") - (match_operand:VF 3 "nonimmediate_operand" "xm,x")] - UNSPEC_FMADDSUB))] - "TARGET_FMA4" - "vfmaddsub\t{%3, %2, %1, %0|%0, %1, %2, %3}" - [(set_attr "type" "ssemuladd") - (set_attr "mode" "")]) - -(define_insn "*fma4_fmsubadd_" - [(set (match_operand:VF 0 "register_operand" "=x,x") - (unspec:VF - [(match_operand:VF 1 "nonimmediate_operand" "%x,x") - (match_operand:VF 2 "nonimmediate_operand" " x,m") - (neg:VF - (match_operand:VF 3 "nonimmediate_operand" "xm,x"))] - UNSPEC_FMADDSUB))] - "TARGET_FMA4" - "vfmsubadd\t{%3, %2, %1, %0|%0, %1, %2, %3}" - [(set_attr "type" "ssemuladd") - (set_attr "mode" "")]) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; -;; FMA3 floating point multiply/accumulate instructions. 
-;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-(define_insn "*fma_fmadd_<mode>"
-  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x")
-	(fma:FMAMODE
-	  (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0,x")
-	  (match_operand:FMAMODE 2 "nonimmediate_operand" "xm, x,xm")
-	  (match_operand:FMAMODE 3 "nonimmediate_operand" " x,xm,0")))]
-  "TARGET_FMA"
-  "@
-   vfmadd132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
-   vfmadd213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
-   vfmadd231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "ssemuladd")
-   (set_attr "mode" "<MODE>")])
-
-(define_insn "*fma_fmsub_<mode>"
-  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x")
-	(fma:FMAMODE
-	  (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0,x")
-	  (match_operand:FMAMODE 2 "nonimmediate_operand" "xm, x,xm")
-	  (neg:FMAMODE
-	    (match_operand:FMAMODE 3 "nonimmediate_operand" " x,xm,0"))))]
-  "TARGET_FMA"
-  "@
-   vfmsub132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
-   vfmsub213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
-   vfmsub231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "ssemuladd")
-   (set_attr "mode" "<MODE>")])
-
-(define_insn "*fma_fnmadd_<mode>"
-  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x")
-	(fma:FMAMODE
-	  (neg:FMAMODE
-	    (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0,x"))
-	  (match_operand:FMAMODE 2 "nonimmediate_operand" "xm, x,xm")
-	  (match_operand:FMAMODE 3 "nonimmediate_operand" " x,xm,0")))]
-  "TARGET_FMA"
-  "@
-   vfnmadd132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
-   vfnmadd213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
-   vfnmadd231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "ssemuladd")
-   (set_attr "mode" "<MODE>")])
-
-(define_insn "*fma_fnmsub_<mode>"
-  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x")
-	(fma:FMAMODE
-	  (neg:FMAMODE
-	    (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0,x"))
-	  (match_operand:FMAMODE 2 "nonimmediate_operand" "xm, x,xm")
-	  (neg:FMAMODE
-	    (match_operand:FMAMODE 3 "nonimmediate_operand" " x,xm,0"))))]
-  "TARGET_FMA"
-  "@
-   vfnmsub132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
-   vfnmsub213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
-   vfnmsub231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "ssemuladd")
-   (set_attr "mode" "<MODE>")])
-
-(define_insn "*fma_fmaddsub_<mode>"
-  [(set (match_operand:VF 0 "register_operand" "=x,x,x")
-	(unspec:VF
-	  [(match_operand:VF 1 "nonimmediate_operand" "%0, 0,x")
-	   (match_operand:VF 2 "nonimmediate_operand" "xm, x,xm")
-	   (match_operand:VF 3 "nonimmediate_operand" " x,xm,0")]
-	  UNSPEC_FMADDSUB))]
-  "TARGET_FMA"
-  "@
-   vfmaddsub132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
-   vfmaddsub213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
-   vfmaddsub231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "ssemuladd")
-   (set_attr "mode" "<MODE>")])
-
-(define_insn "*fma_fmsubadd_<mode>"
-  [(set (match_operand:VF 0 "register_operand" "=x,x,x")
-	(unspec:VF
-	  [(match_operand:VF 1 "nonimmediate_operand" "%0, 0,x")
-	   (match_operand:VF 2 "nonimmediate_operand" "xm, x,xm")
-	   (neg:VF
-	     (match_operand:VF 3 "nonimmediate_operand" " x,xm,0"))]
-	  UNSPEC_FMADDSUB))]
-  "TARGET_FMA"
-  "@
-   vfmsubadd132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
-   vfmsubadd213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
-   vfmsubadd231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "ssemuladd")
-   (set_attr "mode" "<MODE>")])
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;
 ;; Parallel single-precision floating point conversion operations
 ;;
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -3268,7 +3276,7 @@
 	  (vec_select:V4SF
 	    (vec_concat:V8SF
 	      (match_operand:V4SF 1 "nonimmediate_operand" " 0,x,0,x,0")
-	      (match_operand:V4SF 2 "nonimmediate_operand" " x,x,m,x,x"))
+	      (match_operand:V4SF 2 "nonimmediate_operand" " x,x,m,m,x"))
 	    (parallel [(const_int 0)
 		       (const_int 1)
 		       (const_int 4)
@@ -3685,7 +3693,7 @@
 (define_insn "sse_loadlps"
   [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,x,x,m")
 	(vec_concat:V4SF
-	  (match_operand:V2SF 2 "nonimmediate_operand" " 0,x,m,x,x")
+	  (match_operand:V2SF 2 "nonimmediate_operand" " 0,x,m,m,x")
 	  (vec_select:V2SF
 	    (match_operand:V4SF 1 "nonimmediate_operand" " x,x,0,x,0")
 	    (parallel [(const_int 2) (const_int 3)]))))]
@@ -3815,13 +3823,13 @@
 ;; see comment above inline_secondary_memory_needed function in i386.c
 (define_insn "vec_set<mode>_0"
   [(set (match_operand:VI4F_128 0 "nonimmediate_operand"
-	  "=x,x,x ,x,x,x,x ,x  ,m,m   ,m")
+	  "=x,x,x ,x,x,x,x ,x  ,m ,m   ,m")
 	(vec_merge:VI4F_128
 	  (vec_duplicate:VI4F_128
 	    (match_operand:<ssescalarmode> 2 "general_operand"
-	  " x,m,*r,m,x,x,*rm,*rm,x,fF,*r"))
+	  " x,m,*r,m,x,x,*rm,*rm,!x,!*re,!*fF"))
 	  (match_operand:VI4F_128 1 "vector_move_operand"
-	  " C,C,C ,C,0,x,0 ,x ,0,0 ,0")
+	  " C,C,C ,C,0,x,0 ,x ,0 ,0   ,0")
 	  (const_int 1)))]
   "TARGET_SSE"
   "@
@@ -3841,9 +3849,9 @@
      (cond [(eq_attr "alternative" "0,6,7")
 	      (const_string "sselog")
 	    (eq_attr "alternative" "9")
-	      (const_string "fmov")
-	    (eq_attr "alternative" "10")
 	      (const_string "imov")
+	    (eq_attr "alternative" "10")
+	      (const_string "fmov")
 	   ]
 	   (const_string "ssemov")))
    (set_attr "prefix_extra" "*,*,*,*,*,*,1,1,*,*,*")
@@ -4298,7 +4306,7 @@
    (set_attr "type" "sselog,sselog,sselog,ssemov,ssemov,ssemov")
    (set_attr "prefix_data16" "*,*,*,1,*,1")
    (set_attr "prefix" "orig,vex,maybe_vex,orig,vex,maybe_vex")
-   (set_attr "mode" "V2DF,V2DF,V2DF,V1DF,V1DF,V1DF")])
+   (set_attr "mode" "V2DF,V2DF,DF,V1DF,V1DF,V1DF")])
 
 ;; Recall that the 256-bit unpck insns only shuffle within their lanes.
 (define_expand "avx_movddup256"
@@ -4399,7 +4407,7 @@
    (set_attr "type" "sselog,sselog,sselog,ssemov,ssemov,ssemov")
    (set_attr "prefix_data16" "*,*,*,1,*,1")
    (set_attr "prefix" "orig,vex,maybe_vex,orig,vex,maybe_vex")
-   (set_attr "mode" "V2DF,V2DF,V2DF,V1DF,V1DF,V1DF")])
+   (set_attr "mode" "V2DF,V2DF,DF,V1DF,V1DF,V1DF")])
 
 (define_split
   [(set (match_operand:V2DF 0 "memory_operand" "")
@@ -4810,7 +4818,7 @@
 	      (vec_select:DF (match_dup 0) (parallel [(const_int 1)]))))]
   "TARGET_SSE2 && reload_completed"
   [(set (match_dup 0) (match_dup 1))]
-  "operands[0] = adjust_address (operands[0], DFmode, 8);")
+  "operands[0] = adjust_address (operands[0], DFmode, 0);")
 
 (define_insn "sse2_movsd"
   [(set (match_operand:V2DF 0 "nonimmediate_operand"   "=x,x,x,x,m,x,x,x,o")
@@ -4856,7 +4864,7 @@
   [(set_attr "isa" "noavx,sse3")
    (set_attr "type" "sselog1")
    (set_attr "prefix" "orig,maybe_vex")
-   (set_attr "mode" "V2DF")])
+   (set_attr "mode" "V2DF,DF")])
 
 (define_insn "*vec_concatv2df"
   [(set (match_operand:V2DF 0 "register_operand"     "=x,x,x,x,x,x,x,x")
@@ -4986,7 +4994,24 @@
 				     gen_lowpart (mulmode, t[3]))));
 
   /* Extract the even bytes and merge them back together.  */
-  ix86_expand_vec_extract_even_odd (operands[0], t[5], t[4], 0);
+  if (<MODE>mode == V16QImode)
+    ix86_expand_vec_extract_even_odd (operands[0], t[5], t[4], 0);
+  else
+    {
+      /* Since avx2_interleave_{low,high}v32qi used above aren't cross-lane,
+	 this can't be normal even extraction, but one where additionally
+	 the second and third quarter are swapped.  That is even one insn
+	 shorter than even extraction.  */
+      rtvec v = rtvec_alloc (32);
+      for (i = 0; i < 32; ++i)
+	RTVEC_ELT (v, i)
+	  = GEN_INT (i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0));
+      t[0] = operands[0];
+      t[1] = t[5];
+      t[2] = t[4];
+      t[3] = gen_rtx_CONST_VECTOR (<MODE>mode, v);
+      ix86_expand_vec_perm_const (t);
+    }
 
   set_unique_reg_note (get_last_insn (), REG_EQUAL,
 		       gen_rtx_MULT (<MODE>mode, operands[1], operands[2]));
@@ -5646,11 +5671,15 @@
 
   if (TARGET_XOP)
     {
+      rtx t3 = gen_reg_rtx (V2DImode);
+
       emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2),
 				    GEN_INT (1), GEN_INT (3)));
       emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2),
 				    GEN_INT (1), GEN_INT (3)));
-      emit_insn (gen_xop_mulv2div2di3_high (operands[0], t1, t2));
+      emit_move_insn (t3, CONST0_RTX (V2DImode));
+
+      emit_insn (gen_xop_pmacsdqh (operands[0], t1, t2, t3));
       DONE;
     }
@@ -5675,11 +5704,15 @@
 
   if (TARGET_XOP)
     {
+      rtx t3 = gen_reg_rtx (V2DImode);
+
       emit_insn (gen_sse2_pshufd_1 (t1, op1, GEN_INT (0), GEN_INT (2),
 				    GEN_INT (1), GEN_INT (3)));
       emit_insn (gen_sse2_pshufd_1 (t2, op2, GEN_INT (0), GEN_INT (2),
 				    GEN_INT (1), GEN_INT (3)));
-      emit_insn (gen_xop_mulv2div2di3_low (operands[0], t1, t2));
+      emit_move_insn (t3, CONST0_RTX (V2DImode));
+
+      emit_insn (gen_xop_pmacsdql (operands[0], t1, t2, t3));
       DONE;
     }
@@ -6243,9 +6276,9 @@
 	(if_then_else:V_256
 	  (match_operator 3 ""
 	    [(match_operand:VI_256 4 "nonimmediate_operand" "")
-	     (match_operand:VI_256 5 "nonimmediate_operand" "")])
-	  (match_operand:V_256 1 "general_operand" "")
-	  (match_operand:V_256 2 "general_operand" "")))]
+	     (match_operand:VI_256 5 "general_operand" "")])
+	  (match_operand:V_256 1 "" "")
+	  (match_operand:V_256 2 "" "")))]
   "TARGET_AVX2
    && (GET_MODE_NUNITS (<V_256:MODE>mode)
       == GET_MODE_NUNITS (<VI_256:MODE>mode))"
@@ -6260,9 +6293,9 @@
 	(if_then_else:V_128
 	  (match_operator 3 ""
 	    [(match_operand:VI124_128 4 "nonimmediate_operand" "")
-	     (match_operand:VI124_128 5 "nonimmediate_operand" "")])
-	  (match_operand:V_128 1 "general_operand" "")
-	  (match_operand:V_128 2 "general_operand" "")))]
+	     (match_operand:VI124_128 5 "general_operand" "")])
+	  (match_operand:V_128 1 "" "")
+	  (match_operand:V_128 2 "" "")))]
  "TARGET_SSE2
   && (GET_MODE_NUNITS (<V_128:MODE>mode)
      == GET_MODE_NUNITS (<VI124_128:MODE>mode))"
@@ -6277,9 +6310,9 @@
 	(if_then_else:VI8F_128
 	  (match_operator 3 ""
 	    [(match_operand:V2DI 4 "nonimmediate_operand" "")
-	     (match_operand:V2DI 5 "nonimmediate_operand" "")])
-	  (match_operand:VI8F_128 1 "general_operand" "")
-	  (match_operand:VI8F_128 2 "general_operand" "")))]
+	     (match_operand:V2DI 5 "general_operand" "")])
+	  (match_operand:VI8F_128 1 "" "")
+	  (match_operand:VI8F_128 2 "" "")))]
   "TARGET_SSE4_2"
 {
   bool ok = ix86_expand_int_vcond (operands);
@@ -7536,16 +7569,6 @@
    (set_attr "prefix" "maybe_vex,orig,vex,maybe_vex,orig,orig")
    (set_attr "mode" "V2SF,TI,TI,TI,V4SF,V2SF")])
 
-(define_expand "vec_dupv4si"
-  [(set (match_operand:V4SI 0 "register_operand" "")
-	(vec_duplicate:V4SI
-	  (match_operand:SI 1 "nonimmediate_operand" "")))]
-  "TARGET_SSE"
-{
-  if (!TARGET_AVX)
-    operands[1] = force_reg (V4SImode, operands[1]);
-})
-
 (define_insn "*vec_dupv4si"
   [(set (match_operand:V4SI 0 "register_operand" "=x,x,x")
 	(vec_duplicate:V4SI
@@ -7562,16 +7585,6 @@
    (set_attr "prefix" "maybe_vex,vex,orig")
    (set_attr "mode" "TI,V4SF,V4SF")])
 
-(define_expand "vec_dupv2di"
-  [(set (match_operand:V2DI 0 "register_operand" "")
-	(vec_duplicate:V2DI
-	  (match_operand:DI 1 "nonimmediate_operand" "")))]
-  "TARGET_SSE"
-{
-  if (!TARGET_AVX)
-    operands[1] = force_reg (V2DImode, operands[1]);
-})
-
 (define_insn "*vec_dupv2di"
   [(set (match_operand:V2DI 0 "register_operand" "=x,x,x,x")
 	(vec_duplicate:V2DI
@@ -8040,25 +8053,6 @@
    (set_attr "prefix" "maybe_vex")
    (set_attr "memory" "store")])
 
-(define_expand "sse_sfence"
-  [(set (match_dup 0)
-	(unspec:BLK [(match_dup 0)] UNSPEC_SFENCE))]
-  "TARGET_SSE || TARGET_3DNOW_A"
-{
-  operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
-  MEM_VOLATILE_P (operands[0]) = 1;
-})
-
-(define_insn "*sse_sfence"
-  [(set (match_operand:BLK 0 "" "")
-	(unspec:BLK [(match_dup 0)] UNSPEC_SFENCE))]
-  "TARGET_SSE || TARGET_3DNOW_A"
-  "sfence"
-  [(set_attr "type" "sse")
-   (set_attr "length_address" "0")
-   (set_attr "atom_sse_attr" "fence")
-   (set_attr "memory" "unknown")])
-
 (define_insn "sse2_clflush"
   [(unspec_volatile [(match_operand 0 "address_operand" "p")]
 		    UNSPECV_CLFLUSH)]
@@ -8068,43 +8062,6 @@
    (set_attr "atom_sse_attr" "fence")
   (set_attr "memory" "unknown")])
 
-(define_expand "sse2_mfence"
-  [(set (match_dup 0)
-	(unspec:BLK [(match_dup 0)] UNSPEC_MFENCE))]
-  "TARGET_SSE2"
-{
-  operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
-  MEM_VOLATILE_P (operands[0]) = 1;
-})
-
-(define_insn "*sse2_mfence"
-  [(set (match_operand:BLK 0 "" "")
-	(unspec:BLK [(match_dup 0)] UNSPEC_MFENCE))]
-  "TARGET_64BIT || TARGET_SSE2"
-  "mfence"
-  [(set_attr "type" "sse")
-   (set_attr "length_address" "0")
-   (set_attr "atom_sse_attr" "fence")
-   (set_attr "memory" "unknown")])
-
-(define_expand "sse2_lfence"
-  [(set (match_dup 0)
-	(unspec:BLK [(match_dup 0)] UNSPEC_LFENCE))]
-  "TARGET_SSE2"
-{
-  operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
-  MEM_VOLATILE_P (operands[0]) = 1;
-})
-
-(define_insn "*sse2_lfence"
-  [(set (match_operand:BLK 0 "" "")
-	(unspec:BLK [(match_dup 0)] UNSPEC_LFENCE))]
-  "TARGET_SSE2"
-  "lfence"
-  [(set_attr "type" "sse")
-   (set_attr "length_address" "0")
-   (set_attr "atom_sse_attr" "lfence")
-   (set_attr "memory" "unknown")])
 
 (define_insn "sse3_mwait"
   [(unspec_volatile [(match_operand:SI 0 "register_operand" "a")
@@ -9611,7 +9568,7 @@
    (set_attr "mode" "TI")])
 
 (define_insn "<sse4_1_avx2>_pblendvb"
-  [(set (match_operand:VI1_AVX2 0 "reg_not_xmm0_operand" "=x,x")
+  [(set (match_operand:VI1_AVX2 0 "reg_not_xmm0_operand_maybe_avx" "=x,x")
 	(unspec:VI1_AVX2
 	  [(match_operand:VI1_AVX2 1 "reg_not_xmm0_operand_maybe_avx"  "0,x")
 	   (match_operand:VI1_AVX2 2 "nonimm_not_xmm0_operand_maybe_avx" "xm,xm")
@@ -10422,12 +10379,12 @@
 	     (sign_extend:V2DI
 	       (vec_select:V2SI
 		 (match_operand:V4SI 1 "nonimmediate_operand" "%x")
-		 (parallel [(const_int 1)
-			    (const_int 3)])))
-	     (vec_select:V2SI
+		 (parallel [(const_int 0)
+			    (const_int 2)])))
+	     (vec_select:V2SI
 	       (match_operand:V4SI 2 "nonimmediate_operand" "xm")
-		 (parallel [(const_int 1)
-			    (const_int 3)])))
+		 (parallel [(const_int 0)
+			    (const_int 2)])))
 	   (match_operand:V2DI 3 "nonimmediate_operand" "x")))]
   "TARGET_XOP"
  "vpmacssdql\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
  (set_attr "mode" "TI")])
@@ -10441,13 +10398,13 @@
 	     (sign_extend:V2DI
 	       (vec_select:V2SI
 		 (match_operand:V4SI 1 "nonimmediate_operand" "%x")
-		 (parallel [(const_int 0)
-			    (const_int 2)])))
+		 (parallel [(const_int 1)
+			    (const_int 3)])))
 	     (sign_extend:V2DI
 	       (vec_select:V2SI
 		 (match_operand:V4SI 2 "nonimmediate_operand" "xm")
-		 (parallel [(const_int 0)
-			    (const_int 2)]))))
+		 (parallel [(const_int 1)
+			    (const_int 3)]))))
 	   (match_operand:V2DI 3 "nonimmediate_operand" "x")))]
   "TARGET_XOP"
  "vpmacssdqh\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
  (set_attr "mode" "TI")])
@@ -10461,61 +10418,19 @@
 	     (sign_extend:V2DI
 	       (vec_select:V2SI
 		 (match_operand:V4SI 1 "nonimmediate_operand" "%x")
-		 (parallel [(const_int 1)
-			    (const_int 3)])))
+		 (parallel [(const_int 0)
+			    (const_int 2)])))
 	     (sign_extend:V2DI
 	       (vec_select:V2SI
 		 (match_operand:V4SI 2 "nonimmediate_operand" "xm")
-		 (parallel [(const_int 1)
-			    (const_int 3)]))))
+		 (parallel [(const_int 0)
+			    (const_int 2)]))))
 	   (match_operand:V2DI 3 "nonimmediate_operand" "x")))]
   "TARGET_XOP"
   "vpmacsdql\t{%3, %2, %1, %0|%0, %1, %2, %3}"
   [(set_attr "type" "ssemuladd")
    (set_attr "mode" "TI")])
 
-;; We don't have a straight 32-bit parallel multiply and extend on XOP, so
-;; fake it with a multiply/add.  In general, we expect the define_split to
-;; occur before register allocation, so we have to handle the corner case where
-;; the target is the same as operands 1/2
-(define_insn_and_split "xop_mulv2div2di3_low"
-  [(set (match_operand:V2DI 0 "register_operand" "=&x")
-	(mult:V2DI
-	  (sign_extend:V2DI
-	    (vec_select:V2SI
-	      (match_operand:V4SI 1 "register_operand" "%x")
-	      (parallel [(const_int 1)
-			 (const_int 3)])))
-	  (sign_extend:V2DI
-	    (vec_select:V2SI
-	      (match_operand:V4SI 2 "nonimmediate_operand" "xm")
-	      (parallel [(const_int 1)
-			 (const_int 3)])))))]
-  "TARGET_XOP"
-  "#"
-  "&& reload_completed"
-  [(set (match_dup 0)
-	(match_dup 3))
-   (set (match_dup 0)
-	(plus:V2DI
-	  (mult:V2DI
-	    (sign_extend:V2DI
-	      (vec_select:V2SI
-		(match_dup 1)
-		(parallel [(const_int 1)
-			   (const_int 3)])))
-	    (sign_extend:V2DI
-	      (vec_select:V2SI
-		(match_dup 2)
-		(parallel [(const_int 1)
-			   (const_int 3)]))))
-	  (match_dup 0)))]
-{
-  operands[3] = CONST0_RTX (V2DImode);
-}
-  [(set_attr "type" "ssemul")
-   (set_attr "mode" "TI")])
-
 (define_insn "xop_pmacsdqh"
   [(set (match_operand:V2DI 0 "register_operand" "=x")
 	(plus:V2DI
@@ -10523,61 +10438,19 @@
 	     (sign_extend:V2DI
 	       (vec_select:V2SI
 		 (match_operand:V4SI 1 "nonimmediate_operand" "%x")
-		 (parallel [(const_int 0)
-			    (const_int 2)])))
+		 (parallel [(const_int 1)
+			    (const_int 3)])))
 	     (sign_extend:V2DI
 	       (vec_select:V2SI
 		 (match_operand:V4SI 2 "nonimmediate_operand" "xm")
-		 (parallel [(const_int 0)
-			    (const_int 2)]))))
+		 (parallel [(const_int 1)
+			    (const_int 3)]))))
 	   (match_operand:V2DI 3 "nonimmediate_operand" "x")))]
   "TARGET_XOP"
   "vpmacsdqh\t{%3, %2, %1, %0|%0, %1, %2, %3}"
  [(set_attr "type" "ssemuladd")
  (set_attr "mode" "TI")])
 
-;; We don't have a straight 32-bit parallel multiply and extend on XOP, so
-;; fake it with a multiply/add.  In general, we expect the define_split to
-;; occur before register allocation, so we have to handle the corner case where
-;; the target is the same as either operands[1] or operands[2]
-(define_insn_and_split "xop_mulv2div2di3_high"
-  [(set (match_operand:V2DI 0 "register_operand" "=&x")
-	(mult:V2DI
-	  (sign_extend:V2DI
-	    (vec_select:V2SI
-	      (match_operand:V4SI 1 "register_operand" "%x")
-	      (parallel [(const_int 0)
-			 (const_int 2)])))
-	  (sign_extend:V2DI
-	    (vec_select:V2SI
-	      (match_operand:V4SI 2 "nonimmediate_operand" "xm")
-	      (parallel [(const_int 0)
-			 (const_int 2)])))))]
-  "TARGET_XOP"
-  "#"
-  "&& reload_completed"
-  [(set (match_dup 0)
-	(match_dup 3))
-   (set (match_dup 0)
-	(plus:V2DI
-	  (mult:V2DI
-	    (sign_extend:V2DI
-	      (vec_select:V2SI
-		(match_dup 1)
-		(parallel [(const_int 0)
-			   (const_int 2)])))
-	    (sign_extend:V2DI
-	      (vec_select:V2SI
-		(match_dup 2)
-		(parallel [(const_int 0)
-			   (const_int 2)]))))
-	  (match_dup 0)))]
-{
-  operands[3] = CONST0_RTX (V2DImode);
-}
-  [(set_attr "type" "ssemul")
-   (set_attr "mode" "TI")])
-
 ;; XOP parallel integer multiply/add instructions for the intrinisics
 (define_insn "xop_pmacsswd"
   [(set (match_operand:V4SI 0 "register_operand" "=x")
@@ -10794,45 +10667,45 @@
 	     (vec_select:V2QI
 	       (match_operand:V16QI 1 "nonimmediate_operand" "xm")
 	       (parallel [(const_int 0)
-			  (const_int 4)])))
+			  (const_int 8)])))
 	   (sign_extend:V2DI
 	     (vec_select:V2QI
 	       (match_dup 1)
 	       (parallel [(const_int 1)
-			  (const_int 5)]))))
+			  (const_int 9)]))))
 	 (plus:V2DI
 	   (sign_extend:V2DI
 	     (vec_select:V2QI
 	       (match_dup 1)
 	       (parallel [(const_int 2)
-			  (const_int 6)])))
+			  (const_int 10)])))
 	   (sign_extend:V2DI
 	     (vec_select:V2QI
 	       (match_dup 1)
 	       (parallel [(const_int 3)
-			  (const_int 7)])))))
+			  (const_int 11)])))))
 	(plus:V2DI
 	  (plus:V2DI
 	    (sign_extend:V2DI
 	      (vec_select:V2QI
 		(match_dup 1)
-		(parallel [(const_int 8)
+		(parallel [(const_int 4)
 			   (const_int 12)])))
 	    (sign_extend:V2DI
 	      (vec_select:V2QI
 		(match_dup 1)
-		(parallel [(const_int 9)
+		(parallel [(const_int 5)
 			   (const_int 13)]))))
 	  (plus:V2DI
 	    (sign_extend:V2DI
 	      (vec_select:V2QI
 		(match_dup 1)
-		(parallel [(const_int 10)
+		(parallel [(const_int 6)
 			   (const_int 14)])))
 	    (sign_extend:V2DI
 	      (vec_select:V2QI
 		(match_dup 1)
-		(parallel [(const_int 11)
+		(parallel [(const_int 7)
 			   (const_int 15)])))))))]
   "TARGET_XOP"
   "vphaddbq\t{%1, %0|%0, %1}"
@@ -10980,45 +10853,45 @@
 	     (vec_select:V2QI
 	       (match_operand:V16QI 1 "nonimmediate_operand" "xm")
 	       (parallel [(const_int 0)
-			  (const_int 4)])))
+			  (const_int 8)])))
 	   (sign_extend:V2DI
 	     (vec_select:V2QI
 	       (match_dup 1)
 	       (parallel [(const_int 1)
-			  (const_int 5)]))))
+			  (const_int 9)]))))
 	 (plus:V2DI
 	   (zero_extend:V2DI
 	     (vec_select:V2QI
 	       (match_dup 1)
 	       (parallel [(const_int 2)
-			  (const_int 6)])))
+			  (const_int 10)])))
 	   (zero_extend:V2DI
 	     (vec_select:V2QI
 	       (match_dup 1)
 	       (parallel [(const_int 3)
-			  (const_int 7)])))))
+			  (const_int 11)])))))
 	(plus:V2DI
 	  (plus:V2DI
 	    (zero_extend:V2DI
 	      (vec_select:V2QI
 		(match_dup 1)
-		(parallel [(const_int 8)
+		(parallel [(const_int 4)
 			   (const_int 12)])))
 	    (sign_extend:V2DI
 	      (vec_select:V2QI
 		(match_dup 1)
-		(parallel [(const_int 9)
+		(parallel [(const_int 5)
 			   (const_int 13)]))))
 	  (plus:V2DI
 	    (zero_extend:V2DI
 	      (vec_select:V2QI
 		(match_dup 1)
-		(parallel [(const_int 10)
+		(parallel [(const_int 6)
 			   (const_int 14)])))
 	    (zero_extend:V2DI
 	      (vec_select:V2QI
 		(match_dup 1)
-		(parallel [(const_int 11)
+		(parallel [(const_int 7)
 			   (const_int 15)])))))))]
   "TARGET_XOP"
   "vphaddubq\t{%1, %0|%0, %1}"
@@ -11294,7 +11167,8 @@
 	  (match_operand:SI 2 "const_0_to_<sserotatemax>_operand" "n")))]
   "TARGET_XOP"
 {
-  operands[3] = GEN_INT (( * 8) - INTVAL (operands[2]));
+  operands[3]
+    = GEN_INT (GET_MODE_BITSIZE (<ssescalarmode>mode) - INTVAL (operands[2]));
  return \"vprot<ssemodesuffix>\t{%3, %1, %0|%0, %1, %3}\";
 }
  [(set_attr "type" "sseishft")
@@ -11859,11 +11733,11 @@
 (define_insn "avx2_permvarv8si"
   [(set (match_operand:V8SI 0 "register_operand" "=x")
 	(unspec:V8SI
-	  [(match_operand:V8SI 1 "register_operand" "x")
-	   (match_operand:V8SI 2 "nonimmediate_operand" "xm")]
+	  [(match_operand:V8SI 1 "nonimmediate_operand" "xm")
+	   (match_operand:V8SI 2 "register_operand" "x")]
 	  UNSPEC_VPERMSI))]
   "TARGET_AVX2"
-  "vpermd\t{%2, %1, %0|%0, %1, %2}"
+  "vpermd\t{%1, %2, %0|%0, %2, %1}"
  [(set_attr "type" "sselog")
  (set_attr "prefix" "vex")
  (set_attr "mode" "OI")])
@@ -11884,11 +11758,11 @@
 (define_insn "avx2_permvarv8sf"
   [(set (match_operand:V8SF 0 "register_operand" "=x")
 	(unspec:V8SF
-	  [(match_operand:V8SF 1 "register_operand" "x")
-	   (match_operand:V8SF 2 "nonimmediate_operand" "xm")]
+	  [(match_operand:V8SF 1 "nonimmediate_operand" "xm")
+	   (match_operand:V8SI 2 "register_operand" "x")]
 	  UNSPEC_VPERMSF))]
  "TARGET_AVX2"
-  "vpermps\t{%2, %1, %0|%0, %1, %2}"
+  "vpermps\t{%1, %2, %0|%0, %2, %1}"
  [(set_attr "type" "sselog")
  (set_attr "prefix" "vex")
  (set_attr "mode" "OI")])
@@ -12446,7 +12320,7 @@
   (set_attr "mode" "<sseinsnmode>")])
 
 (define_insn "<avx_avx2>_maskstore<ssemodesuffix><avxsizesuffix>"
-  [(set (match_operand:V48_AVX2 0 "memory_operand" "=m")
+  [(set (match_operand:V48_AVX2 0 "memory_operand" "+m")
 	(unspec:V48_AVX2
 	  [(match_operand:<sseintvecmode> 1 "register_operand" "x")
 	   (match_operand:V48_AVX2 2 "register_operand" "x")
@@ -12596,7 +12470,7 @@
 	  (unspec:V8SF [(match_operand:V8HI 1 "register_operand" "x")]
 		       UNSPEC_VCVTPH2PS)
 	  (parallel [(const_int 0) (const_int 1)
-		     (const_int 1) (const_int 2)])))]
+		     (const_int 2) (const_int 3)])))]
   "TARGET_F16C"
   "vcvtph2ps\t{%1, %0|%0, %1}"
  [(set_attr "type" "ssecvt")
@@ -12813,3 +12687,49 @@
   [(set_attr "type" "ssemov")
    (set_attr "prefix" "vex")
    (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*avx2_gatherdi<mode>_3"
+  [(set (match_operand:<VEC_GATHER_SRCDI> 0 "register_operand" "=&x")
+	(vec_select:<VEC_GATHER_SRCDI>
+	  (unspec:VI4F_256
+	    [(match_operand:<VEC_GATHER_SRCDI> 2 "register_operand" "0")
+	     (match_operator:<ssescalarmode> 7 "vsib_mem_operator"
+	       [(unspec:P
+		  [(match_operand:P 3 "vsib_address_operand" "p")
+		   (match_operand:<VEC_GATHER_IDXDI> 4 "register_operand" "x")
+		   (match_operand:SI 6 "const1248_operand" "n")]
+		  UNSPEC_VSIBADDR)])
+	     (mem:BLK (scratch))
+	     (match_operand:<VEC_GATHER_SRCDI> 5 "register_operand" "1")]
+	    UNSPEC_GATHER)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)])))
+   (clobber (match_scratch:VI4F_256 1 "=&x"))]
+  "TARGET_AVX2"
+  "v<sseintprefix>gatherq<ssemodesuffix>\t{%5, %7, %0|%0, %7, %5}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "*avx2_gatherdi<mode>_4"
+  [(set (match_operand:<VEC_GATHER_SRCDI> 0 "register_operand" "=&x")
+	(vec_select:<VEC_GATHER_SRCDI>
+	  (unspec:VI4F_256
+	    [(pc)
+	     (match_operator:<ssescalarmode> 6 "vsib_mem_operator"
+	       [(unspec:P
+		  [(match_operand:P 2 "vsib_address_operand" "p")
+		   (match_operand:<VEC_GATHER_IDXDI> 3 "register_operand" "x")
+		   (match_operand:SI 5 "const1248_operand" "n")]
+		  UNSPEC_VSIBADDR)])
+	     (mem:BLK (scratch))
+	     (match_operand:<VEC_GATHER_SRCDI> 4 "register_operand" "1")]
+	    UNSPEC_GATHER)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)])))
+   (clobber (match_scratch:VI4F_256 1 "=&x"))]
+  "TARGET_AVX2"
+  "v<sseintprefix>gatherq<ssemodesuffix>\t{%4, %6, %0|%0, %6, %4}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
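
Usage sketch (an illustration, not part of the patch): after this change the
fma<mode>4 standard-name expanders accept both TARGET_FMA and TARGET_FMA4, and
the scalar SF/DF entries of the new FMAMODEM iterator are additionally gated
on TARGET_SSE_MATH. Assuming a compiler built with this patch, the C function
below should compile to a single fused multiply-add rather than a separate
multiply and add:

/* fma-example.c: gcc -O2 -mfma -mfpmath=sse -S fma-example.c
   (or -mfma4; the "isa" attribute of *fma_fmadd_<mode> selects the FMA3
   vfmadd132/213/231 forms or the four-operand FMA4 vfmaddss form).  */
float
fused_madd (float a, float b, float c)
{
  /* Expands through the fma<mode>4 pattern above.  */
  return __builtin_fmaf (a, b, c);
}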
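A second sketch, for the sse2_movnti<mode> change: widening the pattern from
SImode to the SWI48 iterator (and adding DImode to STORENT_MODE on 64-bit
targets) lets the 64-bit non-temporal store intrinsic emit a single movnti.
Assuming an x86_64 target:

/* movnti-example.c: gcc -O2 -S movnti-example.c  */
#include <emmintrin.h>

void
stream_store (long long *dst, long long value)
{
  /* One 64-bit movnti via the generated sse2_movntidi insn; before this
     patch only the SImode form of the pattern existed.  */
  _mm_stream_si64 (dst, value);
}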