Backport from mainline

[pf3gnuchains/gcc-fork.git] / gcc / config / i386 / sse.md
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md

index c2a6217..bf4b136 100644 (file)
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -21,7 +21,8 @@
  (define_c_enum "unspec" [
    ;; SSE
    UNSPEC_MOVNT
-  UNSPEC_MOVU
+  UNSPEC_LOADU
+  UNSPEC_STOREU
  
    ;; SSE3
    UNSPEC_LDDQU
@@ -395,8 +396,6 @@
  ;; Mix-n-match
  (define_mode_iterator AVX256MODE2P [V8SI V8SF V4DF])
  
-(define_mode_iterator FMAMODE [SF DF V4SF V2DF V8SF V4DF])
-
  ;; Mapping of immediate bits for blend instructions
  (define_mode_attr blendbits
    [(V8SF "255") (V4SF "15") (V4DF "15") (V2DF "3")])
@@ -582,23 +581,51 @@
    DONE;
  })
  
-(define_insn "<sse>_movu<ssemodesuffix><avxsizesuffix>"
-  [(set (match_operand:VF 0 "nonimmediate_operand" "=x,m")
+(define_insn "<sse>_loadu<ssemodesuffix><avxsizesuffix>"
+  [(set (match_operand:VF 0 "register_operand" "=x")
         (unspec:VF
-         [(match_operand:VF 1 "nonimmediate_operand" "xm,x")]
-         UNSPEC_MOVU))]
-  "TARGET_SSE && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+         [(match_operand:VF 1 "memory_operand" "m")]
+         UNSPEC_LOADU))]
+  "TARGET_SSE"
    "%vmovu<ssemodesuffix>\t{%1, %0|%0, %1}"
    [(set_attr "type" "ssemov")
     (set_attr "movu" "1")
     (set_attr "prefix" "maybe_vex")
     (set_attr "mode" "<MODE>")])
  
-(define_insn "<sse2>_movdqu<avxsizesuffix>"
-  [(set (match_operand:VI1 0 "nonimmediate_operand" "=x,m")
-       (unspec:VI1 [(match_operand:VI1 1 "nonimmediate_operand" "xm,x")]
-                   UNSPEC_MOVU))]
-  "TARGET_SSE2 && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+(define_insn "<sse>_storeu<ssemodesuffix><avxsizesuffix>"
+  [(set (match_operand:VF 0 "memory_operand" "=m")
+       (unspec:VF
+         [(match_operand:VF 1 "register_operand" "x")]
+         UNSPEC_STOREU))]
+  "TARGET_SSE"
+  "%vmovu<ssemodesuffix>\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "movu" "1")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "<sse2>_loaddqu<avxsizesuffix>"
+  [(set (match_operand:VI1 0 "register_operand" "=x")
+       (unspec:VI1 [(match_operand:VI1 1 "memory_operand" "m")]
+                   UNSPEC_LOADU))]
+  "TARGET_SSE2"
+  "%vmovdqu\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "movu" "1")
+   (set (attr "prefix_data16")
+     (if_then_else
+       (match_test "TARGET_AVX")
+     (const_string "*")
+     (const_string "1")))
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
+(define_insn "<sse2>_storedqu<avxsizesuffix>"
+  [(set (match_operand:VI1 0 "memory_operand" "=m")
+       (unspec:VI1 [(match_operand:VI1 1 "register_operand" "x")]
+                   UNSPEC_STOREU))]
+  "TARGET_SSE2"
    "%vmovdqu\t{%1, %0|%0, %1}"
    [(set_attr "type" "ssemov")
     (set_attr "movu" "1")
@@ -1687,28 +1714,12 @@
  
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;;
-;; FMA4 floating point multiply/accumulate instructions.  This
-;; includes the scalar version of the instructions as well as the
-;; vector.
+;; FMA floating point multiply/accumulate instructions.  These include
+;; scalar versions of the instructions as well as vector versions.
  ;;
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  
-;; In order to match (*a * *b) + *c, particularly when vectorizing, allow
-;; combine to generate a multiply/add with two memory references.  We then
-;; split this insn, into loading up the destination register with one of the
-;; memory operations.  If we don't manage to split the insn, reload will
-;; generate the appropriate moves.  The reason this is needed, is that combine
-;; has already folded one of the memory references into both the multiply and
-;; add insns, and it can't generate a new pseudo.  I.e.:
-;;     (set (reg1) (mem (addr1)))
-;;     (set (reg2) (mult (reg1) (mem (addr2))))
-;;     (set (reg3) (plus (reg2) (mem (addr3))))
-;;
-;; ??? This is historic, pre-dating the gimple fma transformation.
-;; We could now properly represent that only one memory operand is
-;; allowed and not be penalized during optimization.
-
-;; Intrinsic FMA operations.
+(define_mode_iterator FMAMODE [SF DF V4SF V2DF V8SF V4DF])
  
  ;; The standard names for fma is only available with SSE math enabled.
  (define_expand "fma<mode>4"
@@ -1743,7 +1754,7 @@
           (neg:FMAMODE (match_operand:FMAMODE 3 "nonimmediate_operand"))))]
    "(TARGET_FMA || TARGET_FMA4) && TARGET_SSE_MATH")
  
-;; The builtin for fma4intrin.h is not constrained by SSE math enabled.
+;; The builtin for intrinsics is not constrained by SSE math enabled.
  (define_expand "fma4i_fmadd_<mode>"
    [(set (match_operand:FMAMODE 0 "register_operand")
         (fma:FMAMODE
@@ -1752,70 +1763,137 @@
           (match_operand:FMAMODE 3 "nonimmediate_operand")))]
    "TARGET_FMA || TARGET_FMA4")
  
-(define_insn "*fma4i_fmadd_<mode>"
-  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x")
+(define_insn "*fma_fmadd_<mode>"
+  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x,x,x")
         (fma:FMAMODE
-         (match_operand:FMAMODE 1 "nonimmediate_operand" "%x,x")
-         (match_operand:FMAMODE 2 "nonimmediate_operand" " x,m")
-         (match_operand:FMAMODE 3 "nonimmediate_operand" "xm,x")))]
-  "TARGET_FMA4"
-  "vfmadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
-  [(set_attr "type" "ssemuladd")
+         (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0,x, x,x")
+         (match_operand:FMAMODE 2 "nonimmediate_operand" "xm, x,xm,x,m")
+         (match_operand:FMAMODE 3 "nonimmediate_operand" " x,xm,0,xm,x")))]
+  "TARGET_FMA || TARGET_FMA4"
+  "@
+   vfmadd132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
+   vfmadd213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
+   vfmadd231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}
+   vfmadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   vfmadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "fma,fma,fma,fma4,fma4")
+   (set_attr "type" "ssemuladd")
     (set_attr "mode" "<MODE>")])
  
-(define_insn "*fma4i_fmsub_<mode>"
-  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x")
+(define_insn "*fma_fmsub_<mode>"
+  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x,x,x")
         (fma:FMAMODE
-         (match_operand:FMAMODE 1 "nonimmediate_operand" "%x,x")
-         (match_operand:FMAMODE 2 "nonimmediate_operand" " x,m")
+         (match_operand:FMAMODE   1 "nonimmediate_operand" "%0, 0,x, x,x")
+         (match_operand:FMAMODE   2 "nonimmediate_operand" "xm, x,xm,x,m")
           (neg:FMAMODE
-           (match_operand:FMAMODE 3 "nonimmediate_operand" "xm,x"))))]
-  "TARGET_FMA4"
-  "vfmsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
-  [(set_attr "type" "ssemuladd")
+           (match_operand:FMAMODE 3 "nonimmediate_operand" " x,xm,0,xm,x"))))]
+  "TARGET_FMA || TARGET_FMA4"
+  "@
+   vfmsub132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
+   vfmsub213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
+   vfmsub231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}
+   vfmsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   vfmsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "fma,fma,fma,fma4,fma4")
+   (set_attr "type" "ssemuladd")
     (set_attr "mode" "<MODE>")])
  
-(define_insn "*fma4i_fnmadd_<mode>"
-  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x")
+(define_insn "*fma_fnmadd_<mode>"
+  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x,x,x")
         (fma:FMAMODE
           (neg:FMAMODE
-           (match_operand:FMAMODE 1 "nonimmediate_operand" "%x,x"))
-         (match_operand:FMAMODE   2 "nonimmediate_operand" " x,m")
-         (match_operand:FMAMODE   3 "nonimmediate_operand" "xm,x")))]
-  "TARGET_FMA4"
-  "vfnmadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
-  [(set_attr "type" "ssemuladd")
+           (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0,x, x,x"))
+         (match_operand:FMAMODE   2 "nonimmediate_operand" "xm, x,xm,x,m")
+         (match_operand:FMAMODE   3 "nonimmediate_operand" " x,xm,0,xm,x")))]
+  "TARGET_FMA || TARGET_FMA4"
+  "@
+   vfnmadd132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
+   vfnmadd213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
+   vfnmadd231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}
+   vfnmadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   vfnmadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "fma,fma,fma,fma4,fma4")
+   (set_attr "type" "ssemuladd")
     (set_attr "mode" "<MODE>")])
  
-(define_insn "*fma4i_fnmsub_<mode>"
-  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x")
+(define_insn "*fma_fnmsub_<mode>"
+  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x,x,x")
         (fma:FMAMODE
           (neg:FMAMODE
-           (match_operand:FMAMODE 1 "nonimmediate_operand" "%x,x"))
-         (match_operand:FMAMODE   2 "nonimmediate_operand" " x,m")
+           (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0,x, x,x"))
+         (match_operand:FMAMODE   2 "nonimmediate_operand" "xm, x,xm,x,m")
           (neg:FMAMODE
-           (match_operand:FMAMODE 3 "nonimmediate_operand" "xm,x"))))]
-  "TARGET_FMA4"
-  "vfnmsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
-  [(set_attr "type" "ssemuladd")
+           (match_operand:FMAMODE 3 "nonimmediate_operand" " x,xm,0,xm,x"))))]
+  "TARGET_FMA || TARGET_FMA4"
+  "@
+   vfnmsub132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
+   vfnmsub213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
+   vfnmsub231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}
+   vfnmsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   vfnmsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "fma,fma,fma,fma4,fma4")
+   (set_attr "type" "ssemuladd")
     (set_attr "mode" "<MODE>")])
  
-;; Scalar versions of the above.  Unlike ADDSS et al, these write the
-;; entire destination register, with the high-order elements zeroed.
+;; FMA parallel floating point multiply addsub and subadd operations.
  
-(define_expand "fma4i_vmfmadd_<mode>"
-  [(set (match_operand:VF_128 0 "register_operand")
-       (vec_merge:VF_128
-         (fma:VF_128
-           (match_operand:VF_128 1 "nonimmediate_operand")
-           (match_operand:VF_128 2 "nonimmediate_operand")
-           (match_operand:VF_128 3 "nonimmediate_operand"))
-         (match_dup 4)
-         (const_int 1)))]
-  "TARGET_FMA4"
-{
-  operands[4] = CONST0_RTX (<MODE>mode);
-})
+;; It would be possible to represent these without the UNSPEC as
+;;
+;; (vec_merge
+;;   (fma op1 op2 op3)
+;;   (fma op1 op2 (neg op3))
+;;   (merge-const))
+;;
+;; But this doesn't seem useful in practice.
+
+(define_expand "fmaddsub_<mode>"
+  [(set (match_operand:VF 0 "register_operand")
+       (unspec:VF
+         [(match_operand:VF 1 "nonimmediate_operand")
+          (match_operand:VF 2 "nonimmediate_operand")
+          (match_operand:VF 3 "nonimmediate_operand")]
+         UNSPEC_FMADDSUB))]
+  "TARGET_FMA || TARGET_FMA4")
+
+(define_insn "*fma_fmaddsub_<mode>"
+  [(set (match_operand:VF 0 "register_operand" "=x,x,x,x,x")
+       (unspec:VF
+         [(match_operand:VF 1 "nonimmediate_operand" "%0, 0,x, x,x")
+          (match_operand:VF 2 "nonimmediate_operand" "xm, x,xm,x,m")
+          (match_operand:VF 3 "nonimmediate_operand" " x,xm,0,xm,x")]
+         UNSPEC_FMADDSUB))]
+  "TARGET_FMA || TARGET_FMA4"
+  "@
+   vfmaddsub132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
+   vfmaddsub213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
+   vfmaddsub231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}
+   vfmaddsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   vfmaddsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "fma,fma,fma,fma4,fma4")
+   (set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*fma_fmsubadd_<mode>"
+  [(set (match_operand:VF 0 "register_operand" "=x,x,x,x,x")
+       (unspec:VF
+         [(match_operand:VF   1 "nonimmediate_operand" "%0, 0,x, x,x")
+          (match_operand:VF   2 "nonimmediate_operand" "xm, x,xm,x,m")
+          (neg:VF
+            (match_operand:VF 3 "nonimmediate_operand" " x,xm,0,xm,x"))]
+         UNSPEC_FMADDSUB))]
+  "TARGET_FMA || TARGET_FMA4"
+  "@
+   vfmsubadd132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
+   vfmsubadd213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
+   vfmsubadd231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}
+   vfmsubadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}
+   vfmsubadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "fma,fma,fma,fma4,fma4")
+   (set_attr "type" "ssemuladd")
+   (set_attr "mode" "<MODE>")])
+
+;; FMA3 floating point scalar intrinsics. These merge result with
+;; high-order elements from the destination register.
  
  (define_expand "fmai_vmfmadd_<mode>"
    [(set (match_operand:VF_128 0 "register_operand")
@@ -1824,82 +1902,95 @@
             (match_operand:VF_128 1 "nonimmediate_operand")
             (match_operand:VF_128 2 "nonimmediate_operand")
             (match_operand:VF_128 3 "nonimmediate_operand"))
-         (match_dup 0)
+         (match_dup 1)
           (const_int 1)))]
    "TARGET_FMA")
  
  (define_insn "*fmai_fmadd_<mode>"
-  [(set (match_operand:VF_128 0 "register_operand" "=x,x,x")
+  [(set (match_operand:VF_128 0 "register_operand" "=x,x")
          (vec_merge:VF_128
           (fma:VF_128
-           (match_operand:VF_128 1 "nonimmediate_operand" "%0, 0,x")
-           (match_operand:VF_128 2 "nonimmediate_operand" "xm, x,xm")
-           (match_operand:VF_128 3 "nonimmediate_operand" " x,xm,0"))
-         (match_dup 0)
+           (match_operand:VF_128 1 "nonimmediate_operand" " 0, 0")
+           (match_operand:VF_128 2 "nonimmediate_operand" "xm, x")
+           (match_operand:VF_128 3 "nonimmediate_operand" " x,xm"))
+         (match_dup 1)
           (const_int 1)))]
    "TARGET_FMA"
    "@
     vfmadd132<ssescalarmodesuffix>\t{%2, %3, %0|%0, %3, %2}
-   vfmadd213<ssescalarmodesuffix>\t{%3, %2, %0|%0, %2, %3}
-   vfmadd231<ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+   vfmadd213<ssescalarmodesuffix>\t{%3, %2, %0|%0, %2, %3}"
    [(set_attr "type" "ssemuladd")
     (set_attr "mode" "<MODE>")])
  
  (define_insn "*fmai_fmsub_<mode>"
-  [(set (match_operand:VF_128 0 "register_operand" "=x,x,x")
+  [(set (match_operand:VF_128 0 "register_operand" "=x,x")
          (vec_merge:VF_128
           (fma:VF_128
-           (match_operand:VF_128   1 "nonimmediate_operand" "%0, 0,x")
-           (match_operand:VF_128   2 "nonimmediate_operand" "xm, x,xm")
+           (match_operand:VF_128   1 "nonimmediate_operand" " 0, 0")
+           (match_operand:VF_128   2 "nonimmediate_operand" "xm, x")
             (neg:VF_128
-             (match_operand:VF_128 3 "nonimmediate_operand" " x,xm,0")))
-         (match_dup 0)
+             (match_operand:VF_128 3 "nonimmediate_operand" " x,xm")))
+         (match_dup 1)
           (const_int 1)))]
    "TARGET_FMA"
    "@
     vfmsub132<ssescalarmodesuffix>\t{%2, %3, %0|%0, %3, %2}
-   vfmsub213<ssescalarmodesuffix>\t{%3, %2, %0|%0, %2, %3}
-   vfmsub231<ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+   vfmsub213<ssescalarmodesuffix>\t{%3, %2, %0|%0, %2, %3}"
    [(set_attr "type" "ssemuladd")
     (set_attr "mode" "<MODE>")])
  
  (define_insn "*fmai_fnmadd_<mode>"
-  [(set (match_operand:VF_128 0 "register_operand" "=x,x,x")
+  [(set (match_operand:VF_128 0 "register_operand" "=x,x")
          (vec_merge:VF_128
           (fma:VF_128
             (neg:VF_128
-             (match_operand:VF_128 1 "nonimmediate_operand" "%0, 0,x"))
-           (match_operand:VF_128   2 "nonimmediate_operand" "xm, x,xm")
-           (match_operand:VF_128   3 "nonimmediate_operand" " x,xm,0"))
-         (match_dup 0)
+             (match_operand:VF_128 2 "nonimmediate_operand" "xm, x"))
+           (match_operand:VF_128   1 "nonimmediate_operand" " 0, 0")
+           (match_operand:VF_128   3 "nonimmediate_operand" " x,xm"))
+         (match_dup 1)
           (const_int 1)))]
    "TARGET_FMA"
    "@
     vfnmadd132<ssescalarmodesuffix>\t{%2, %3, %0|%0, %3, %2}
-   vfnmadd213<ssescalarmodesuffix>\t{%3, %2, %0|%0, %2, %3}
-   vfnmadd231<ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+   vfnmadd213<ssescalarmodesuffix>\t{%3, %2, %0|%0, %2, %3}"
    [(set_attr "type" "ssemuladd")
     (set_attr "mode" "<MODE>")])
  
  (define_insn "*fmai_fnmsub_<mode>"
-  [(set (match_operand:VF_128 0 "register_operand" "=x,x,x")
+  [(set (match_operand:VF_128 0 "register_operand" "=x,x")
          (vec_merge:VF_128
           (fma:VF_128
             (neg:VF_128
-             (match_operand:VF_128 1 "nonimmediate_operand" "%0, 0,x"))
-           (match_operand:VF_128   2 "nonimmediate_operand" "xm, x,xm")
+             (match_operand:VF_128 2 "nonimmediate_operand" "xm, x"))
+           (match_operand:VF_128   1 "nonimmediate_operand" " 0, 0")
             (neg:VF_128
-             (match_operand:VF_128 3 "nonimmediate_operand" " x,xm,0")))
-         (match_dup 0)
+             (match_operand:VF_128 3 "nonimmediate_operand" " x,xm")))
+         (match_dup 1)
           (const_int 1)))]
    "TARGET_FMA"
    "@
     vfnmsub132<ssescalarmodesuffix>\t{%2, %3, %0|%0, %3, %2}
-   vfnmsub213<ssescalarmodesuffix>\t{%3, %2, %0|%0, %2, %3}
-   vfnmsub231<ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %2}"
+   vfnmsub213<ssescalarmodesuffix>\t{%3, %2, %0|%0, %2, %3}"
    [(set_attr "type" "ssemuladd")
     (set_attr "mode" "<MODE>")])
  
+;; FMA4 floating point scalar intrinsics.  These write the
+;; entire destination register, with the high-order elements zeroed.
+
+(define_expand "fma4i_vmfmadd_<mode>"
+  [(set (match_operand:VF_128 0 "register_operand")
+       (vec_merge:VF_128
+         (fma:VF_128
+           (match_operand:VF_128 1 "nonimmediate_operand")
+           (match_operand:VF_128 2 "nonimmediate_operand")
+           (match_operand:VF_128 3 "nonimmediate_operand"))
+         (match_dup 4)
+         (const_int 1)))]
+  "TARGET_FMA4"
+{
+  operands[4] = CONST0_RTX (<MODE>mode);
+})
+
  (define_insn "*fma4i_vmfmadd_<mode>"
    [(set (match_operand:VF_128 0 "register_operand" "=x,x")
         (vec_merge:VF_128
@@ -1962,152 +2053,6 @@
  
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;;
-;; FMA4 Parallel floating point multiply addsub and subadd operations.
-;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-;; It would be possible to represent these without the UNSPEC as
-;;
-;; (vec_merge
-;;   (fma op1 op2 op3)
-;;   (fma op1 op2 (neg op3))
-;;   (merge-const))
-;;
-;; But this doesn't seem useful in practice.
-
-(define_expand "fmaddsub_<mode>"
-  [(set (match_operand:VF 0 "register_operand")
-       (unspec:VF
-         [(match_operand:VF 1 "nonimmediate_operand")
-          (match_operand:VF 2 "nonimmediate_operand")
-          (match_operand:VF 3 "nonimmediate_operand")]
-         UNSPEC_FMADDSUB))]
-  "TARGET_FMA || TARGET_FMA4")
-
-(define_insn "*fma4_fmaddsub_<mode>"
-  [(set (match_operand:VF 0 "register_operand" "=x,x")
-       (unspec:VF
-         [(match_operand:VF 1 "nonimmediate_operand" "%x,x")
-          (match_operand:VF 2 "nonimmediate_operand" " x,m")
-          (match_operand:VF 3 "nonimmediate_operand" "xm,x")]
-         UNSPEC_FMADDSUB))]
-  "TARGET_FMA4"
-  "vfmaddsub<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
-  [(set_attr "type" "ssemuladd")
-   (set_attr "mode" "<MODE>")])
-
-(define_insn "*fma4_fmsubadd_<mode>"
-  [(set (match_operand:VF 0 "register_operand" "=x,x")
-       (unspec:VF
-         [(match_operand:VF 1 "nonimmediate_operand" "%x,x")
-          (match_operand:VF 2 "nonimmediate_operand" " x,m")
-          (neg:VF
-            (match_operand:VF 3 "nonimmediate_operand" "xm,x"))]
-         UNSPEC_FMADDSUB))]
-  "TARGET_FMA4"
-  "vfmsubadd<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
-  [(set_attr "type" "ssemuladd")
-   (set_attr "mode" "<MODE>")])
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;
-;; FMA3 floating point multiply/accumulate instructions.
-;;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-(define_insn "*fma_fmadd_<mode>"
-  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x")
-       (fma:FMAMODE
-         (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0,x")
-         (match_operand:FMAMODE 2 "nonimmediate_operand" "xm, x,xm")
-         (match_operand:FMAMODE 3 "nonimmediate_operand" " x,xm,0")))]
-  "TARGET_FMA"
-  "@
-   vfmadd132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
-   vfmadd213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
-   vfmadd231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "ssemuladd")
-   (set_attr "mode" "<MODE>")])
-
-(define_insn "*fma_fmsub_<mode>"
-  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x")
-       (fma:FMAMODE
-         (match_operand:FMAMODE   1 "nonimmediate_operand" "%0, 0,x")
-         (match_operand:FMAMODE   2 "nonimmediate_operand" "xm, x,xm")
-         (neg:FMAMODE
-           (match_operand:FMAMODE 3 "nonimmediate_operand" " x,xm,0"))))]
-  "TARGET_FMA"
-  "@
-   vfmsub132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
-   vfmsub213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
-   vfmsub231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "ssemuladd")
-   (set_attr "mode" "<MODE>")])
-
-(define_insn "*fma_fnmadd_<mode>"
-  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x")
-       (fma:FMAMODE
-         (neg:FMAMODE
-           (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0,x"))
-         (match_operand:FMAMODE   2 "nonimmediate_operand" "xm, x,xm")
-         (match_operand:FMAMODE   3 "nonimmediate_operand" " x,xm,0")))]
-  "TARGET_FMA"
-  "@
-   vfnmadd132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
-   vfnmadd213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
-   vfnmadd231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "ssemuladd")
-   (set_attr "mode" "<MODE>")])
-
-(define_insn "*fma_fnmsub_<mode>"
-  [(set (match_operand:FMAMODE 0 "register_operand" "=x,x,x")
-       (fma:FMAMODE
-         (neg:FMAMODE
-           (match_operand:FMAMODE 1 "nonimmediate_operand" "%0, 0,x"))
-         (match_operand:FMAMODE   2 "nonimmediate_operand" "xm, x,xm")
-         (neg:FMAMODE
-           (match_operand:FMAMODE 3 "nonimmediate_operand" " x,xm,0"))))]
-  "TARGET_FMA"
-  "@
-   vfnmsub132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
-   vfnmsub213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
-   vfnmsub231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "ssemuladd")
-   (set_attr "mode" "<MODE>")])
-
-(define_insn "*fma_fmaddsub_<mode>"
-  [(set (match_operand:VF 0 "register_operand" "=x,x,x")
-       (unspec:VF
-         [(match_operand:VF 1 "nonimmediate_operand" "%0, 0,x")
-          (match_operand:VF 2 "nonimmediate_operand" "xm, x,xm")
-          (match_operand:VF 3 "nonimmediate_operand" " x,xm,0")]
-         UNSPEC_FMADDSUB))]
-  "TARGET_FMA"
-  "@
-   vfmaddsub132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
-   vfmaddsub213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
-   vfmaddsub231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "ssemuladd")
-   (set_attr "mode" "<MODE>")])
-
-(define_insn "*fma_fmsubadd_<mode>"
-  [(set (match_operand:VF 0 "register_operand" "=x,x,x")
-       (unspec:VF
-         [(match_operand:VF   1 "nonimmediate_operand" "%0, 0,x")
-          (match_operand:VF   2 "nonimmediate_operand" "xm, x,xm")
-          (neg:VF
-            (match_operand:VF 3 "nonimmediate_operand" " x,xm,0"))]
-         UNSPEC_FMADDSUB))]
-  "TARGET_FMA"
-  "@
-   vfmsubadd132<ssemodesuffix>\t{%2, %3, %0|%0, %3, %2}
-   vfmsubadd213<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
-   vfmsubadd231<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "ssemuladd")
-   (set_attr "mode" "<MODE>")])
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;;
  ;; Parallel single-precision floating point conversion operations
  ;;
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -4357,7 +4302,7 @@
    (set_attr "type" "sselog,sselog,sselog,ssemov,ssemov,ssemov")
     (set_attr "prefix_data16" "*,*,*,1,*,1")
     (set_attr "prefix" "orig,vex,maybe_vex,orig,vex,maybe_vex")
-   (set_attr "mode" "V2DF,V2DF,V2DF,V1DF,V1DF,V1DF")])
+   (set_attr "mode" "V2DF,V2DF,DF,V1DF,V1DF,V1DF")])
  
  ;; Recall that the 256-bit unpck insns only shuffle within their lanes.
  (define_expand "avx_movddup256"
@@ -4458,7 +4403,7 @@
     (set_attr "type" "sselog,sselog,sselog,ssemov,ssemov,ssemov")
     (set_attr "prefix_data16" "*,*,*,1,*,1")
     (set_attr "prefix" "orig,vex,maybe_vex,orig,vex,maybe_vex")
-   (set_attr "mode" "V2DF,V2DF,V2DF,V1DF,V1DF,V1DF")])
+   (set_attr "mode" "V2DF,V2DF,DF,V1DF,V1DF,V1DF")])
  
  (define_split
    [(set (match_operand:V2DF 0 "memory_operand" "")
@@ -4915,7 +4860,7 @@
    [(set_attr "isa" "noavx,sse3")
     (set_attr "type" "sselog1")
     (set_attr "prefix" "orig,maybe_vex")
-   (set_attr "mode" "V2DF")])
+   (set_attr "mode" "V2DF,DF")])
  
  (define_insn "*vec_concatv2df"
    [(set (match_operand:V2DF 0 "register_operand"     "=x,x,x,x,x,x,x,x")
@@ -9619,7 +9564,7 @@
     (set_attr "mode" "TI")])
  
  (define_insn "<sse4_1_avx2>_pblendvb"
-  [(set (match_operand:VI1_AVX2 0 "reg_not_xmm0_operand" "=x,x")
+  [(set (match_operand:VI1_AVX2 0 "reg_not_xmm0_operand_maybe_avx" "=x,x")
         (unspec:VI1_AVX2
           [(match_operand:VI1_AVX2 1 "reg_not_xmm0_operand_maybe_avx"  "0,x")
            (match_operand:VI1_AVX2 2 "nonimm_not_xmm0_operand_maybe_avx" "xm,xm")
@@ -12370,7 +12315,7 @@
     (set_attr "mode" "<sseinsnmode>")])
  
  (define_insn "<avx_avx2>_maskstore<ssemodesuffix><avxsizesuffix>"
-  [(set (match_operand:V48_AVX2 0 "memory_operand" "=m")
+  [(set (match_operand:V48_AVX2 0 "memory_operand" "+m")
         (unspec:V48_AVX2
           [(match_operand:<sseintvecmode> 1 "register_operand" "x")
            (match_operand:V48_AVX2 2 "register_operand" "x")