OSDN Git Service

2007-05-25 H.J. Lu <hongjiu.lu@intel.com>
[pf3gnuchains/gcc-fork.git] / gcc / config / i386 / sse.md
index a17e7a9..d635b59 100644 (file)
@@ -1,5 +1,5 @@
 ;; GCC machine description for SSE instructions
-;; Copyright (C) 2005, 2006
+;; Copyright (C) 2005, 2006, 2007
 ;; Free Software Foundation, Inc.
 ;;
 ;; This file is part of GCC.
@@ -60,7 +60,9 @@
 (define_insn "*mov<mode>_internal"
   [(set (match_operand:SSEMODEI 0 "nonimmediate_operand" "=x,x ,m")
        (match_operand:SSEMODEI 1 "nonimmediate_or_sse_const_operand"  "C ,xm,x"))]
-  "TARGET_SSE && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "TARGET_SSE
+   && (register_operand (operands[0], <MODE>mode)
+       || register_operand (operands[1], <MODE>mode))"
 {
   switch (which_alternative)
     {
          (const_string "V4SF")
          (const_string "TI")))])
 
+;; Move a DI from a 32-bit register pair (e.g. %edx:%eax) to an xmm.
+;; We'd rather avoid this entirely; if the 32-bit reg pair was loaded
+;; from memory, we'd prefer to load the memory directly into the %xmm
+;; register.  To facilitate this happy circumstance, this pattern won't
+;; split until after register allocation.  If the 64-bit value didn't
+;; come from memory, this is the best we can do.  This is much better
+;; than storing %edx:%eax into a stack temporary and loading an %xmm
+;; from there.
+
+(define_insn_and_split "movdi_to_sse"
+  [(parallel
+    [(set (match_operand:V4SI 0 "register_operand" "=?x,x")
+         (subreg:V4SI (match_operand:DI 1 "nonimmediate_operand" "r,m") 0))
+     (clobber (match_scratch:V4SI 2 "=&x,X"))])]
+  "!TARGET_64BIT && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  switch (which_alternative)
+    {
+    case 0:
+      /* The DImode arrived in a pair of integral registers (e.g. %edx:%eax).
+        Assemble the 64-bit DImode value in an xmm register.  */
+      emit_insn (gen_sse2_loadld (operands[0], CONST0_RTX (V4SImode),
+                                 gen_rtx_SUBREG (SImode, operands[1], 0)));
+      emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode),
+                                 gen_rtx_SUBREG (SImode, operands[1], 4)));
+      emit_insn (gen_sse2_punpckldq (operands[0], operands[0], operands[2]));
+      break;
+
+    case 1:
+      emit_insn (gen_vec_concatv2di (operands[0], operands[1], const0_rtx));
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+  DONE;
+})
+
 (define_expand "movv4sf"
   [(set (match_operand:V4SF 0 "nonimmediate_operand" "")
        (match_operand:V4SF 1 "nonimmediate_operand" ""))]
 (define_insn "*movv4sf_internal"
   [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,m")
        (match_operand:V4SF 1 "nonimmediate_or_sse_const_operand" "C,xm,x"))]
-  "TARGET_SSE"
+  "TARGET_SSE
+   && (register_operand (operands[0], V4SFmode)
+       || register_operand (operands[1], V4SFmode))"
 {
   switch (which_alternative)
     {
 (define_insn "*movv2df_internal"
   [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,m")
        (match_operand:V2DF 1 "nonimmediate_or_sse_const_operand" "C,xm,x"))]
-  "TARGET_SSE && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+  "TARGET_SSE
+   && (register_operand (operands[0], V2DFmode)
+       || register_operand (operands[1], V2DFmode))"
 {
   switch (which_alternative)
     {
   "TARGET_SSE2 && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
   "movdqu\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssemov")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse_movntv4sf"
   "TARGET_SSE2"
   "movntdq\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssecvt")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_movntsi"
   "TARGET_SSE3"
   "lddqu\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssecvt")
+   (set_attr "prefix_rep" "1")
    (set_attr "mode" "TI")])
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 (define_insn "sse_vmaddv4sf3"
   [(set (match_operand:V4SF 0 "register_operand" "=x")
        (vec_merge:V4SF
-         (plus:V4SF (match_operand:V4SF 1 "nonimmediate_operand" "%0")
+         (plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
                     (match_operand:V4SF 2 "nonimmediate_operand" "xm"))
          (match_dup 1)
          (const_int 1)))]
 (define_insn "sse_vmmulv4sf3"
   [(set (match_operand:V4SF 0 "register_operand" "=x")
        (vec_merge:V4SF
-         (mult:V4SF (match_operand:V4SF 1 "nonimmediate_operand" "%0")
+         (mult:V4SF (match_operand:V4SF 1 "register_operand" "0")
                     (match_operand:V4SF 2 "nonimmediate_operand" "xm"))
          (match_dup 1)
          (const_int 1)))]
   [(set_attr "type" "sse")
    (set_attr "mode" "V4SF")])
 
-(define_insn "*sse_vmsmaxv4sf3_finite"
-  [(set (match_operand:V4SF 0 "register_operand" "=x")
-       (vec_merge:V4SF
-        (smax:V4SF (match_operand:V4SF 1 "nonimmediate_operand" "%0")
-                   (match_operand:V4SF 2 "nonimmediate_operand" "xm"))
-        (match_dup 1)
-        (const_int 1)))]
-  "TARGET_SSE && flag_finite_math_only
-   && ix86_binary_operator_ok (SMAX, V4SFmode, operands)"
-  "maxss\t{%2, %0|%0, %2}"
-  [(set_attr "type" "sse")
-   (set_attr "mode" "SF")])
-
 (define_insn "sse_vmsmaxv4sf3"
   [(set (match_operand:V4SF 0 "register_operand" "=x")
        (vec_merge:V4SF
   [(set_attr "type" "sse")
    (set_attr "mode" "V4SF")])
 
-(define_insn "*sse_vmsminv4sf3_finite"
-  [(set (match_operand:V4SF 0 "register_operand" "=x")
-       (vec_merge:V4SF
-        (smin:V4SF (match_operand:V4SF 1 "nonimmediate_operand" "%0")
-                   (match_operand:V4SF 2 "nonimmediate_operand" "xm"))
-        (match_dup 1)
-        (const_int 1)))]
-  "TARGET_SSE && flag_finite_math_only
-   && ix86_binary_operator_ok (SMIN, V4SFmode, operands)"
-  "minss\t{%2, %0|%0, %2}"
-  [(set_attr "type" "sse")
-   (set_attr "mode" "SF")])
-
 (define_insn "sse_vmsminv4sf3"
   [(set (match_operand:V4SF 0 "register_operand" "=x")
        (vec_merge:V4SF
   "TARGET_SSE3"
   "addsubps\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseadd")
+   (set_attr "prefix_rep" "1")
    (set_attr "mode" "V4SF")])
 
 (define_insn "sse3_haddv4sf3"
   "TARGET_SSE3"
   "haddps\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseadd")
+   (set_attr "prefix_rep" "1")
    (set_attr "mode" "V4SF")])
 
 (define_insn "sse3_hsubv4sf3"
   "TARGET_SSE3"
   "hsubps\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseadd")
+   (set_attr "prefix_rep" "1")
    (set_attr "mode" "V4SF")])
 
 (define_expand "reduc_splus_v4sf"
   "cvtsi2ss\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseicvt")
    (set_attr "athlon_decode" "vector,double")
+   (set_attr "amdfam10_decode" "vector,double")
    (set_attr "mode" "SF")])
 
 (define_insn "sse_cvtsi2ssq"
   "cvtsi2ssq\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseicvt")
    (set_attr "athlon_decode" "vector,double")
+   (set_attr "amdfam10_decode" "vector,double")
    (set_attr "mode" "SF")])
 
 (define_insn "sse_cvtss2si"
   "cvtss2si\t{%1, %0|%0, %1}"
   [(set_attr "type" "sseicvt")
    (set_attr "athlon_decode" "double,vector")
+   (set_attr "prefix_rep" "1")
    (set_attr "mode" "SI")])
 
 (define_insn "sse_cvtss2si_2"
   "cvtss2si\t{%1, %0|%0, %1}"
   [(set_attr "type" "sseicvt")
    (set_attr "athlon_decode" "double,vector")
+   (set_attr "amdfam10_decode" "double,double")
+   (set_attr "prefix_rep" "1")
    (set_attr "mode" "SI")])
 
 (define_insn "sse_cvtss2siq"
   "cvtss2siq\t{%1, %0|%0, %1}"
   [(set_attr "type" "sseicvt")
    (set_attr "athlon_decode" "double,vector")
+   (set_attr "prefix_rep" "1")
    (set_attr "mode" "DI")])
 
 (define_insn "sse_cvtss2siq_2"
   "cvtss2siq\t{%1, %0|%0, %1}"
   [(set_attr "type" "sseicvt")
    (set_attr "athlon_decode" "double,vector")
+   (set_attr "amdfam10_decode" "double,double")
+   (set_attr "prefix_rep" "1")
    (set_attr "mode" "DI")])
 
 (define_insn "sse_cvttss2si"
   "cvttss2si\t{%1, %0|%0, %1}"
   [(set_attr "type" "sseicvt")
    (set_attr "athlon_decode" "double,vector")
+   (set_attr "amdfam10_decode" "double,double")
+   (set_attr "prefix_rep" "1")
    (set_attr "mode" "SI")])
 
 (define_insn "sse_cvttss2siq"
   "cvttss2siq\t{%1, %0|%0, %1}"
   [(set_attr "type" "sseicvt")
    (set_attr "athlon_decode" "double,vector")
+   (set_attr "amdfam10_decode" "double,double")
+   (set_attr "prefix_rep" "1")
    (set_attr "mode" "DI")])
 
 (define_insn "sse2_cvtdq2ps"
   "TARGET_SSE2"
   "cvtdq2ps\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssecvt")
-   (set_attr "mode" "V2DF")])
+   (set_attr "mode" "V4SF")])
 
 (define_insn "sse2_cvtps2dq"
   [(set (match_operand:V4SI 0 "register_operand" "=x")
   "TARGET_SSE2"
   "cvtps2dq\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssecvt")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_cvttps2dq"
   "TARGET_SSE2"
   "cvttps2dq\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssecvt")
+   (set_attr "prefix_rep" "1")
    (set_attr "mode" "TI")])
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   "TARGET_SSE3"
   "movshdup\t{%1, %0|%0, %1}"
   [(set_attr "type" "sse")
+   (set_attr "prefix_rep" "1")
    (set_attr "mode" "V4SF")])
 
 (define_insn "sse3_movsldup"
   "TARGET_SSE3"
   "movsldup\t{%1, %0|%0, %1}"
   [(set_attr "type" "sse")
+   (set_attr "prefix_rep" "1")
    (set_attr "mode" "V4SF")])
 
 (define_expand "sse_shufps"
   DONE;
 })
 
-(define_insn "*vec_setv4sf_0"
-  [(set (match_operand:V4SF 0 "nonimmediate_operand"  "=x,x,Y ,m")
+(define_insn "vec_setv4sf_0"
+  [(set (match_operand:V4SF 0 "nonimmediate_operand"  "=x,x,Y2,m")
        (vec_merge:V4SF
          (vec_duplicate:V4SF
            (match_operand:SF 2 "general_operand"     " x,m,*r,x*rfF"))
   [(set_attr "type" "ssemov")
    (set_attr "mode" "SF")])
 
+;; A subset is vec_setv4sf.
+(define_insn "*vec_setv4sf_sse4_1"
+  [(set (match_operand:V4SF 0 "register_operand" "=x")
+       (vec_merge:V4SF
+         (vec_duplicate:V4SF
+           (match_operand:SF 2 "nonimmediate_operand" "xm"))
+         (match_operand:V4SF 1 "register_operand" "0")
+         (match_operand:SI 3 "const_pow2_1_to_8_operand" "n")))]
+  "TARGET_SSE4_1"
+{
+  operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3])) << 4);
+  return "insertps\t{%3, %2, %0|%0, %2, %3}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V4SF")])
+
+(define_insn "sse4_1_insertps"
+  [(set (match_operand:V4SF 0 "register_operand" "=x")
+       (unspec:V4SF [(match_operand:V4SF 2 "register_operand" "x")
+                     (match_operand:V4SF 1 "register_operand" "0")
+                     (match_operand:SI 3 "const_0_to_255_operand" "n")]
+                    UNSPEC_INSERTPS))]
+  "TARGET_SSE4_1"
+  "insertps\t{%3, %2, %0|%0, %2, %3}";
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V4SF")])
+
 (define_split
   [(set (match_operand:V4SF 0 "memory_operand" "")
        (vec_merge:V4SF
   DONE;
 })
 
+(define_insn "*sse4_1_extractps"
+  [(set (match_operand:SF 0 "register_operand" "=rm")
+       (vec_select:SF
+         (match_operand:V4SF 1 "register_operand" "x")
+         (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n")])))]
+  "TARGET_SSE4_1"
+  "extractps\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V4SF")])
+
 (define_expand "vec_extractv4sf"
   [(match_operand:SF 0 "register_operand" "")
    (match_operand:V4SF 1 "register_operand" "")
 (define_insn "sse2_vmaddv2df3"
   [(set (match_operand:V2DF 0 "register_operand" "=x")
        (vec_merge:V2DF
-         (plus:V2DF (match_operand:V2DF 1 "nonimmediate_operand" "%0")
+         (plus:V2DF (match_operand:V2DF 1 "register_operand" "0")
                     (match_operand:V2DF 2 "nonimmediate_operand" "xm"))
          (match_dup 1)
          (const_int 1)))]
 (define_insn "sse2_vmmulv2df3"
   [(set (match_operand:V2DF 0 "register_operand" "=x")
        (vec_merge:V2DF
-         (mult:V2DF (match_operand:V2DF 1 "nonimmediate_operand" "%0")
+         (mult:V2DF (match_operand:V2DF 1 "register_operand" "0")
                     (match_operand:V2DF 2 "nonimmediate_operand" "xm"))
          (match_dup 1)
          (const_int 1)))]
   [(set_attr "type" "sseadd")
    (set_attr "mode" "V2DF")])
 
-(define_insn "*sse2_vmsmaxv2df3_finite"
-  [(set (match_operand:V2DF 0 "register_operand" "=x")
-       (vec_merge:V2DF
-         (smax:V2DF (match_operand:V2DF 1 "nonimmediate_operand" "%0")
-                    (match_operand:V2DF 2 "nonimmediate_operand" "xm"))
-         (match_dup 1)
-         (const_int 1)))]
-  "TARGET_SSE2 && flag_finite_math_only
-   && ix86_binary_operator_ok (SMAX, V2DFmode, operands)"
-  "maxsd\t{%2, %0|%0, %2}"
-  [(set_attr "type" "sseadd")
-   (set_attr "mode" "DF")])
-
 (define_insn "sse2_vmsmaxv2df3"
   [(set (match_operand:V2DF 0 "register_operand" "=x")
        (vec_merge:V2DF
   [(set_attr "type" "sseadd")
    (set_attr "mode" "V2DF")])
 
-(define_insn "*sse2_vmsminv2df3_finite"
-  [(set (match_operand:V2DF 0 "register_operand" "=x")
-       (vec_merge:V2DF
-         (smin:V2DF (match_operand:V2DF 1 "nonimmediate_operand" "%0")
-                    (match_operand:V2DF 2 "nonimmediate_operand" "xm"))
-         (match_dup 1)
-         (const_int 1)))]
-  "TARGET_SSE2 && flag_finite_math_only
-   && ix86_binary_operator_ok (SMIN, V2DFmode, operands)"
-  "minsd\t{%2, %0|%0, %2}"
-  [(set_attr "type" "sseadd")
-   (set_attr "mode" "DF")])
-
 (define_insn "sse2_vmsminv2df3"
   [(set (match_operand:V2DF 0 "register_operand" "=x")
        (vec_merge:V2DF
   "cvtpd2pi\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssecvt")
    (set_attr "unit" "mmx")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "DI")])
 
 (define_insn "sse2_cvttpd2pi"
   "cvttpd2pi\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssecvt")
    (set_attr "unit" "mmx")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_cvtsi2sd"
   "cvtsi2sd\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseicvt")
    (set_attr "mode" "DF")
-   (set_attr "athlon_decode" "double,direct")])
+   (set_attr "athlon_decode" "double,direct")
+   (set_attr "amdfam10_decode" "vector,double")])
 
 (define_insn "sse2_cvtsi2sdq"
   [(set (match_operand:V2DF 0 "register_operand" "=x,x")
   "cvtsi2sdq\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseicvt")
    (set_attr "mode" "DF")
-   (set_attr "athlon_decode" "double,direct")])
+   (set_attr "athlon_decode" "double,direct")
+   (set_attr "amdfam10_decode" "vector,double")])
 
 (define_insn "sse2_cvtsd2si"
   [(set (match_operand:SI 0 "register_operand" "=r,r")
   "cvtsd2si\t{%1, %0|%0, %1}"
   [(set_attr "type" "sseicvt")
    (set_attr "athlon_decode" "double,vector")
+   (set_attr "prefix_rep" "1")
    (set_attr "mode" "SI")])
 
 (define_insn "sse2_cvtsd2si_2"
   "cvtsd2si\t{%1, %0|%0, %1}"
   [(set_attr "type" "sseicvt")
    (set_attr "athlon_decode" "double,vector")
+   (set_attr "amdfam10_decode" "double,double")
+   (set_attr "prefix_rep" "1")
    (set_attr "mode" "SI")])
 
 (define_insn "sse2_cvtsd2siq"
   "cvtsd2siq\t{%1, %0|%0, %1}"
   [(set_attr "type" "sseicvt")
    (set_attr "athlon_decode" "double,vector")
+   (set_attr "prefix_rep" "1")
    (set_attr "mode" "DI")])
 
 (define_insn "sse2_cvtsd2siq_2"
   "cvtsd2siq\t{%1, %0|%0, %1}"
   [(set_attr "type" "sseicvt")
    (set_attr "athlon_decode" "double,vector")
+   (set_attr "amdfam10_decode" "double,double")
+   (set_attr "prefix_rep" "1")
    (set_attr "mode" "DI")])
 
 (define_insn "sse2_cvttsd2si"
   "TARGET_SSE2"
   "cvttsd2si\t{%1, %0|%0, %1}"
   [(set_attr "type" "sseicvt")
+   (set_attr "prefix_rep" "1")
    (set_attr "mode" "SI")
-   (set_attr "athlon_decode" "double,vector")])
+   (set_attr "athlon_decode" "double,vector")
+   (set_attr "amdfam10_decode" "double,double")])
 
 (define_insn "sse2_cvttsd2siq"
   [(set (match_operand:DI 0 "register_operand" "=r,r")
   "TARGET_SSE2 && TARGET_64BIT"
   "cvttsd2siq\t{%1, %0|%0, %1}"
   [(set_attr "type" "sseicvt")
+   (set_attr "prefix_rep" "1")
    (set_attr "mode" "DI")
-   (set_attr "athlon_decode" "double,vector")])
+   (set_attr "athlon_decode" "double,vector")
+   (set_attr "amdfam10_decode" "double,double")])
 
 (define_insn "sse2_cvtdq2pd"
   [(set (match_operand:V2DF 0 "register_operand" "=x")
   "TARGET_SSE2"
   "cvtpd2dq\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssecvt")
-   (set_attr "mode" "TI")])
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "TI")
+   (set_attr "amdfam10_decode" "double")])
 
 (define_expand "sse2_cvttpd2dq"
   [(set (match_operand:V4SI 0 "register_operand" "")
   "TARGET_SSE2"
   "cvttpd2dq\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssecvt")
-   (set_attr "mode" "TI")])
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "TI")
+   (set_attr "amdfam10_decode" "double")])
 
 (define_insn "sse2_cvtsd2ss"
   [(set (match_operand:V4SF 0 "register_operand" "=x,x")
   "cvtsd2ss\t{%2, %0|%0, %2}"
   [(set_attr "type" "ssecvt")
    (set_attr "athlon_decode" "vector,double")
+   (set_attr "amdfam10_decode" "vector,double")
    (set_attr "mode" "SF")])
 
 (define_insn "sse2_cvtss2sd"
-  [(set (match_operand:V2DF 0 "register_operand" "=x")
+  [(set (match_operand:V2DF 0 "register_operand" "=x,x")
        (vec_merge:V2DF
          (float_extend:V2DF
            (vec_select:V2SF
-             (match_operand:V4SF 2 "nonimmediate_operand" "xm")
+             (match_operand:V4SF 2 "nonimmediate_operand" "x,m")
              (parallel [(const_int 0) (const_int 1)])))
-         (match_operand:V2DF 1 "register_operand" "0")
+         (match_operand:V2DF 1 "register_operand" "0,0")
          (const_int 1)))]
   "TARGET_SSE2"
   "cvtss2sd\t{%2, %0|%0, %2}"
   [(set_attr "type" "ssecvt")
+   (set_attr "amdfam10_decode" "vector,double")
    (set_attr "mode" "DF")])
 
 (define_expand "sse2_cvtpd2ps"
   "TARGET_SSE2"
   "cvtpd2ps\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssecvt")
-   (set_attr "mode" "V4SF")])
+   (set_attr "prefix_data16" "1")
+   (set_attr "mode" "V4SF")
+   (set_attr "amdfam10_decode" "double")])
 
 (define_insn "sse2_cvtps2pd"
   [(set (match_operand:V2DF 0 "register_operand" "=x")
   "TARGET_SSE2"
   "cvtps2pd\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssecvt")
-   (set_attr "mode" "V2DF")])
+   (set_attr "mode" "V2DF")
+   (set_attr "amdfam10_decode" "direct")])
+
+(define_expand "vec_unpacks_hi_v4sf"
+  [(set (match_dup 2)
+   (vec_select:V4SF
+     (vec_concat:V8SF
+       (match_dup 2)
+       (match_operand:V4SF 1 "nonimmediate_operand" ""))
+     (parallel [(const_int 6)
+               (const_int 7)
+               (const_int 2)
+               (const_int 3)])))
+  (set (match_operand:V2DF 0 "register_operand" "")
+   (float_extend:V2DF
+     (vec_select:V2SF
+       (match_dup 2)
+       (parallel [(const_int 0) (const_int 1)]))))]
+ "TARGET_SSE2"
+{
+ operands[2] = gen_reg_rtx (V4SFmode);
+})
+
+(define_expand "vec_unpacks_lo_v4sf"
+  [(set (match_operand:V2DF 0 "register_operand" "")
+       (float_extend:V2DF
+         (vec_select:V2SF
+           (match_operand:V4SF 1 "nonimmediate_operand" "")
+           (parallel [(const_int 0) (const_int 1)]))))]
+  "TARGET_SSE2")
+
+(define_expand "vec_unpacks_float_hi_v8hi"
+  [(match_operand:V4SF 0 "register_operand" "")
+   (match_operand:V8HI 1 "register_operand" "")]
+  "TARGET_SSE2"
+{
+  rtx tmp = gen_reg_rtx (V4SImode);
+
+  emit_insn (gen_vec_unpacks_hi_v8hi (tmp, operands[1]));
+  emit_insn (gen_sse2_cvtdq2ps (operands[0], tmp));
+  DONE;
+})
+
+(define_expand "vec_unpacks_float_lo_v8hi"
+  [(match_operand:V4SF 0 "register_operand" "")
+   (match_operand:V8HI 1 "register_operand" "")]
+  "TARGET_SSE2"
+{
+  rtx tmp = gen_reg_rtx (V4SImode);
+
+  emit_insn (gen_vec_unpacks_lo_v8hi (tmp, operands[1]));
+  emit_insn (gen_sse2_cvtdq2ps (operands[0], tmp));
+  DONE;
+})
+
+(define_expand "vec_unpacku_float_hi_v8hi"
+  [(match_operand:V4SF 0 "register_operand" "")
+   (match_operand:V8HI 1 "register_operand" "")]
+  "TARGET_SSE2"
+{
+  rtx tmp = gen_reg_rtx (V4SImode);
+
+  emit_insn (gen_vec_unpacku_hi_v8hi (tmp, operands[1]));
+  emit_insn (gen_sse2_cvtdq2ps (operands[0], tmp));
+  DONE;
+})
+
+(define_expand "vec_unpacku_float_lo_v8hi"
+  [(match_operand:V4SF 0 "register_operand" "")
+   (match_operand:V8HI 1 "register_operand" "")]
+  "TARGET_SSE2"
+{
+  rtx tmp = gen_reg_rtx (V4SImode);
+
+  emit_insn (gen_vec_unpacku_lo_v8hi (tmp, operands[1]));
+  emit_insn (gen_sse2_cvtdq2ps (operands[0], tmp));
+  DONE;
+})
+
+(define_expand "vec_unpacks_float_hi_v4si"
+  [(set (match_dup 2)
+       (vec_select:V4SI
+         (match_operand:V4SI 1 "nonimmediate_operand" "")
+         (parallel [(const_int 2)
+                    (const_int 3)
+                    (const_int 2)
+                    (const_int 3)])))
+   (set (match_operand:V2DF 0 "register_operand" "")
+        (float:V2DF
+         (vec_select:V2SI
+         (match_dup 2)
+           (parallel [(const_int 0) (const_int 1)]))))]
+ "TARGET_SSE2"
+{
+ operands[2] = gen_reg_rtx (V4SImode);
+})
+
+(define_expand "vec_unpacks_float_lo_v4si"
+  [(set (match_operand:V2DF 0 "register_operand" "")
+       (float:V2DF
+         (vec_select:V2SI
+           (match_operand:V4SI 1 "nonimmediate_operand" "")
+           (parallel [(const_int 0) (const_int 1)]))))]
+  "TARGET_SSE2")
+
+(define_expand "vec_pack_trunc_v2df"
+  [(match_operand:V4SF 0 "register_operand" "")
+   (match_operand:V2DF 1 "nonimmediate_operand" "")
+   (match_operand:V2DF 2 "nonimmediate_operand" "")]
+  "TARGET_SSE2"
+{
+  rtx r1, r2;
+
+  r1 = gen_reg_rtx (V4SFmode);
+  r2 = gen_reg_rtx (V4SFmode);
+
+  emit_insn (gen_sse2_cvtpd2ps (r1, operands[1]));
+  emit_insn (gen_sse2_cvtpd2ps (r2, operands[2]));
+  emit_insn (gen_sse_movlhps (operands[0], r1, r2));
+  DONE;
+})
+
+(define_expand "vec_pack_sfix_trunc_v2df"
+  [(match_operand:V4SI 0 "register_operand" "")
+   (match_operand:V2DF 1 "nonimmediate_operand" "")
+   (match_operand:V2DF 2 "nonimmediate_operand" "")]
+  "TARGET_SSE2"
+{
+  rtx r1, r2;
+
+  r1 = gen_reg_rtx (V4SImode);
+  r2 = gen_reg_rtx (V4SImode);
+
+  emit_insn (gen_sse2_cvttpd2dq (r1, operands[1]));
+  emit_insn (gen_sse2_cvttpd2dq (r2, operands[2]));
+  emit_insn (gen_sse2_punpcklqdq (gen_lowpart (V2DImode, operands[0]),
+                                 gen_lowpart (V2DImode, r1),
+                                 gen_lowpart (V2DImode, r2)));
+  DONE;
+})
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
   "TARGET_SSE2"
   "unpcklpd\t%0, %0"
   [(set_attr "type" "sselog1")
-   (set_attr "mode" "V4SF")])
+   (set_attr "mode" "V2DF")])
 
 (define_insn "*vec_concatv2df_sse3"
   [(set (match_operand:V2DF 0 "register_operand" "=x")
    (set_attr "mode" "DF")])
 
 (define_insn "*vec_concatv2df"
-  [(set (match_operand:V2DF 0 "register_operand"     "=Y,Y,Y,x,x")
+  [(set (match_operand:V2DF 0 "register_operand"     "=Y2,Y2,Y2,x,x")
        (vec_concat:V2DF
-         (match_operand:DF 1 "nonimmediate_operand" " 0,0,m,0,0")
-         (match_operand:DF 2 "vector_move_operand"  " Y,m,C,x,m")))]
+         (match_operand:DF 1 "nonimmediate_operand" " 0 ,0 ,m ,0,0")
+         (match_operand:DF 2 "vector_move_operand"  " Y2,m ,C ,x,m")))]
   "TARGET_SSE"
   "@
    unpcklpd\t{%2, %0|%0, %2}
   "TARGET_SSE2 && ix86_binary_operator_ok (PLUS, <MODE>mode, operands)"
   "padd<ssevecsize>\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_ssadd<mode>3"
   "TARGET_SSE2 && ix86_binary_operator_ok (SS_PLUS, <MODE>mode, operands)"
   "padds<ssevecsize>\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_usadd<mode>3"
   "TARGET_SSE2 && ix86_binary_operator_ok (US_PLUS, <MODE>mode, operands)"
   "paddus<ssevecsize>\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_expand "sub<mode>3"
   "TARGET_SSE2"
   "psub<ssevecsize>\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_sssub<mode>3"
   "TARGET_SSE2"
   "psubs<ssevecsize>\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_ussub<mode>3"
   "TARGET_SSE2"
   "psubus<ssevecsize>\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_expand "mulv16qi3"
   "TARGET_SSE2 && ix86_binary_operator_ok (MULT, V8HImode, operands)"
   "pmullw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseimul")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_expand "smulv8hi3_highpart"
   "TARGET_SSE2 && ix86_binary_operator_ok (MULT, V8HImode, operands)"
   "pmulhw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseimul")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_expand "umulv8hi3_highpart"
   "TARGET_SSE2 && ix86_binary_operator_ok (MULT, V8HImode, operands)"
   "pmulhuw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseimul")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_umulv2siv2di3"
            (vec_select:V2SI
              (match_operand:V4SI 2 "nonimmediate_operand" "xm")
              (parallel [(const_int 0) (const_int 2)])))))]
-  "TARGET_SSE2 && ix86_binary_operator_ok (MULT, V8HImode, operands)"
+  "TARGET_SSE2 && ix86_binary_operator_ok (MULT, V4SImode, operands)"
   "pmuludq\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseimul")
+   (set_attr "prefix_data16" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_mulv2siv2di3"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+       (mult:V2DI
+         (sign_extend:V2DI
+           (vec_select:V2SI
+             (match_operand:V4SI 1 "nonimmediate_operand" "%0")
+             (parallel [(const_int 0) (const_int 2)])))
+         (sign_extend:V2DI
+           (vec_select:V2SI
+             (match_operand:V4SI 2 "nonimmediate_operand" "xm")
+             (parallel [(const_int 0) (const_int 2)])))))]
+  "TARGET_SSE4_1 && ix86_binary_operator_ok (MULT, V4SImode, operands)"
+  "pmuldq\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sseimul")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_pmaddwd"
                           (const_int 3)
                           (const_int 5)
                           (const_int 7)]))))))]
-  "TARGET_SSE2"
+  "TARGET_SSE2 && ix86_binary_operator_ok (MULT, V8HImode, operands)"
   "pmaddwd\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_expand "mulv4si3"
                   (match_operand:V4SI 2 "register_operand" "")))]
   "TARGET_SSE2"
 {
-  rtx t1, t2, t3, t4, t5, t6, thirtytwo;
-  rtx op0, op1, op2;
-
-  op0 = operands[0];
-  op1 = operands[1];
-  op2 = operands[2];
-  t1 = gen_reg_rtx (V4SImode);
-  t2 = gen_reg_rtx (V4SImode);
-  t3 = gen_reg_rtx (V4SImode);
-  t4 = gen_reg_rtx (V4SImode);
-  t5 = gen_reg_rtx (V4SImode);
-  t6 = gen_reg_rtx (V4SImode);
-  thirtytwo = GEN_INT (32);
-
-  /* Multiply elements 2 and 0.  */
-  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t1), op1, op2));
-
-  /* Shift both input vectors down one element, so that elements 3 and 1
-     are now in the slots for elements 2 and 0.  For K8, at least, this is
-     faster than using a shuffle.  */
-  emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t2),
-                              gen_lowpart (TImode, op1), thirtytwo));
-  emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t3),
-                              gen_lowpart (TImode, op2), thirtytwo));
-
-  /* Multiply elements 3 and 1.  */
-  emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t4), t2, t3));
-
-  /* Move the results in element 2 down to element 1; we don't care what
-     goes in elements 2 and 3.  */
-  emit_insn (gen_sse2_pshufd_1 (t5, t1, const0_rtx, const2_rtx,
-                               const0_rtx, const0_rtx));
-  emit_insn (gen_sse2_pshufd_1 (t6, t4, const0_rtx, const2_rtx,
+  if (TARGET_SSE4_1)
+    ix86_fixup_binary_operands_no_copy (MULT, V4SImode, operands);
+ else
+   {
+     rtx t1, t2, t3, t4, t5, t6, thirtytwo;
+     rtx op0, op1, op2;
+
+     op0 = operands[0];
+     op1 = operands[1];
+     op2 = operands[2];
+     t1 = gen_reg_rtx (V4SImode);
+     t2 = gen_reg_rtx (V4SImode);
+     t3 = gen_reg_rtx (V4SImode);
+     t4 = gen_reg_rtx (V4SImode);
+     t5 = gen_reg_rtx (V4SImode);
+     t6 = gen_reg_rtx (V4SImode);
+     thirtytwo = GEN_INT (32);
+
+     /* Multiply elements 2 and 0.  */
+     emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t1),
+                                       op1, op2));
+
+     /* Shift both input vectors down one element, so that elements 3
+       and 1 are now in the slots for elements 2 and 0.  For K8, at
+       least, this is faster than using a shuffle.  */
+     emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t2),
+                                 gen_lowpart (TImode, op1),
+                                 thirtytwo));
+     emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, t3),
+                                 gen_lowpart (TImode, op2),
+                                 thirtytwo)); 
+     /* Multiply elements 3 and 1.  */
+     emit_insn (gen_sse2_umulv2siv2di3 (gen_lowpart (V2DImode, t4),
+                                       t2, t3));
+
+     /* Move the results in element 2 down to element 1; we don't care
+       what goes in elements 2 and 3.  */
+     emit_insn (gen_sse2_pshufd_1 (t5, t1, const0_rtx, const2_rtx,
                                const0_rtx, const0_rtx));
+     emit_insn (gen_sse2_pshufd_1 (t6, t4, const0_rtx, const2_rtx,
+                                  const0_rtx, const0_rtx));
 
-  /* Merge the parts back together.  */
-  emit_insn (gen_sse2_punpckldq (op0, t5, t6));
-  DONE;
+    /* Merge the parts back together.  */
+     emit_insn (gen_sse2_punpckldq (op0, t5, t6));
+     DONE;
+   }
 })
 
+(define_insn "*sse4_1_mulv4si3"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+       (mult:V4SI (match_operand:V4SI 1 "nonimmediate_operand" "%0")
+                  (match_operand:V4SI 2 "nonimmediate_operand" "xm")))]
+  "TARGET_SSE4_1 && ix86_binary_operator_ok (MULT, V4SImode, operands)"
+  "pmulld\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sseimul")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
 (define_expand "mulv2di3"
   [(set (match_operand:V2DI 0 "register_operand" "")
        (mult:V2DI (match_operand:V2DI 1 "register_operand" "")
 
 (define_expand "sdot_prodv8hi"
   [(match_operand:V4SI 0 "register_operand" "")
-   (match_operand:V8HI 1 "nonimmediate_operand" "")
-   (match_operand:V8HI 2 "nonimmediate_operand" "")
+   (match_operand:V8HI 1 "register_operand" "")
+   (match_operand:V8HI 2 "register_operand" "")
    (match_operand:V4SI 3 "register_operand" "")]
   "TARGET_SSE2"
 {
   [(set (match_operand:SSEMODE24 0 "register_operand" "=x")
        (ashiftrt:SSEMODE24
          (match_operand:SSEMODE24 1 "register_operand" "0")
-         (match_operand:SI 2 "nonmemory_operand" "xi")))]
+         (match_operand:TI 2 "nonmemory_operand" "xn")))]
   "TARGET_SSE2"
   "psra<ssevecsize>\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseishft")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "lshr<mode>3"
   [(set (match_operand:SSEMODE248 0 "register_operand" "=x")
        (lshiftrt:SSEMODE248
          (match_operand:SSEMODE248 1 "register_operand" "0")
-         (match_operand:SI 2 "nonmemory_operand" "xi")))]
+         (match_operand:TI 2 "nonmemory_operand" "xn")))]
   "TARGET_SSE2"
   "psrl<ssevecsize>\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseishft")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "ashl<mode>3"
   [(set (match_operand:SSEMODE248 0 "register_operand" "=x")
        (ashift:SSEMODE248
          (match_operand:SSEMODE248 1 "register_operand" "0")
-         (match_operand:SI 2 "nonmemory_operand" "xi")))]
+         (match_operand:TI 2 "nonmemory_operand" "xn")))]
   "TARGET_SSE2"
   "psll<ssevecsize>\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseishft")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_ashlti3"
   return "pslldq\t{%2, %0|%0, %2}";
 }
   [(set_attr "type" "sseishft")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_expand "vec_shl_<mode>"
   return "psrldq\t{%2, %0|%0, %2}";
 }
   [(set_attr "type" "sseishft")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_expand "vec_shr_<mode>"
   "TARGET_SSE2 && ix86_binary_operator_ok (UMAX, V16QImode, operands)"
   "pmaxub\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_expand "smaxv8hi3"
   "TARGET_SSE2 && ix86_binary_operator_ok (SMAX, V8HImode, operands)"
   "pmaxsw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_expand "umaxv8hi3"
-  [(set (match_operand:V8HI 0 "register_operand" "=x")
-       (us_minus:V8HI (match_operand:V8HI 1 "register_operand" "0")
-                      (match_operand:V8HI 2 "nonimmediate_operand" "xm")))
-   (set (match_dup 3)
-       (plus:V8HI (match_dup 0) (match_dup 2)))]
+  [(set (match_operand:V8HI 0 "register_operand" "")
+       (umax:V8HI (match_operand:V8HI 1 "register_operand" "")
+                  (match_operand:V8HI 2 "nonimmediate_operand" "")))]
   "TARGET_SSE2"
 {
-  operands[3] = operands[0];
-  if (rtx_equal_p (operands[0], operands[2]))
-    operands[0] = gen_reg_rtx (V8HImode);
+  if (TARGET_SSE4_1)
+    ix86_fixup_binary_operands_no_copy (UMAX, V8HImode, operands);
+  else
+    {
+      rtx op0 = operands[0], op2 = operands[2], op3 = op0;
+      if (rtx_equal_p (op3, op2))
+       op3 = gen_reg_rtx (V8HImode);
+      emit_insn (gen_sse2_ussubv8hi3 (op3, operands[1], op2));
+      emit_insn (gen_addv8hi3 (op0, op3, op2));
+      DONE;
+    }
 })
 
 (define_expand "smax<mode>3"
                        (match_operand:SSEMODE14 2 "register_operand" "")))]
   "TARGET_SSE2"
 {
-  rtx xops[6];
-  bool ok;
-
-  xops[0] = operands[0];
-  xops[1] = operands[1];
-  xops[2] = operands[2];
-  xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]);
-  xops[4] = operands[1];
-  xops[5] = operands[2];
-  ok = ix86_expand_int_vcond (xops);
-  gcc_assert (ok);
-  DONE;
+  if (TARGET_SSE4_1)
+    ix86_fixup_binary_operands_no_copy (SMAX, <MODE>mode, operands);
+  else
+  {
+    rtx xops[6];
+    bool ok;
+
+    xops[0] = operands[0];
+    xops[1] = operands[1];
+    xops[2] = operands[2];
+    xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]);
+    xops[4] = operands[1];
+    xops[5] = operands[2];
+    ok = ix86_expand_int_vcond (xops);
+    gcc_assert (ok);
+    DONE;
+  }
 })
 
+(define_insn "*sse4_1_smax<mode>3"
+  [(set (match_operand:SSEMODE14 0 "register_operand" "=x")
+       (smax:SSEMODE14
+         (match_operand:SSEMODE14 1 "nonimmediate_operand" "%0")
+         (match_operand:SSEMODE14 2 "nonimmediate_operand" "xm")))]
+  "TARGET_SSE4_1 && ix86_binary_operator_ok (SMAX, <MODE>mode, operands)"
+  "pmaxs<ssevecsize>\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
 (define_expand "umaxv4si3"
   [(set (match_operand:V4SI 0 "register_operand" "")
        (umax:V4SI (match_operand:V4SI 1 "register_operand" "")
                   (match_operand:V4SI 2 "register_operand" "")))]
   "TARGET_SSE2"
 {
-  rtx xops[6];
-  bool ok;
-
-  xops[0] = operands[0];
-  xops[1] = operands[1];
-  xops[2] = operands[2];
-  xops[3] = gen_rtx_GTU (VOIDmode, operands[1], operands[2]);
-  xops[4] = operands[1];
-  xops[5] = operands[2];
-  ok = ix86_expand_int_vcond (xops);
-  gcc_assert (ok);
-  DONE;
+  if (TARGET_SSE4_1)
+    ix86_fixup_binary_operands_no_copy (UMAX, V4SImode, operands);
+  else
+  {
+    rtx xops[6];
+    bool ok;
+
+    xops[0] = operands[0];
+    xops[1] = operands[1];
+    xops[2] = operands[2];
+    xops[3] = gen_rtx_GTU (VOIDmode, operands[1], operands[2]);
+    xops[4] = operands[1];
+    xops[5] = operands[2];
+    ok = ix86_expand_int_vcond (xops);
+    gcc_assert (ok);
+    DONE;
+  }
 })
 
+(define_insn "*sse4_1_umax<mode>3"
+  [(set (match_operand:SSEMODE24 0 "register_operand" "=x")
+       (umax:SSEMODE24
+         (match_operand:SSEMODE24 1 "nonimmediate_operand" "%0")
+         (match_operand:SSEMODE24 2 "nonimmediate_operand" "xm")))]
+  "TARGET_SSE4_1 && ix86_binary_operator_ok (UMAX, <MODE>mode, operands)"
+  "pmaxu<ssevecsize>\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
 (define_expand "uminv16qi3"
   [(set (match_operand:V16QI 0 "register_operand" "")
        (umin:V16QI (match_operand:V16QI 1 "nonimmediate_operand" "")
                    (match_operand:V16QI 2 "nonimmediate_operand" "")))]
   "TARGET_SSE2"
-  "ix86_fixup_binary_operands_no_copy (UMAX, V16QImode, operands);")
+  "ix86_fixup_binary_operands_no_copy (UMIN, V16QImode, operands);")
 
 (define_insn "*uminv16qi3"
   [(set (match_operand:V16QI 0 "register_operand" "=x")
   "TARGET_SSE2 && ix86_binary_operator_ok (UMIN, V16QImode, operands)"
   "pminub\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_expand "sminv8hi3"
   "TARGET_SSE2 && ix86_binary_operator_ok (SMIN, V8HImode, operands)"
   "pminsw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_expand "smin<mode>3"
                        (match_operand:SSEMODE14 2 "register_operand" "")))]
   "TARGET_SSE2"
 {
-  rtx xops[6];
-  bool ok;
-
-  xops[0] = operands[0];
-  xops[1] = operands[2];
-  xops[2] = operands[1];
-  xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]);
-  xops[4] = operands[1];
-  xops[5] = operands[2];
-  ok = ix86_expand_int_vcond (xops);
-  gcc_assert (ok);
-  DONE;
+  if (TARGET_SSE4_1)
+    ix86_fixup_binary_operands_no_copy (SMIN, <MODE>mode, operands);
+  else
+    {
+      rtx xops[6];
+      bool ok;
+
+      xops[0] = operands[0];
+      xops[1] = operands[2];
+      xops[2] = operands[1];
+      xops[3] = gen_rtx_GT (VOIDmode, operands[1], operands[2]);
+      xops[4] = operands[1];
+      xops[5] = operands[2];
+      ok = ix86_expand_int_vcond (xops);
+      gcc_assert (ok);
+      DONE;
+    }
 })
 
+(define_insn "*sse4_1_smin<mode>3"
+  [(set (match_operand:SSEMODE14 0 "register_operand" "=x")
+       (smin:SSEMODE14
+         (match_operand:SSEMODE14 1 "nonimmediate_operand" "%0")
+         (match_operand:SSEMODE14 2 "nonimmediate_operand" "xm")))]
+  "TARGET_SSE4_1 && ix86_binary_operator_ok (SMIN, <MODE>mode, operands)"
+  "pmins<ssevecsize>\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
 (define_expand "umin<mode>3"
   [(set (match_operand:SSEMODE24 0 "register_operand" "")
        (umin:SSEMODE24 (match_operand:SSEMODE24 1 "register_operand" "")
                        (match_operand:SSEMODE24 2 "register_operand" "")))]
   "TARGET_SSE2"
 {
-  rtx xops[6];
-  bool ok;
-
-  xops[0] = operands[0];
-  xops[1] = operands[2];
-  xops[2] = operands[1];
-  xops[3] = gen_rtx_GTU (VOIDmode, operands[1], operands[2]);
-  xops[4] = operands[1];
-  xops[5] = operands[2];
-  ok = ix86_expand_int_vcond (xops);
-  gcc_assert (ok);
-  DONE;
+  if (TARGET_SSE4_1)
+    ix86_fixup_binary_operands_no_copy (UMIN, <MODE>mode, operands);
+  else
+    {
+      rtx xops[6];
+      bool ok;
+
+      xops[0] = operands[0];
+      xops[1] = operands[2];
+      xops[2] = operands[1];
+      xops[3] = gen_rtx_GTU (VOIDmode, operands[1], operands[2]);
+      xops[4] = operands[1];
+      xops[5] = operands[2];
+      ok = ix86_expand_int_vcond (xops);
+      gcc_assert (ok);
+      DONE;
+    }
 })
 
+(define_insn "*sse4_1_umin<mode>3"
+  [(set (match_operand:SSEMODE24 0 "register_operand" "=x")
+       (umin:SSEMODE24
+         (match_operand:SSEMODE24 1 "nonimmediate_operand" "%0")
+         (match_operand:SSEMODE24 2 "nonimmediate_operand" "xm")))]
+  "TARGET_SSE4_1 && ix86_binary_operator_ok (UMIN, <MODE>mode, operands)"
+  "pminu<ssevecsize>\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Parallel integral comparisons
   "TARGET_SSE2 && ix86_binary_operator_ok (EQ, <MODE>mode, operands)"
   "pcmpeq<ssevecsize>\t{%2, %0|%0, %2}"
   [(set_attr "type" "ssecmp")
+   (set_attr "prefix_data16" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_eqv2di3"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+       (eq:V2DI
+         (match_operand:V2DI 1 "nonimmediate_operand" "%0")
+         (match_operand:V2DI 2 "nonimmediate_operand" "xm")))]
+  "TARGET_SSE4_1 && ix86_binary_operator_ok (EQ, V2DImode, operands)"
+  "pcmpeqq\t{%2, %0|%0, %2}"
+  [(set_attr "type" "ssecmp")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_gt<mode>3"
   "TARGET_SSE2"
   "pcmpgt<ssevecsize>\t{%2, %0|%0, %2}"
   [(set_attr "type" "ssecmp")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_expand "vcond<mode>"
   "TARGET_SSE2 && ix86_binary_operator_ok (AND, <MODE>mode, operands)"
   "pand\t{%2, %0|%0, %2}"
   [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_nand<mode>3"
   "TARGET_SSE2"
   "pandn\t{%2, %0|%0, %2}"
   [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_expand "ior<mode>3"
   "TARGET_SSE2 && ix86_binary_operator_ok (IOR, <MODE>mode, operands)"
   "por\t{%2, %0|%0, %2}"
   [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_expand "xor<mode>3"
   "TARGET_SSE2 && ix86_binary_operator_ok (XOR, <MODE>mode, operands)"
   "pxor\t{%2, %0|%0, %2}"
   [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;       h3 = aeimquy2bfjnrvz3
 ;;       l3 = cgkosw04dhlptx15
 ;;   result = bdfhjlnprtvxz135
-(define_expand "vec_pack_mod_v8hi"
+(define_expand "vec_pack_trunc_v8hi"
   [(match_operand:V16QI 0 "register_operand" "")
    (match_operand:V8HI 1 "register_operand" "")
    (match_operand:V8HI 2 "register_operand" "")]
   "TARGET_SSE2"
 {
   rtx op1, op2, h1, l1, h2, l2, h3, l3;
-                                                                                
+
   op1 = gen_lowpart (V16QImode, operands[1]);
   op2 = gen_lowpart (V16QImode, operands[2]);
   h1 = gen_reg_rtx (V16QImode);
   l2 = gen_reg_rtx (V16QImode);
   h3 = gen_reg_rtx (V16QImode);
   l3 = gen_reg_rtx (V16QImode);
-                                                                                
+
   emit_insn (gen_vec_interleave_highv16qi (h1, op1, op2));
   emit_insn (gen_vec_interleave_lowv16qi (l1, op1, op2));
   emit_insn (gen_vec_interleave_highv16qi (h2, l1, h1));
   emit_insn (gen_vec_interleave_lowv16qi (operands[0], l3, h3));
   DONE;
 })
-                                                                                
+
 ;; Reduce:
 ;;      op1 = abcdefgh
 ;;      op2 = ijklmnop
 ;;       h2 = aeimbfjn
 ;;       l2 = cgkodhlp
 ;;   result = bdfhjlnp
-(define_expand "vec_pack_mod_v4si"
+(define_expand "vec_pack_trunc_v4si"
   [(match_operand:V8HI 0 "register_operand" "")
    (match_operand:V4SI 1 "register_operand" "")
    (match_operand:V4SI 2 "register_operand" "")]
   "TARGET_SSE2"
 {
   rtx op1, op2, h1, l1, h2, l2;
-                                                                                
+
   op1 = gen_lowpart (V8HImode, operands[1]);
   op2 = gen_lowpart (V8HImode, operands[2]);
   h1 = gen_reg_rtx (V8HImode);
   l1 = gen_reg_rtx (V8HImode);
   h2 = gen_reg_rtx (V8HImode);
   l2 = gen_reg_rtx (V8HImode);
-                                                                                
+
   emit_insn (gen_vec_interleave_highv8hi (h1, op1, op2));
   emit_insn (gen_vec_interleave_lowv8hi (l1, op1, op2));
   emit_insn (gen_vec_interleave_highv8hi (h2, l1, h1));
   emit_insn (gen_vec_interleave_lowv8hi (operands[0], l2, h2));
   DONE;
 })
-                                                                                
+
 ;; Reduce:
 ;;     op1 = abcd
 ;;     op2 = efgh
 ;;      h1 = aebf
 ;;      l1 = cgdh
 ;;  result = bdfh
-(define_expand "vec_pack_mod_v2di"
+(define_expand "vec_pack_trunc_v2di"
   [(match_operand:V4SI 0 "register_operand" "")
    (match_operand:V2DI 1 "register_operand" "")
    (match_operand:V2DI 2 "register_operand" "")]
   "TARGET_SSE2"
 {
   rtx op1, op2, h1, l1;
-                                                                                
+
   op1 = gen_lowpart (V4SImode, operands[1]);
   op2 = gen_lowpart (V4SImode, operands[2]);
   h1 = gen_reg_rtx (V4SImode);
   l1 = gen_reg_rtx (V4SImode);
-                                                                                
+
   emit_insn (gen_vec_interleave_highv4si (h1, op1, op2));
   emit_insn (gen_vec_interleave_lowv4si (l1, op1, op2));
   emit_insn (gen_vec_interleave_lowv4si (operands[0], l1, h1));
   "TARGET_SSE2"
   "packsswb\t{%2, %0|%0, %2}"
   [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_packssdw"
   "TARGET_SSE2"
   "packssdw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_packuswb"
   "TARGET_SSE2"
   "packuswb\t{%2, %0|%0, %2}"
   [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_punpckhbw"
   "TARGET_SSE2"
   "punpckhbw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_punpcklbw"
   "TARGET_SSE2"
   "punpcklbw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_punpckhwd"
   "TARGET_SSE2"
   "punpckhwd\t{%2, %0|%0, %2}"
   [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_punpcklwd"
   "TARGET_SSE2"
   "punpcklwd\t{%2, %0|%0, %2}"
   [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_punpckhdq"
   "TARGET_SSE2"
   "punpckhdq\t{%2, %0|%0, %2}"
   [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_punpckldq"
   "TARGET_SSE2"
   "punpckldq\t{%2, %0|%0, %2}"
   [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_punpckhqdq"
   "TARGET_SSE2"
   "punpckhqdq\t{%2, %0|%0, %2}"
   [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_punpcklqdq"
   "TARGET_SSE2"
   "punpcklqdq\t{%2, %0|%0, %2}"
   [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
-(define_expand "sse2_pinsrw"
-  [(set (match_operand:V8HI 0 "register_operand" "")
-       (vec_merge:V8HI
-         (vec_duplicate:V8HI
-           (match_operand:SI 2 "nonimmediate_operand" ""))
-         (match_operand:V8HI 1 "register_operand" "")
-         (match_operand:SI 3 "const_0_to_7_operand" "")))]
-  "TARGET_SSE2"
+(define_insn "*sse4_1_pinsrb"
+  [(set (match_operand:V16QI 0 "register_operand" "=x")
+       (vec_merge:V16QI
+         (vec_duplicate:V16QI
+           (match_operand:QI 2 "nonimmediate_operand" "rm"))
+         (match_operand:V16QI 1 "register_operand" "0")
+         (match_operand:SI 3 "const_pow2_1_to_32768_operand" "n")))]
+  "TARGET_SSE4_1"
 {
-  operands[2] = gen_lowpart (HImode, operands[2]);
-  operands[3] = GEN_INT ((1 << INTVAL (operands[3])));
-})
-
-(define_insn "*sse2_pinsrw"
+  operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3])));
+  return "pinsrb\t{%3, %k2, %0|%0, %k2, %3}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse2_pinsrw"
   [(set (match_operand:V8HI 0 "register_operand" "=x")
        (vec_merge:V8HI
          (vec_duplicate:V8HI
   return "pinsrw\t{%3, %k2, %0|%0, %k2, %3}";
 }
   [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
+   (set_attr "mode" "TI")])
+
+;; It must come before sse2_loadld since it is preferred.
+(define_insn "*sse4_1_pinsrd"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+       (vec_merge:V4SI
+         (vec_duplicate:V4SI
+           (match_operand:SI 2 "nonimmediate_operand" "rm"))
+         (match_operand:V4SI 1 "register_operand" "0")
+         (match_operand:SI 3 "const_pow2_1_to_8_operand" "n")))]
+  "TARGET_SSE4_1"
+{
+  operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3])));
+  return "pinsrd\t{%3, %2, %0|%0, %2, %3}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "TI")])
 
-(define_insn "sse2_pextrw"
+(define_insn "*sse4_1_pinsrq"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+       (vec_merge:V2DI
+         (vec_duplicate:V2DI
+           (match_operand:DI 2 "nonimmediate_operand" "rm"))
+         (match_operand:V2DI 1 "register_operand" "0")
+         (match_operand:SI 3 "const_pow2_1_to_2_operand" "n")))]
+  "TARGET_SSE4_1"
+{
+  operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3])));
+  return "pinsrq\t{%3, %2, %0|%0, %2, %3}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_pextrb"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+       (zero_extend:SI
+         (vec_select:QI
+           (match_operand:V16QI 1 "register_operand" "x")
+           (parallel [(match_operand:SI 2 "const_0_to_15_operand" "n")]))))]
+  "TARGET_SSE4_1"
+  "pextrb\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_pextrb_memory"
+  [(set (match_operand:QI 0 "memory_operand" "=m")
+       (vec_select:QI
+         (match_operand:V16QI 1 "register_operand" "x")
+         (parallel [(match_operand:SI 2 "const_0_to_15_operand" "n")])))]
+  "TARGET_SSE4_1"
+  "pextrb\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse2_pextrw"
   [(set (match_operand:SI 0 "register_operand" "=r")
        (zero_extend:SI
          (vec_select:HI
   "TARGET_SSE2"
   "pextrw\t{%2, %1, %0|%0, %1, %2}"
   [(set_attr "type" "sselog")
+   (set_attr "prefix_data16" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_pextrw_memory"
+  [(set (match_operand:HI 0 "memory_operand" "=m")
+       (vec_select:HI
+         (match_operand:V8HI 1 "register_operand" "x")
+         (parallel [(match_operand:SI 2 "const_0_to_7_operand" "n")])))]
+  "TARGET_SSE4_1"
+  "pextrw\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_pextrd"
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=rm")
+       (vec_select:SI
+         (match_operand:V4SI 1 "register_operand" "x")
+         (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n")])))]
+  "TARGET_SSE4_1"
+  "pextrd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+;; It must come before *vec_extractv2di_1_sse since it is preferred.
+(define_insn "*sse4_1_pextrq"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=rm")
+       (vec_select:DI
+         (match_operand:V2DI 1 "register_operand" "x")
+         (parallel [(match_operand:SI 2 "const_0_to_1_operand" "n")])))]
+  "TARGET_SSE4_1 && TARGET_64BIT"
+  "pextrq\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "TI")])
 
 (define_expand "sse2_pshufd"
   return "pshufd\t{%2, %1, %0|%0, %1, %2}";
 }
   [(set_attr "type" "sselog1")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_expand "sse2_pshuflw"
   return "pshuflw\t{%2, %1, %0|%0, %1, %2}";
 }
   [(set_attr "type" "sselog")
+   (set_attr "prefix_rep" "1")
    (set_attr "mode" "TI")])
 
 (define_expand "sse2_pshufhw"
   return "pshufhw\t{%2, %1, %0|%0, %1, %2}";
 }
   [(set_attr "type" "sselog")
+   (set_attr "prefix_rep" "1")
    (set_attr "mode" "TI")])
 
 (define_expand "sse2_loadd"
   "operands[2] = CONST0_RTX (V4SImode);")
 
 (define_insn "sse2_loadld"
-  [(set (match_operand:V4SI 0 "register_operand"       "=Y,x,x")
+  [(set (match_operand:V4SI 0 "register_operand"       "=Y2,Yi,x,x")
        (vec_merge:V4SI
          (vec_duplicate:V4SI
-           (match_operand:SI 2 "nonimmediate_operand" "mr,m,x"))
-         (match_operand:V4SI 1 "reg_or_0_operand"     " C,C,0")
+           (match_operand:SI 2 "nonimmediate_operand" "m  ,r ,m,x"))
+         (match_operand:V4SI 1 "reg_or_0_operand"     "C  ,C ,C,0")
          (const_int 1)))]
   "TARGET_SSE"
   "@
    movd\t{%2, %0|%0, %2}
+   movd\t{%2, %0|%0, %2}
    movss\t{%2, %0|%0, %2}
    movss\t{%2, %0|%0, %2}"
   [(set_attr "type" "ssemov")
-   (set_attr "mode" "TI,V4SF,SF")])
+   (set_attr "mode" "TI,TI,V4SF,SF")])
 
-;; ??? The hardware supports more, but TARGET_INTER_UNIT_MOVES must
-;; be taken into account, and movdi isn't fully populated even without.
 (define_insn_and_split "sse2_stored"
-  [(set (match_operand:SI 0 "nonimmediate_operand" "=mx")
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=mx,r")
        (vec_select:SI
-         (match_operand:V4SI 1 "register_operand" "x")
+         (match_operand:V4SI 1 "register_operand" "x,Yi")
          (parallel [(const_int 0)])))]
   "TARGET_SSE"
   "#"
-  "&& reload_completed"
+  "&& reload_completed
+   && (TARGET_INTER_UNIT_MOVES 
+       || MEM_P (operands [0])
+       || !GENERAL_REGNO_P (true_regnum (operands [0])))"
   [(set (match_dup 0) (match_dup 1))]
 {
   operands[1] = gen_rtx_REG (SImode, REGNO (operands[1]));
   "TARGET_SSE"
   "")
 
-;; ??? The hardware supports more, but TARGET_INTER_UNIT_MOVES must
-;; be taken into account, and movdi isn't fully populated even without.
+(define_insn "*sse2_storeq_rex64"
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=mx,r")
+       (vec_select:DI
+         (match_operand:V2DI 1 "register_operand" "x,Yi")
+         (parallel [(const_int 0)])))]
+  "TARGET_64BIT && TARGET_SSE"
+  "#")
+
 (define_insn "*sse2_storeq"
   [(set (match_operand:DI 0 "nonimmediate_operand" "=mx")
        (vec_select:DI
        (vec_select:DI
          (match_operand:V2DI 1 "register_operand" "")
          (parallel [(const_int 0)])))]
-  "TARGET_SSE && reload_completed"
+  "TARGET_SSE
+   && reload_completed
+   && (TARGET_INTER_UNIT_MOVES 
+       || MEM_P (operands [0])
+       || !GENERAL_REGNO_P (true_regnum (operands [0])))"
   [(set (match_dup 0) (match_dup 1))]
 {
   operands[1] = gen_rtx_REG (DImode, REGNO (operands[1]));
   "TARGET_SSE2 && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
   "@
    movhps\t{%1, %0|%0, %1}
-   psrldq\t{$4, %0|%0, 4}
+   psrldq\t{$8, %0|%0, 8}
    movq\t{%H1, %0|%0, %H1}"
   [(set_attr "type" "ssemov,sseishft,ssemov")
    (set_attr "mode" "V2SF,TI,TI")])
    (set_attr "mode" "V2SF,V4SF,V2SF")])
 
 (define_insn "*vec_dupv4si"
-  [(set (match_operand:V4SI 0 "register_operand" "=Y,x")
+  [(set (match_operand:V4SI 0 "register_operand" "=Y2,x")
        (vec_duplicate:V4SI
-         (match_operand:SI 1 "register_operand" " Y,0")))]
+         (match_operand:SI 1 "register_operand" " Y2,0")))]
   "TARGET_SSE"
   "@
    pshufd\t{$0, %1, %0|%0, %1, 0}
    (set_attr "mode" "TI,V4SF")])
 
 (define_insn "*vec_dupv2di"
-  [(set (match_operand:V2DI 0 "register_operand" "=Y,x")
+  [(set (match_operand:V2DI 0 "register_operand" "=Y2,x")
        (vec_duplicate:V2DI
-         (match_operand:DI 1 "register_operand" " 0,0")))]
+         (match_operand:DI 1 "register_operand" " 0 ,0")))]
   "TARGET_SSE"
   "@
    punpcklqdq\t%0, %0
 ;; nonimmediate_operand for operand 2 and *not* allowing memory for the SSE
 ;; alternatives pretty much forces the MMX alternative to be chosen.
 (define_insn "*sse2_concatv2si"
-  [(set (match_operand:V2SI 0 "register_operand"     "=Y, Y,*y,*y")
+  [(set (match_operand:V2SI 0 "register_operand"     "=Y2, Y2,*y,*y")
        (vec_concat:V2SI
-         (match_operand:SI 1 "nonimmediate_operand" " 0,rm, 0,rm")
-         (match_operand:SI 2 "reg_or_0_operand"     " Y, C,*y, C")))]
+         (match_operand:SI 1 "nonimmediate_operand" " 0 ,rm , 0,rm")
+         (match_operand:SI 2 "reg_or_0_operand"     " Y2,C  ,*y, C")))]
   "TARGET_SSE2"
   "@
    punpckldq\t{%2, %0|%0, %2}
    (set_attr "mode" "V4SF,V4SF,DI,DI")])
 
 (define_insn "*vec_concatv4si_1"
-  [(set (match_operand:V4SI 0 "register_operand"       "=Y,x,x")
+  [(set (match_operand:V4SI 0 "register_operand"       "=Y2,x,x")
        (vec_concat:V4SI
-         (match_operand:V2SI 1 "register_operand"     " 0,0,0")
-         (match_operand:V2SI 2 "nonimmediate_operand" " Y,x,m")))]
+         (match_operand:V2SI 1 "register_operand"     " 0 ,0,0")
+         (match_operand:V2SI 2 "nonimmediate_operand" " Y2,x,m")))]
   "TARGET_SSE"
   "@
    punpcklqdq\t{%2, %0|%0, %2}
   [(set_attr "type" "sselog,ssemov,ssemov")
    (set_attr "mode" "TI,V4SF,V2SF")])
 
-(define_insn "*vec_concatv2di"
-  [(set (match_operand:V2DI 0 "register_operand"     "=Y,?Y,Y,x,x,x")
+(define_insn "vec_concatv2di"
+  [(set (match_operand:V2DI 0 "register_operand"     "=Y2,?Y2,Y2,x,x,x")
        (vec_concat:V2DI
-         (match_operand:DI 1 "nonimmediate_operand" " m,*y,0,0,0,m")
-         (match_operand:DI 2 "vector_move_operand"  " C, C,Y,x,m,0")))]
+         (match_operand:DI 1 "nonimmediate_operand" "  m,*y ,0 ,0,0,m")
+         (match_operand:DI 2 "vector_move_operand"  "  C,  C,Y2,x,m,0")))]
   "TARGET_SSE"
   "@
    movq\t{%1, %0|%0, %1}
   "TARGET_SSE2 && ix86_binary_operator_ok (PLUS, V16QImode, operands)"
   "pavgb\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse2_uavgv8hi3"
   "TARGET_SSE2 && ix86_binary_operator_ok (PLUS, V8HImode, operands)"
   "pavgw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 ;; The correct representation for this is absolutely enormous, and 
   "TARGET_SSE2"
   "psadbw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse_movmskps"
   "TARGET_SSE2"
   "pmovmskb\t{%1, %0|%0, %1}"
   [(set_attr "type" "ssecvt")
-   (set_attr "mode" "V2DF")])
+   (set_attr "prefix_data16" "1")
+   (set_attr "mode" "SI")])
 
 (define_expand "sse2_maskmovdqu"
   [(set (match_operand:V16QI 0 "memory_operand" "")
   ;; @@@ check ordering of operands in intel/nonintel syntax
   "maskmovdqu\t{%2, %1|%1, %2}"
   [(set_attr "type" "ssecvt")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "*sse2_maskmovdqu_rex64"
   ;; @@@ check ordering of operands in intel/nonintel syntax
   "maskmovdqu\t{%2, %1|%1, %2}"
   [(set_attr "type" "ssecvt")
+   (set_attr "prefix_data16" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "sse_ldmxcsr"
   "TARGET_SSSE3"
   "phaddw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "ssse3_phaddwv4hi3"
   "TARGET_SSSE3"
   "phaddw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "DI")])
 
 (define_insn "ssse3_phadddv4si3"
   "TARGET_SSSE3"
   "phaddd\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "ssse3_phadddv2si3"
   "TARGET_SSSE3"
   "phaddd\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "DI")])
 
 (define_insn "ssse3_phaddswv8hi3"
   "TARGET_SSSE3"
   "phaddsw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "ssse3_phaddswv4hi3"
   "TARGET_SSSE3"
   "phaddsw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "DI")])
 
 (define_insn "ssse3_phsubwv8hi3"
   "TARGET_SSSE3"
   "phsubw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "ssse3_phsubwv4hi3"
   "TARGET_SSSE3"
   "phsubw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "DI")])
 
 (define_insn "ssse3_phsubdv4si3"
   "TARGET_SSSE3"
   "phsubd\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "ssse3_phsubdv2si3"
   "TARGET_SSSE3"
   "phsubd\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "DI")])
 
 (define_insn "ssse3_phsubswv8hi3"
   "TARGET_SSSE3"
   "phsubsw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "ssse3_phsubswv4hi3"
   "TARGET_SSSE3"
   "phsubsw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "DI")])
 
 (define_insn "ssse3_pmaddubswv8hi3"
   "TARGET_SSSE3"
   "pmaddubsw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "ssse3_pmaddubswv4hi3"
   "TARGET_SSSE3"
   "pmaddubsw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseiadd")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "DI")])
 
 (define_insn "ssse3_pmulhrswv8hi3"
   "TARGET_SSSE3 && ix86_binary_operator_ok (MULT, V8HImode, operands)"
   "pmulhrsw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseimul")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "ssse3_pmulhrswv4hi3"
   "TARGET_SSSE3 && ix86_binary_operator_ok (MULT, V4HImode, operands)"
   "pmulhrsw\t{%2, %0|%0, %2}"
   [(set_attr "type" "sseimul")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "DI")])
 
 (define_insn "ssse3_pshufbv16qi3"
   "TARGET_SSSE3"
   "pshufb\t{%2, %0|%0, %2}";
   [(set_attr "type" "sselog1")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "ssse3_pshufbv8qi3"
   "TARGET_SSSE3"
   "pshufb\t{%2, %0|%0, %2}";
   [(set_attr "type" "sselog1")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "DI")])
 
 (define_insn "ssse3_psign<mode>3"
   "TARGET_SSSE3"
   "psign<ssevecsize>\t{%2, %0|%0, %2}";
   [(set_attr "type" "sselog1")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "ssse3_psign<mode>3"
   "TARGET_SSSE3"
   "psign<mmxvecsize>\t{%2, %0|%0, %2}";
   [(set_attr "type" "sselog1")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "DI")])
 
 (define_insn "ssse3_palignrti"
   return "palignr\t{%3, %2, %0|%0, %2, %3}";
 }
   [(set_attr "type" "sseishft")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "ssse3_palignrdi"
   return "palignr\t{%3, %2, %0|%0, %2, %3}";
 }
   [(set_attr "type" "sseishft")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "DI")])
 
 (define_insn "abs<mode>2"
   "TARGET_SSSE3"
   "pabs<ssevecsize>\t{%1, %0|%0, %1}";
   [(set_attr "type" "sselog1")
+   (set_attr "prefix_data16" "1")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "TI")])
 
 (define_insn "abs<mode>2"
   "TARGET_SSSE3"
   "pabs<mmxvecsize>\t{%1, %0|%0, %1}";
   [(set_attr "type" "sselog1")
+   (set_attr "prefix_extra" "1")
    (set_attr "mode" "DI")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; AMD SSE4A instructions
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "sse4a_vmmovntv2df"
+  [(set (match_operand:DF 0 "memory_operand" "=m")
+        (unspec:DF [(vec_select:DF 
+                      (match_operand:V2DF 1 "register_operand" "x")
+                      (parallel [(const_int 0)]))]
+                   UNSPEC_MOVNT))]
+  "TARGET_SSE4A"
+  "movntsd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "mode" "DF")])
+
+(define_insn "sse4a_movntdf"
+  [(set (match_operand:DF 0 "memory_operand" "=m")
+        (unspec:DF [(match_operand:DF 1 "register_operand" "x")]
+                   UNSPEC_MOVNT))]
+  "TARGET_SSE4A"
+  "movntsd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "mode" "DF")])
+
+(define_insn "sse4a_vmmovntv4sf"
+  [(set (match_operand:SF 0 "memory_operand" "=m")
+       (unspec:SF [(vec_select:SF 
+                     (match_operand:V4SF 1 "register_operand" "x")
+                     (parallel [(const_int 0)]))]
+                  UNSPEC_MOVNT))]
+  "TARGET_SSE4A"
+  "movntss\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "mode" "SF")])
+
+(define_insn "sse4a_movntsf"
+  [(set (match_operand:SF 0 "memory_operand" "=m")
+       (unspec:SF [(match_operand:SF 1 "register_operand" "x")]
+                  UNSPEC_MOVNT))]
+  "TARGET_SSE4A"
+  "movntss\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "mode" "SF")])
+
+(define_insn "sse4a_extrqi"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+        (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
+                      (match_operand 2 "const_int_operand" "")
+                      (match_operand 3 "const_int_operand" "")]
+                     UNSPEC_EXTRQI))]
+  "TARGET_SSE4A"
+  "extrq\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix_data16" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4a_extrq"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+        (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
+                      (match_operand:V16QI 2 "register_operand" "x")]
+                     UNSPEC_EXTRQ))]
+  "TARGET_SSE4A"
+  "extrq\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sse")
+   (set_attr "prefix_data16" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4a_insertqi"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+        (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
+                     (match_operand:V2DI 2 "register_operand" "x")
+                      (match_operand 3 "const_int_operand" "")
+                      (match_operand 4 "const_int_operand" "")]
+                     UNSPEC_INSERTQI))]
+  "TARGET_SSE4A"
+  "insertq\t{%4, %3, %2, %0|%0, %2, %3, %4}"
+  [(set_attr "type" "sseins")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4a_insertq"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+        (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0")
+                     (match_operand:V2DI 2 "register_operand" "x")]
+                    UNSPEC_INSERTQ))]
+  "TARGET_SSE4A"
+  "insertq\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sseins")
+   (set_attr "prefix_rep" "1")
+   (set_attr "mode" "TI")])
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Intel SSE4.1 instructions
+;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(define_insn "sse4_1_blendpd"
+  [(set (match_operand:V2DF 0 "register_operand" "=x")
+       (vec_merge:V2DF
+         (match_operand:V2DF 2 "nonimmediate_operand" "xm")
+         (match_operand:V2DF 1 "register_operand" "0")
+         (match_operand:SI 3 "const_0_to_3_operand" "n")))]
+  "TARGET_SSE4_1"
+  "blendpd\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2DF")])
+
+(define_insn "sse4_1_blendps"
+  [(set (match_operand:V4SF 0 "register_operand" "=x")
+       (vec_merge:V4SF
+         (match_operand:V4SF 2 "nonimmediate_operand" "xm")
+         (match_operand:V4SF 1 "register_operand" "0")
+         (match_operand:SI 3 "const_0_to_15_operand" "n")))]
+  "TARGET_SSE4_1"
+  "blendps\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V4SF")])
+
+(define_insn "sse4_1_blendvpd"
+  [(set (match_operand:V2DF 0 "register_operand" "=x")
+       (unspec:V2DF [(match_operand:V2DF 1 "register_operand"  "0")
+                     (match_operand:V2DF 2 "nonimmediate_operand" "xm")
+                     (reg:V2DF 21)]
+                    UNSPEC_BLENDV))]
+  "TARGET_SSE4_1"
+  "blendvpd\t{%%xmm0, %2, %0|%0, %2, %%xmm0}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2DF")])
+
+(define_insn "sse4_1_blendvps"
+  [(set (match_operand:V4SF 0 "register_operand" "=x")
+       (unspec:V4SF [(match_operand:V4SF 1 "register_operand" "0")
+                     (match_operand:V4SF 2 "nonimmediate_operand" "xm")
+                     (reg:V4SF 21)]
+                    UNSPEC_BLENDV))]
+  "TARGET_SSE4_1"
+  "blendvps\t{%%xmm0, %2, %0|%0, %2, %%xmm0}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V4SF")])
+
+(define_insn "sse4_1_dppd"
+  [(set (match_operand:V2DF 0 "register_operand" "=x")
+       (unspec:V2DF [(match_operand:V2DF 1 "nonimmediate_operand" "%0")
+                     (match_operand:V2DF 2 "nonimmediate_operand" "xm")
+                     (match_operand:SI 3 "const_0_to_255_operand" "n")]
+                     UNSPEC_DP))]
+  "TARGET_SSE4_1"
+  "dppd\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "ssemul")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2DF")])
+
+(define_insn "sse4_1_dpps"
+  [(set (match_operand:V4SF 0 "register_operand" "=x")
+       (unspec:V4SF [(match_operand:V4SF 1 "nonimmediate_operand" "%0")
+                     (match_operand:V4SF 2 "nonimmediate_operand" "xm")
+                     (match_operand:SI 3 "const_0_to_255_operand" "n")]
+                    UNSPEC_DP))]
+  "TARGET_SSE4_1"
+  "dpps\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "ssemul")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V4SF")])
+
+(define_insn "sse4_1_movntdqa"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+       (unspec:V2DI [(match_operand:V2DI 1 "memory_operand" "m")]
+                    UNSPEC_MOVNTDQA))]
+  "TARGET_SSE4_1"
+  "movntdqa\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_mpsadbw"
+  [(set (match_operand:V16QI 0 "register_operand" "=x")
+       (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0")
+                      (match_operand:V16QI 2 "nonimmediate_operand" "xm")
+                      (match_operand:SI 3 "const_0_to_255_operand" "n")]
+                     UNSPEC_MPSADBW))]
+  "TARGET_SSE4_1"
+  "mpsadbw\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_packusdw"
+  [(set (match_operand:V8HI 0 "register_operand" "=x")
+       (vec_concat:V8HI
+         (us_truncate:V4HI
+           (match_operand:V4SI 1 "register_operand" "0"))
+         (us_truncate:V4HI
+           (match_operand:V4SI 2 "nonimmediate_operand" "xm"))))]
+  "TARGET_SSE4_1"
+  "packusdw\t{%2, %0|%0, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_pblendvb"
+  [(set (match_operand:V16QI 0 "register_operand" "=x")
+       (unspec:V16QI [(match_operand:V16QI 1 "register_operand"  "0")
+                      (match_operand:V16QI 2 "nonimmediate_operand" "xm")
+                      (reg:V16QI 21)]
+                     UNSPEC_BLENDV))]
+  "TARGET_SSE4_1"
+  "pblendvb\t{%%xmm0, %2, %0|%0, %2, %%xmm0}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_pblendw"
+  [(set (match_operand:V8HI 0 "register_operand" "=x")
+       (vec_merge:V8HI
+         (match_operand:V8HI 2 "nonimmediate_operand" "xm")
+         (match_operand:V8HI 1 "register_operand" "0")
+         (match_operand:SI 3 "const_0_to_255_operand" "n")))]
+  "TARGET_SSE4_1"
+  "pblendw\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_phminposuw"
+  [(set (match_operand:V8HI 0 "register_operand" "=x")
+       (unspec:V8HI [(match_operand:V8HI 1 "nonimmediate_operand" "xm")]
+                    UNSPEC_PHMINPOSUW))]
+  "TARGET_SSE4_1"
+  "phminposuw\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_extendv8qiv8hi2"
+  [(set (match_operand:V8HI 0 "register_operand" "=x")
+       (sign_extend:V8HI
+         (vec_select:V8QI
+           (match_operand:V16QI 1 "register_operand" "x")
+           (parallel [(const_int 0)
+                      (const_int 1)
+                      (const_int 2)
+                      (const_int 3)
+                      (const_int 4)
+                      (const_int 5)
+                      (const_int 6)
+                      (const_int 7)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxbw\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_extendv8qiv8hi2"
+  [(set (match_operand:V8HI 0 "register_operand" "=x")
+       (sign_extend:V8HI
+         (vec_select:V8QI
+           (vec_duplicate:V16QI
+             (match_operand:V8QI 1 "nonimmediate_operand" "xm"))
+           (parallel [(const_int 0)
+                      (const_int 1)
+                      (const_int 2)
+                      (const_int 3)
+                      (const_int 4)
+                      (const_int 5)
+                      (const_int 6)
+                      (const_int 7)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxbw\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_extendv4qiv4si2"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+       (sign_extend:V4SI
+         (vec_select:V4QI
+           (match_operand:V16QI 1 "register_operand" "x")
+           (parallel [(const_int 0)
+                      (const_int 1)
+                      (const_int 2)
+                      (const_int 3)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxbd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_extendv4qiv4si2"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+       (sign_extend:V4SI
+         (vec_select:V4QI
+           (vec_duplicate:V16QI
+             (match_operand:V4QI 1 "nonimmediate_operand" "xm"))
+           (parallel [(const_int 0)
+                      (const_int 1)
+                      (const_int 2)
+                      (const_int 3)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxbd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_extendv2qiv2di2"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+       (sign_extend:V2DI
+         (vec_select:V2QI
+           (match_operand:V16QI 1 "register_operand" "x")
+           (parallel [(const_int 0)
+                      (const_int 1)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxbq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_extendv2qiv2di2"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+       (sign_extend:V2DI
+         (vec_select:V2QI
+           (vec_duplicate:V16QI
+             (match_operand:V2QI 1 "nonimmediate_operand" "xm"))
+           (parallel [(const_int 0)
+                      (const_int 1)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxbq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_extendv4hiv4si2"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+       (sign_extend:V4SI
+         (vec_select:V4HI
+           (match_operand:V8HI 1 "register_operand" "x")
+           (parallel [(const_int 0)
+                      (const_int 1)
+                      (const_int 2)
+                      (const_int 3)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxwd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_extendv4hiv4si2"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+       (sign_extend:V4SI
+         (vec_select:V4HI
+           (vec_duplicate:V8HI
+             (match_operand:V2HI 1 "nonimmediate_operand" "xm"))
+           (parallel [(const_int 0)
+                      (const_int 1)
+                      (const_int 2)
+                      (const_int 3)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxwd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_extendv2hiv2di2"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+       (sign_extend:V2DI
+         (vec_select:V2HI
+           (match_operand:V8HI 1 "register_operand" "x")
+           (parallel [(const_int 0)
+                      (const_int 1)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxwq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_extendv2hiv2di2"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+       (sign_extend:V2DI
+         (vec_select:V2HI
+           (vec_duplicate:V8HI
+             (match_operand:V8HI 1 "nonimmediate_operand" "xm"))
+           (parallel [(const_int 0)
+                      (const_int 1)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxwq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_extendv2siv2di2"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+       (sign_extend:V2DI
+         (vec_select:V2SI
+           (match_operand:V4SI 1 "register_operand" "x")
+           (parallel [(const_int 0)
+                      (const_int 1)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxdq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_extendv2siv2di2"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+       (sign_extend:V2DI
+         (vec_select:V2SI
+           (vec_duplicate:V4SI
+             (match_operand:V2SI 1 "nonimmediate_operand" "xm"))
+           (parallel [(const_int 0)
+                      (const_int 1)]))))]
+  "TARGET_SSE4_1"
+  "pmovsxdq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_zero_extendv8qiv8hi2"
+  [(set (match_operand:V8HI 0 "register_operand" "=x")
+       (zero_extend:V8HI
+         (vec_select:V8QI
+           (match_operand:V16QI 1 "register_operand" "x")
+           (parallel [(const_int 0)
+                      (const_int 1)
+                      (const_int 2)
+                      (const_int 3)
+                      (const_int 4)
+                      (const_int 5)
+                      (const_int 6)
+                      (const_int 7)]))))]
+  "TARGET_SSE4_1"
+  "pmovzxbw\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_zero_extendv8qiv8hi2"
+  [(set (match_operand:V8HI 0 "register_operand" "=x")
+       (zero_extend:V8HI
+         (vec_select:V8QI
+           (vec_duplicate:V16QI
+             (match_operand:V8QI 1 "nonimmediate_operand" "xm"))
+           (parallel [(const_int 0)
+                      (const_int 1)
+                      (const_int 2)
+                      (const_int 3)
+                      (const_int 4)
+                      (const_int 5)
+                      (const_int 6)
+                      (const_int 7)]))))]
+  "TARGET_SSE4_1"
+  "pmovzxbw\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_zero_extendv4qiv4si2"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+       (zero_extend:V4SI
+         (vec_select:V4QI
+           (match_operand:V16QI 1 "register_operand" "x")
+           (parallel [(const_int 0)
+                      (const_int 1)
+                      (const_int 2)
+                      (const_int 3)]))))]
+  "TARGET_SSE4_1"
+  "pmovzxbd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_zero_extendv4qiv4si2"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+       (zero_extend:V4SI
+         (vec_select:V4QI
+           (vec_duplicate:V16QI
+             (match_operand:V4QI 1 "nonimmediate_operand" "xm"))
+           (parallel [(const_int 0)
+                      (const_int 1)
+                      (const_int 2)
+                      (const_int 3)]))))]
+  "TARGET_SSE4_1"
+  "pmovzxbd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_zero_extendv2qiv2di2"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+       (zero_extend:V2DI
+         (vec_select:V2QI
+           (match_operand:V16QI 1 "register_operand" "x")
+           (parallel [(const_int 0)
+                      (const_int 1)]))))]
+  "TARGET_SSE4_1"
+  "pmovzxbq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_zero_extendv2qiv2di2"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+       (zero_extend:V2DI
+         (vec_select:V2QI
+           (vec_duplicate:V16QI
+             (match_operand:V2QI 1 "nonimmediate_operand" "xm"))
+           (parallel [(const_int 0)
+                      (const_int 1)]))))]
+  "TARGET_SSE4_1"
+  "pmovzxbq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_zero_extendv4hiv4si2"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+       (zero_extend:V4SI
+         (vec_select:V4HI
+           (match_operand:V8HI 1 "register_operand" "x")
+           (parallel [(const_int 0)
+                      (const_int 1)
+                      (const_int 2)
+                      (const_int 3)]))))]
+  "TARGET_SSE4_1"
+  "pmovzxwd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_zero_extendv4hiv4si2"
+  [(set (match_operand:V4SI 0 "register_operand" "=x")
+       (zero_extend:V4SI
+         (vec_select:V4HI
+           (vec_duplicate:V8HI
+             (match_operand:V4HI 1 "nonimmediate_operand" "xm"))
+           (parallel [(const_int 0)
+                      (const_int 1)
+                      (const_int 2)
+                      (const_int 3)]))))]
+  "TARGET_SSE4_1"
+  "pmovzxwd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_zero_extendv2hiv2di2"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+       (zero_extend:V2DI
+         (vec_select:V2HI
+           (match_operand:V8HI 1 "register_operand" "x")
+           (parallel [(const_int 0)
+                      (const_int 1)]))))]
+  "TARGET_SSE4_1"
+  "pmovzxwq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_zero_extendv2hiv2di2"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+       (zero_extend:V2DI
+         (vec_select:V2HI
+           (vec_duplicate:V8HI
+             (match_operand:V2HI 1 "nonimmediate_operand" "xm"))
+           (parallel [(const_int 0)
+                      (const_int 1)]))))]
+  "TARGET_SSE4_1"
+  "pmovzxwq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_zero_extendv2siv2di2"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+       (zero_extend:V2DI
+         (vec_select:V2SI
+           (match_operand:V4SI 1 "register_operand" "x")
+           (parallel [(const_int 0)
+                      (const_int 1)]))))]
+  "TARGET_SSE4_1"
+  "pmovzxdq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "*sse4_1_zero_extendv2siv2di2"
+  [(set (match_operand:V2DI 0 "register_operand" "=x")
+       (zero_extend:V2DI
+         (vec_select:V2SI
+           (vec_duplicate:V4SI
+             (match_operand:V2SI 1 "nonimmediate_operand" "xm"))
+           (parallel [(const_int 0)
+                      (const_int 1)]))))]
+  "TARGET_SSE4_1"
+  "pmovzxdq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+;; ptest is very similar to comiss and ucomiss when setting FLAGS_REG.
+;; But it is not a really compare instruction.
+(define_insn "sse4_1_ptest"
+  [(set (reg:CC FLAGS_REG)
+       (unspec:CC [(match_operand:V2DI 0 "register_operand" "x")
+                   (match_operand:V2DI 1 "nonimmediate_operand" "xm")]
+                  UNSPEC_PTEST))]
+  "TARGET_SSE4_1"
+  "ptest\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssecomi")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "TI")])
+
+(define_insn "sse4_1_roundpd"
+  [(set (match_operand:V2DF 0 "register_operand" "=x")
+       (unspec:V2DF [(match_operand:V2DF 1 "nonimmediate_operand" "xm")
+                     (match_operand:SI 2 "const_0_to_15_operand" "n")]
+                    UNSPEC_ROUNDP))]
+  "TARGET_SSE4_1"
+  "roundpd\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2DF")])
+
+(define_insn "sse4_1_roundps"
+  [(set (match_operand:V4SF 0 "register_operand" "=x")
+       (unspec:V4SF [(match_operand:V4SF 1 "nonimmediate_operand" "xm")
+                     (match_operand:SI 2 "const_0_to_15_operand" "n")]
+                    UNSPEC_ROUNDP))]
+  "TARGET_SSE4_1"
+  "roundps\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V4SF")])
+
+(define_insn "sse4_1_roundsd"
+  [(set (match_operand:V2DF 0 "register_operand" "=x")
+       (vec_merge:V2DF
+         (unspec:V2DF [(match_operand:V2DF 2 "register_operand" "x")
+                       (match_operand:SI 3 "const_0_to_15_operand" "n")]
+                      UNSPEC_ROUNDS)
+         (match_operand:V2DF 1 "register_operand" "0")
+         (const_int 1)))]
+  "TARGET_SSE4_1"
+  "roundsd\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V2DF")])
+
+(define_insn "sse4_1_roundss"
+  [(set (match_operand:V4SF 0 "register_operand" "=x")
+       (vec_merge:V4SF
+         (unspec:V4SF [(match_operand:V4SF 2 "register_operand" "x")
+                       (match_operand:SI 3 "const_0_to_15_operand" "n")]
+                      UNSPEC_ROUNDS)
+         (match_operand:V4SF 1 "register_operand" "0")
+         (const_int 1)))]
+  "TARGET_SSE4_1"
+  "roundss\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "ssecvt")
+   (set_attr "prefix_extra" "1")
+   (set_attr "mode" "V4SF")])