OSDN Git Service

* i386.h (ix86_tune_indices): Add X86_USE_VECTOR_CONVERTS.
authorhubicka <hubicka@138bc75d-0d04-0410-961f-82ee72b054a4>
Sun, 9 Sep 2007 17:39:28 +0000 (17:39 +0000)
committerhubicka <hubicka@138bc75d-0d04-0410-961f-82ee72b054a4>
Sun, 9 Sep 2007 17:39:28 +0000 (17:39 +0000)
(TARGET_USE_VECTOR_CONVERTS): New.
* i386.md: New post-reload splitters for converting SF to DF and DF to
SF.
(floatsi* expander): Special case vector conversions.
(floatsisf2_mixed_vector, floatsisf2_sse_vector_nointernunit,
floatsisf2_sse_vector_internunit, floatsisf2_sse_vector,
floatsidf2_mixed_vector, floatsidf2_sse_vector): New.
(floatsisf2_mixed, floatsisf2_sse, floatsidf2_mixed, floatsidf2_sse):
Disable when doing vector converts.
(floatsi<mode>2_i387): Disable when doing SSE math.
* sse.md (vec_dupv2df): Export.
* i386.c (ix86_tune_features): Enable SSE conversions.

git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@128301 138bc75d-0d04-0410-961f-82ee72b054a4

gcc/ChangeLog
gcc/config/i386/i386.c
gcc/config/i386/i386.h
gcc/config/i386/i386.md
gcc/config/i386/sse.md

index ff54c46..d748be2 100644 (file)
@@ -1,3 +1,20 @@
+2007-09-09  Jan Hubicka  <jh@suse.cz>
+            Dwarakanath Rajagopal <dwarak.rajagopal@amd.com>
+
+       * i386.h (ix86_tune_indices): Add X86_USE_VECTOR_CONVERTS.
+       (TARGET_USE_VECTOR_CONVERTS): New.
+       * i386.md: New post-reload splitters for converting SF to DF and DF to
+       SF.
+       (floatsi* expander): Special case vector conversions.
+       (floatsisf2_mixed_vector, floatsisf2_sse_vector_nointernunit,
+       floatsisf2_sse_vector_internunit, floatsisf2_sse_vector,
+       floatsidf2_mixed_vector, floatsidf2_sse_vector): New.
+       (floatsisf2_mixed, floatsisf2_sse, floatsidf2_mixed, floatsidf2_sse):
+       Disable when doing vector converts.
+       (floatsi<mode>2_i387): Disable when doing SSE math.
+       * sse.md (vec_dupv2df): Export.
+       * i386.c (ix86_tune_features): Enable SSE conversions.
+
 2007-09-09  Richard Guenther  <rguenther@suse.de>
 
        * tree-ssa-operands.c (add_virtual_operand): Only mark
index f6f80a0..c01198b 100644 (file)
@@ -1258,6 +1258,10 @@ unsigned int ix86_tune_features[X86_TUNE_LAST] = {
      operand that cannot be represented using a modRM byte.  The XOR
      replacement is long decoded, so this split helps here as well.  */
   m_K6,
+
+  /* X86_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion from
+  integer to FP. */
+  m_AMDFAM10,
 };
 
 /* Feature tests against the various architecture variations.  */
index 93e24dd..06e90f4 100644 (file)
@@ -257,6 +257,7 @@ enum ix86_tune_indices {
   X86_TUNE_MOVE_M1_VIA_OR,
   X86_TUNE_NOT_UNPAIRABLE,
   X86_TUNE_NOT_VECTORMODE,
+  X86_USE_VECTOR_CONVERTS,
 
   X86_TUNE_LAST
 };
@@ -337,6 +338,7 @@ extern unsigned int ix86_tune_features[X86_TUNE_LAST];
 #define        TARGET_MOVE_M1_VIA_OR   ix86_tune_features[X86_TUNE_MOVE_M1_VIA_OR]
 #define TARGET_NOT_UNPAIRABLE  ix86_tune_features[X86_TUNE_NOT_UNPAIRABLE]
 #define TARGET_NOT_VECTORMODE  ix86_tune_features[X86_TUNE_NOT_VECTORMODE]
+#define TARGET_USE_VECTOR_CONVERTS ix86_tune_features[X86_USE_VECTOR_CONVERTS]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
index 42b3bab..352f67d 100644 (file)
     }
 })
 
+/* For converting SF(xmm2) to DF(xmm1), use the following code instead of
+   cvtss2sd:
+      unpcklps xmm2,xmm2   ; packed conversion might crash on signaling NaNs
+      cvtps2pd xmm2,xmm1
+   We do the conversion post reload to avoid producing 128bit spills
+   that might lead to an ICE on a 32bit target.  The sequence is
+   unlikely to combine anyway.  */
+(define_split
+  [(set (match_operand:DF 0 "register_operand" "")
+        (float_extend:DF
+         (match_operand:SF 1 "nonimmediate_operand" "")))]
+  "TARGET_USE_VECTOR_CONVERTS && !optimize_size 
+   && reload_completed && SSE_REG_P (operands[0])"
+   [(set (match_dup 2)
+        (float_extend:V2DF
+          (vec_select:V2SF
+            (match_dup 3)
+            (parallel [(const_int 0) (const_int 1)]))))]
+{
+  operands[2] = simplify_gen_subreg (V2DFmode, operands[0], DFmode, 0);
+  operands[3] = simplify_gen_subreg (V4SFmode, operands[0], DFmode, 0);
+  /* Use movss for loading from memory, unpcklps reg, reg for registers.
+     Try to avoid move when unpacking can be done in source.  */
+  if (REG_P (operands[1]))
+    {
+      /* If it is unsafe to overwrite upper half of source, we need
+        to move to destination and unpack there.  */
+      if ((ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
+          || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 4)
+         && true_regnum (operands[0]) != true_regnum (operands[1]))
+       {
+         rtx tmp = gen_rtx_REG (SFmode, true_regnum (operands[0]));
+         emit_move_insn (tmp, operands[1]);
+       }
+      else
+       operands[3] = simplify_gen_subreg (V4SFmode, operands[1], SFmode, 0);
+      emit_insn (gen_sse_unpcklps (operands[3], operands[3], operands[3]));
+    }
+  else
+    emit_insn (gen_vec_setv4sf_0 (operands[3], 
+                                 CONST0_RTX (V4SFmode), operands[1]));
+})
+
 (define_insn "*extendsfdf2_mixed"
   [(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,x")
         (float_extend:DF
     }
 })
 
+/* For converting DF(xmm2) to SF(xmm1), use the following code instead of
+   cvtsd2ss:
+      unpcklpd xmm2,xmm2   ; packed conversion might crash on signaling NaNs
+      cvtpd2ps xmm2,xmm1
+   We do the conversion post reload to avoid producing 128bit spills
+   that might lead to an ICE on a 32bit target.  The sequence is
+   unlikely to combine anyway.  */
+(define_split
+  [(set (match_operand:SF 0 "register_operand" "")
+        (float_truncate:SF
+         (match_operand:DF 1 "nonimmediate_operand" "")))]
+  "TARGET_USE_VECTOR_CONVERTS && !optimize_size 
+   && reload_completed && SSE_REG_P (operands[0])"
+   [(set (match_dup 2)
+        (vec_concat:V4SF
+          (float_truncate:V2SF
+            (match_dup 4))
+          (match_dup 3)))]
+{
+  operands[2] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+  operands[3] = CONST0_RTX (V2SFmode);
+  operands[4] = simplify_gen_subreg (V2DFmode, operands[0], SFmode, 0);
+  /* Use movsd for loading from memory, unpcklpd for registers.
+     Try to avoid move when unpacking can be done in source, or SSE3
+     movddup is available.  */
+  if (REG_P (operands[1]))
+    {
+      if (!TARGET_SSE3
+         && true_regnum (operands[0]) != true_regnum (operands[1])
+         && (ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
+             || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 8))
+       {
+         rtx tmp = simplify_gen_subreg (DFmode, operands[0], SFmode, 0);
+         emit_move_insn (tmp, operands[1]);
+         operands[1] = tmp;
+       }
+      else if (!TARGET_SSE3)
+       operands[4] = simplify_gen_subreg (V2DFmode, operands[1], DFmode, 0);
+      emit_insn (gen_vec_dupv2df (operands[4], operands[1]));
+    }
+  else
+    emit_insn (gen_sse2_loadlpd (operands[4],
+                                CONST0_RTX (V2DFmode), operands[1]));
+})
+
 (define_expand "truncdfsf2_with_temp"
   [(parallel [(set (match_operand:SF 0 "" "")
                   (float_truncate:SF (match_operand:DF 1 "" "")))
   [(set (match_operand:MODEF 0 "register_operand" "")
        (float:MODEF (match_operand:SI 1 "nonimmediate_operand" "")))]
   "TARGET_80387 || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
-  "")
+  "
+   /* When we use vector converts, we can't have input in memory.  */
+   if (GET_MODE (operands[0]) == DFmode && GET_MODE (operands[1]) == SImode
+       && TARGET_USE_VECTOR_CONVERTS && !optimize_size && TARGET_SSE_MATH
+       && SSE_FLOAT_MODE_P (DFmode))
+     operands[1] = force_reg (SImode, operands[1]);
+   
+   if (GET_MODE (operands[0]) == SFmode && GET_MODE (operands[1]) == SImode
+       && !optimize_size && TARGET_USE_VECTOR_CONVERTS && TARGET_SSE_MATH
+       && SSE_FLOAT_MODE_P (SFmode))
+     {
+       /* When !flag_trapping_math, we handle SImode->SFmode vector
+         conversions same way as SImode->DFmode.
+
+         For flag_trapping_math we can't safely use vector conversion without
+         clearing upper half, otherwise precision exception might occur.
+         However we can still generate the common sequence converting value
+         from general register to XMM register as:
+
+           mov         reg32, mem32
+           movd        mem32, xmm
+           cvtdq2pd xmm,xmm
+
+         because we know that movd clears the upper half.
+
+         Sadly in this case we can't rely on reload moving the value to XMM
+         register, since we need to know if upper half is OK, so we need
+         to do reloading by hand.  We force operand to memory unless target
+         supports inter unit moves.  */
+       if (!flag_trapping_math)
+         operands[1] = force_reg (SImode, operands[1]);
+       else if (!MEM_P (operands[1]))
+        {
+          rtx tmp = assign_386_stack_local (SImode, SLOT_VIRTUAL);
+          emit_move_insn (tmp, operands[1]);
+          operands[1] = tmp;
+        }
+     }
+  ")
+
+(define_insn "*floatsisf2_mixed_vector"
+  [(set (match_operand:SF 0 "register_operand" "=x,f,?f")
+       (float:SF (match_operand:SI 1 "nonimmediate_operand" "x,m,r")))]
+  "TARGET_MIX_SSE_I387 && !flag_trapping_math 
+   && TARGET_USE_VECTOR_CONVERTS && !optimize_size"
+  "@
+   cvtpq2ps\t{%1, %0|%0, %1}
+   fild%z1\t%1
+   #"
+  [(set_attr "type" "sseicvt,fmov,multi")
+   (set_attr "mode" "SF")
+   (set_attr "unit" "*,i387,*")
+   (set_attr "athlon_decode" "double,*,*")
+   (set_attr "amdfam10_decode" "double,*,*")
+   (set_attr "fp_int_src" "false,true,true")])
 
 (define_insn "*floatsisf2_mixed"
   [(set (match_operand:SF 0 "register_operand" "=f,?f,x,x")
        (float:SF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m")))]
-  "TARGET_MIX_SSE_I387"
+  "TARGET_MIX_SSE_I387
+   && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
   "@
    fild%z1\t%1
    #
    (set_attr "amdfam10_decode" "*,*,vector,double")
    (set_attr "fp_int_src" "true")])
 
+(define_insn "*floatsisf2_sse_vector_nointernunit"
+  [(set (match_operand:SF 0 "register_operand" "=x")
+       (float:SF (match_operand:SI 1 "memory_operand" "m")))]
+  "flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
+   && !TARGET_INTER_UNIT_MOVES"
+  "#"
+  [(set_attr "type" "multi")])
+
+(define_insn "*floatsisf2_sse_vector_internunit"
+  [(set (match_operand:SF 0 "register_operand" "=x,x")
+       (float:SF (match_operand:SI 1 "nonimmediate_operand" "rm,x")))]
+  "flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
+   && TARGET_INTER_UNIT_MOVES"
+  "#"
+  [(set_attr "type" "multi")])
+
+(define_split 
+  [(set (match_operand:SF 0 "register_operand" "")
+       (float:SF (match_operand:SI 1 "nonimmediate_operand" "")))]
+  "flag_trapping_math
+   && TARGET_USE_VECTOR_CONVERTS && reload_completed
+   && (TARGET_INTER_UNIT_MOVES || MEM_P (operands[1]))
+   && !SSE_REG_P (operands[1]) && SSE_REG_P (operands[0])"
+  [(set (match_dup 0)
+       (float:V4SF (match_dup 2)))]
+{
+  operands[2] = simplify_gen_subreg (V4SImode, operands[0], SFmode, 0);
+  operands[0] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+  emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode), operands[1]));
+})
+
+(define_split 
+  [(set (match_operand:SF 0 "register_operand" "")
+       (float:SF (match_operand:SI 1 "register_operand" "")))]
+  "flag_trapping_math
+   && TARGET_USE_VECTOR_CONVERTS && reload_completed
+   && SSE_REG_P (operands[1]) && SSE_REG_P (operands[0])"
+  [(set (match_dup 2) (vec_duplicate:V4SI (match_dup 1)))
+   (set (match_dup 0)
+       (float:V4SF (match_dup 2)))]
+{
+  operands[2] = simplify_gen_subreg (V4SImode, operands[0], SFmode, 0);
+  operands[0] = simplify_gen_subreg (V4SFmode, operands[0], SFmode, 0);
+})
+
+(define_insn "*floatsisf2_sse_vector"
+  [(set (match_operand:SF 0 "register_operand" "=x")
+       (float:SF (match_operand:SI 1 "register_operand" "x")))]
+  "!flag_trapping_math && TARGET_USE_VECTOR_CONVERTS && !optimize_size
+   && !TARGET_INTER_UNIT_MOVES"
+  "cvtpq2ps\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "mode" "SF")
+   (set_attr "athlon_decode" "double")
+   (set_attr "amdfam10_decode" "double")
+   (set_attr "fp_int_src" "true")])
+
 (define_insn "*floatsisf2_sse"
   [(set (match_operand:SF 0 "register_operand" "=x,x")
        (float:SF (match_operand:SI 1 "nonimmediate_operand" "r,m")))]
-  "TARGET_SSE_MATH"
+  "TARGET_SSE_MATH
+   && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
   "cvtsi2ss\t{%1, %0|%0, %1}"
   [(set_attr "type" "sseicvt")
    (set_attr "mode" "SF")
    (set_attr "amdfam10_decode" "vector,double")
    (set_attr "fp_int_src" "true")])
 
+(define_insn "*floatsidf2_mixed_vector"
+  [(set (match_operand:DF 0 "register_operand" "=x,f,f")
+       (float:DF (match_operand:SI 1 "nonimmediate_operand" "x,m,r")))]
+  "TARGET_SSE2 && TARGET_MIX_SSE_I387
+   && TARGET_USE_VECTOR_CONVERTS && !optimize_size"
+  "@
+   cvtdq2pd\t{%1, %0|%0, %1}
+   fild%z1\t%1
+   #"
+  [(set_attr "type" "sseicvt,fmov,multi")
+   (set_attr "mode" "V2DF,DF,DF")
+   (set_attr "unit" "*,*,i387")
+   (set_attr "athlon_decode" "double,*,*")
+   (set_attr "amdfam10_decode" "double,*,*")
+   (set_attr "fp_int_src" "false,true,true")])
+
 (define_insn "*floatsidf2_mixed"
-  [(set (match_operand:DF 0 "register_operand" "=f,?f,x,x")
-       (float:DF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m")))]
-  "TARGET_SSE2 && TARGET_MIX_SSE_I387"
+  [(set (match_operand:DF 0 "register_operand" "=f,?f,x,x,!x")
+       (float:DF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,m,x")))]
+  "TARGET_SSE2 && TARGET_MIX_SSE_I387
+    && (!TARGET_USE_VECTOR_CONVERTS || !optimize_size)"
   "@
    fild%z1\t%1
    #
    cvtsi2sd\t{%1, %0|%0, %1}
-   cvtsi2sd\t{%1, %0|%0, %1}"
-  [(set_attr "type" "fmov,multi,sseicvt,sseicvt")
-   (set_attr "mode" "DF")
-   (set_attr "unit" "*,i387,*,*")
-   (set_attr "athlon_decode" "*,*,double,direct")
-   (set_attr "amdfam10_decode" "*,*,vector,double")
+   cvtsi2sd\t{%1, %0|%0, %1}
+   cvtdq2pd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "fmov,multi,sseicvt,sseicvt,sseicvt")
+   (set_attr "mode" "DF,DF,DF,DF,V2DF")
+   (set_attr "unit" "*,i387,*,*,*")
+   (set_attr "athlon_decode" "*,*,double,direct,double")
+   (set_attr "amdfam10_decode" "*,*,vector,double,double")
+   (set_attr "fp_int_src" "true,true,true,true,false")])
+
+(define_insn "*floatsidf2_sse_vector"
+  [(set (match_operand:DF 0 "register_operand" "=x")
+       (float:DF (match_operand:SI 1 "register_operand" "x")))]
+  "TARGET_SSE2 && TARGET_SSE_MATH
+   && TARGET_USE_VECTOR_CONVERTS && !optimize_size"
+  "cvtdq2pd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sseicvt")
+   (set_attr "mode" "V2DF")
+   (set_attr "athlon_decode" "double")
+   (set_attr "amdfam10_decode" "double")
    (set_attr "fp_int_src" "true")])
 
+(define_split 
+  [(set (match_operand:DF 0 "register_operand" "")
+       (float:DF (match_operand:SI 1 "memory_operand" "")))]
+  "TARGET_USE_VECTOR_CONVERTS && reload_completed
+   && SSE_REG_P (operands[0])"
+  [(set (match_dup 0)
+       (float:V2DF
+         (vec_select:V2SI
+           (match_dup 2)
+           (parallel [(const_int 0) (const_int 1)]))))]
+{
+  operands[2] = simplify_gen_subreg (V4SImode, operands[0], DFmode, 0);
+  operands[0] = simplify_gen_subreg (V2DFmode, operands[0], DFmode, 0);
+  emit_insn (gen_sse2_loadld (operands[2], CONST0_RTX (V4SImode), operands[1]));
+})
+
 (define_insn "*floatsidf2_sse"
-  [(set (match_operand:DF 0 "register_operand" "=x,x")
-       (float:DF (match_operand:SI 1 "nonimmediate_operand" "r,m")))]
-  "TARGET_SSE2 && TARGET_SSE_MATH"
-  "cvtsi2sd\t{%1, %0|%0, %1}"
+  [(set (match_operand:DF 0 "register_operand" "=x,x,!x")
+       (float:DF (match_operand:SI 1 "nonimmediate_operand" "r,m,x")))]
+  "TARGET_SSE2 && TARGET_SSE_MATH
+   && (!TARGET_USE_VECTOR_CONVERTS || optimize_size)"
+  "@
+   cvtsi2sd\t{%1, %0|%0, %1} 
+   cvtsi2sd\t{%1, %0|%0, %1} 
+   cvtdq2pd\t{%1, %0|%0, %1}"
   [(set_attr "type" "sseicvt")
-   (set_attr "mode" "DF")
-   (set_attr "athlon_decode" "double,direct")
-   (set_attr "amdfam10_decode" "vector,double")
+   (set_attr "mode" "DF,DF,V2DF")
+   (set_attr "athlon_decode" "double,direct,double")
+   (set_attr "amdfam10_decode" "vector,double,double")
    (set_attr "fp_int_src" "true")])
 
 (define_insn "*floatsi<mode>2_i387"
   [(set (match_operand:MODEF 0 "register_operand" "=f,f")
        (float:MODEF
          (match_operand:SI 1 "nonimmediate_operand" "m,?r")))]
-  "TARGET_80387"
+  "TARGET_80387
+   && (!TARGET_SSE_MATH || !SSE_FLOAT_MODE_P (GET_MODE (operands[0])))"
   "@
    fild%z1\t%1
    #"
index cb63ab9..03b2577 100644 (file)
   [(set_attr "type" "sselog1")
    (set_attr "mode" "DF")])
 
-(define_insn "*vec_dupv2df"
+(define_insn "vec_dupv2df"
   [(set (match_operand:V2DF 0 "register_operand" "=x")
        (vec_duplicate:V2DF
          (match_operand:DF 1 "register_operand" "0")))]