(udivsi3_sh2a, divsi3_sh2a): Give these patterns an "in_delay_slot" attribute

[pf3gnuchains/gcc-fork.git] / gcc / config / sh / sh.md
diff --git a/gcc/config/sh/sh.md b/gcc/config/sh/sh.md

index 531db46..b725822 100644 (file)
--- a/gcc/config/sh/sh.md
+++ b/gcc/config/sh/sh.md
@@ -161,7 +161,7 @@
  ;; Target CPU.
  
  (define_attr "cpu"
- "sh1,sh2,sh2e,sh3,sh3e,sh4,sh5"
+ "sh1,sh2,sh2e,sh2a,sh3,sh3e,sh4,sh4a,sh5"
    (const (symbol_ref "sh_cpu_attr")))
  
  (define_attr "endian" "big,little"
@@ -218,6 +218,9 @@
  ;; ftrc_s      fix_truncsfsi2_i4
  ;; dfdiv       double precision floating point divide (or square root)
  ;; cwb         ic_invalidate_line_i
+;; movua       SH4a unaligned load
+;; fsrra       square root reciprocal approximate
+;; fsca                sine and cosine approximate
  ;; tls_load     load TLS related address
  ;; arith_media SHmedia arithmetic, logical, and shift instructions
  ;; cbranch_media SHmedia conditional branch instructions
@@ -249,7 +252,7 @@
  ;; nil         no-op move, will be deleted.
  
  (define_attr "type"
- "mt_group,cbranch,jump,jump_ind,arith,arith3,arith3b,dyn_shift,load,load_si,fload,store,move,fmove,smpy,dmpy,return,pload,prset,pstore,prget,pcload,pcload_si,pcfload,rte,sfunc,call,fp,fdiv,ftrc_s,dfp_arith,dfp_cmp,dfp_conv,dfdiv,gp_fpul,fpul_gp,mac_gp,mem_fpscr,gp_fpscr,cwb,tls_load,arith_media,cbranch_media,cmp_media,dfdiv_media,dfmul_media,dfparith_media,dfpconv_media,dmpy_media,fcmp_media,fdiv_media,fload_media,fmove_media,fparith_media,fpconv_media,fstore_media,gettr_media,invalidate_line_media,jump_media,load_media,pt_media,ptabs_media,store_media,mcmp_media,mac_media,d2mpy_media,atrans_media,ustore_media,nil,other"
+ "mt_group,cbranch,jump,jump_ind,arith,arith3,arith3b,dyn_shift,load,load_si,fload,store,move,fmove,smpy,dmpy,return,pload,prset,pstore,prget,pcload,pcload_si,pcfload,rte,sfunc,call,fp,fdiv,ftrc_s,dfp_arith,dfp_cmp,dfp_conv,dfdiv,gp_fpul,fpul_gp,mac_gp,mem_fpscr,gp_fpscr,cwb,movua,fsrra,fsca,tls_load,arith_media,cbranch_media,cmp_media,dfdiv_media,dfmul_media,dfparith_media,dfpconv_media,dmpy_media,fcmp_media,fdiv_media,fload_media,fmove_media,fparith_media,fpconv_media,fstore_media,gettr_media,invalidate_line_media,jump_media,load_media,pt_media,ptabs_media,store_media,mcmp_media,mac_media,d2mpy_media,atrans_media,ustore_media,nil,other"
    (const_string "other"))
  
  ;; We define a new attribute namely "insn_class".We use
@@ -401,12 +404,15 @@
          (cond [(eq_attr "med_branch_p" "yes")
                 (const_int 2)
                 (and (eq (symbol_ref "GET_CODE (prev_nonnote_insn (insn))")
-                        (symbol_ref "INSN"))
-                    (eq (symbol_ref "INSN_CODE (prev_nonnote_insn (insn))")
-                        (symbol_ref "code_for_indirect_jump_scratch")))
-               (if_then_else (eq_attr "braf_branch_p" "yes")
-                             (const_int 6)
-                             (const_int 10))
+                         (symbol_ref "INSN"))
+                     (eq (symbol_ref "INSN_CODE (prev_nonnote_insn (insn))")
+                         (symbol_ref "code_for_indirect_jump_scratch")))
+                (cond [(eq_attr "braf_branch_p" "yes")
+                       (const_int 6)
+                       (eq (symbol_ref "flag_pic") (const_int 0))
+                       (const_int 10)
+                       (ne (symbol_ref "TARGET_SH2") (const_int 0))
+                       (const_int 10)] (const_int 18))
                 (eq_attr "braf_branch_p" "yes")
                 (const_int 10)
  ;; ??? using pc is not computed transitively.
@@ -422,134 +428,13 @@
                          (const_int 4)
                          (const_int 2))))
  
-;; Load and store instructions save a cycle if they are aligned on a
-;; four byte boundary.  Using a function unit for stores encourages
-;; gcc to separate load and store instructions by one instruction,
-;; which makes it more likely that the linker will be able to word
-;; align them when relaxing.
-
-;; SH-1 scheduling
-
-(define_automaton "sh1")
-(define_cpu_unit "sh1memory,sh1int,sh1mpy,sh1fp" "sh1")
-
-;; Loads have a latency of two.
-;; However, call insns can have a delay slot, so that we want one more
-;; insn to be scheduled between the load of the function address and the call.
-;; This is equivalent to a latency of three.
-;; ADJUST_COST can only properly handle reductions of the cost, so we
-;; use a latency of three here.
-;; We only do this for SImode loads of general registers, to make the work
-;; for ADJUST_COST easier.
-(define_insn_reservation "sh1_load_si" 3
-  (and (eq_attr "pipe_model" "sh1")
-       (eq_attr "type" "load_si,pcload_si"))
-  "sh1memory*2")
-
-(define_insn_reservation "sh1_load_store" 2
-  (and (eq_attr "pipe_model" "sh1")
-       (eq_attr "type" "load,pcload,pload,store,pstore"))
-  "sh1memory*2")
-
-(define_insn_reservation "sh1_arith3" 3
-  (and (eq_attr "pipe_model" "sh1")
-       (eq_attr "type" "arith3,arith3b"))
-  "sh1int*3")
-
-(define_insn_reservation "sh1_dyn_shift" 2
-  (and (eq_attr "pipe_model" "sh1")
-       (eq_attr "type" "dyn_shift"))
-  "sh1int*2")
-
-(define_insn_reservation "sh1_int" 1
-  (and (eq_attr "pipe_model" "sh1")
-       (eq_attr "type" "!arith3,arith3b,dyn_shift"))
-  "sh1int")
-
-;; ??? These are approximations.
-(define_insn_reservation "sh1_smpy" 2
-  (and (eq_attr "pipe_model" "sh1")
-       (eq_attr "type" "smpy"))
-  "sh1mpy*2")
-
-(define_insn_reservation "sh1_dmpy" 3
-  (and (eq_attr "pipe_model" "sh1")
-       (eq_attr "type" "dmpy"))
-  "sh1mpy*3")
-
-(define_insn_reservation "sh1_fp" 2
-  (and (eq_attr "pipe_model" "sh1")
-       (eq_attr "type" "fp,fmove"))
-  "sh1fp")
-
-(define_insn_reservation "sh1_fdiv" 13
-  (and (eq_attr "pipe_model" "sh1")
-       (eq_attr "type" "fdiv"))
-  "sh1fp*12")
-
-;; SH-5 SHmedia scheduling
-;; When executing SHmedia code, the SH-5 is a fairly straightforward
-;; single-issue machine.  It has four pipelines, the branch unit (br),
-;; the integer and multimedia unit (imu), the load/store unit (lsu), and
-;; the floating point unit (fpu).
-;;
-;; (define_function_unit {name} {num-units} {n-users} {test}
-;;                       {ready-delay} {issue-delay} [{conflict-list}])
-
-;; Every instruction on SH-5 occupies the issue resource for at least one
-;; cycle.
-(define_function_unit "sh5issue" 1 0
-  (and (eq_attr "pipe_model" "sh5media")
-       (eq_attr "type" "!pt_media,ptabs_media,invalidate_line_media,dmpy_media,load_media,fload_media,fcmp_media,fmove_media,fparith_media,dfparith_media,fpconv_media,dfpconv_media,dfmul_media,store_media,fstore_media,mcmp_media,mac_media,d2mpy_media,atrans_media,ustore_media")) 1 1)
-
-;; Here model the instructions with a latency greater than one cycle.
-
-;; Specify the various types of instruction which have latency > 1
-(define_function_unit "sh5issue" 1 0
-  (and (eq_attr "pipe_model" "sh5media")
-       (eq_attr "type" "mcmp_media")) 2 1)
-
-(define_function_unit "sh5issue" 1 0
-  (and (eq_attr "pipe_model" "sh5media")
-       (eq_attr "type" "dmpy_media,load_media,fcmp_media,mac_media")) 3 1)
-;; but see sh_adjust_cost for mac_media exception.
-
-(define_function_unit "sh5issue" 1 0
-  (and (eq_attr "pipe_model" "sh5media")
-       (eq_attr "type" "fload_media,fmove_media")) 4 1)
-
-(define_function_unit "sh5issue" 1 0
-  (and (eq_attr "pipe_model" "sh5media")
-       (eq_attr "type" "d2mpy_media")) 4 2)
-
-(define_function_unit "sh5issue" 1 0
-  (and (eq_attr "pipe_model" "sh5media")
-       (eq_attr "type" "pt_media,ptabs_media")) 5 1)
-
-(define_function_unit "sh5issue" 1 0
-  (and (eq_attr "pipe_model" "sh5media")
-       (eq_attr "type" "fparith_media,dfparith_media,fpconv_media,dfpconv_media")) 6 1)
-
-(define_function_unit "sh5issue" 1 0
-  (and (eq_attr "pipe_model" "sh5media")
-       (eq_attr "type" "invalidate_line_media")) 7 7)
-
-(define_function_unit "sh5issue" 1 0
-  (and (eq_attr "pipe_model" "sh5media") (eq_attr "type" "dfmul_media")) 9 4)
-
-(define_function_unit "sh5issue" 1 0
-  (and (eq_attr "pipe_model" "sh5media") (eq_attr "type" "atrans_media")) 10 5)
+;; DFA descriptions for the pipelines
  
-;; Floating-point divide and square-root occupy an additional resource,
-;; which is not internally pipelined.  However, other instructions
-;; can continue to issue.
-(define_function_unit "sh5fds" 1 0
-  (and (eq_attr "pipe_model" "sh5media") (eq_attr "type" "fdiv_media"))  19 19)
+(include "sh1.md")
+(include "shmedia.md")
+(include "sh4.md")
  
-(define_function_unit "sh5fds" 1 0
-  (and (eq_attr "pipe_model" "sh5media") (eq_attr "type" "dfdiv_media")) 35 35)
-
-; Definitions for filling branch delay slots.
+;; Definitions for filling delay slots
  
  (define_attr "needs_delay_slot" "yes,no" (const_string "no"))
  
@@ -642,12 +527,16 @@
  ;; Say that we have annulled true branches, since this gives smaller and
  ;; faster code when branches are predicted as not taken.
  
+;; ??? The non-annulled condition should really be "in_delay_slot",
+;; but insns that can be filled in non-annulled get priority over insns
+;; that can only be filled in anulled.
+
  (define_delay
    (and (eq_attr "type" "cbranch")
         (ne (symbol_ref "TARGET_SH2") (const_int 0)))
    ;; SH2e has a hardware bug that pretty much prohibits the use of
    ;; annuled delay slots.
-  [(eq_attr "in_delay_slot" "yes") (and (eq_attr "cond_delay_slot" "yes")
+  [(eq_attr "cond_delay_slot" "yes") (and (eq_attr "cond_delay_slot" "yes")
                                         (not (eq_attr "cpu" "sh2e"))) (nil)])
  \f
  ;; -------------------------------------------------------------------------
@@ -1269,6 +1158,15 @@
    ""
    [(set_attr "length" "0")])
  
+(define_insn "udivsi3_sh2a"
+  [(set (match_operand:SI 0 "arith_reg_operand" "=r")
+       (udiv:SI (match_operand:SI 1 "arith_reg_operand" "0")
+               (match_operand:SI 2 "arith_reg_operand" "z")))]
+  "TARGET_SH2A"
+  "divu        %2,%1"
+  [(set_attr "type" "arith")
+   (set_attr "in_delay_slot" "no")])
+
  ;; We must use a pseudo-reg forced to reg 0 in the SET_DEST rather than
  ;; hard register 0.  If we used hard register 0, then the next instruction
  ;; would be a move from hard register 0 to a pseudo-reg.  If the pseudo-reg
@@ -1405,6 +1303,13 @@
        emit_insn (gen_udivsi3_i4_media (operands[0], operands[1], operands[2]));
        DONE;
      }
+  else if (TARGET_SH2A)
+    {
+      operands[1] = force_reg (SImode, operands[1]);
+      operands[2] = force_reg (SImode, operands[2]);
+      emit_insn (gen_udivsi3_sh2a (operands[0], operands[1], operands[2]));
+      DONE;
+    }
    else if (TARGET_SH5)
      {
        emit_move_insn (operands[3],
@@ -1438,6 +1343,15 @@
    DONE;
  }")
  
+(define_insn "divsi3_sh2a"
+  [(set (match_operand:SI 0 "arith_reg_operand" "=r")
+       (div:SI (match_operand:SI 1 "arith_reg_operand" "0")
+               (match_operand:SI 2 "arith_reg_operand" "z")))]
+  "TARGET_SH2A"
+  "divs        %2,%1"
+  [(set_attr "type" "arith")
+   (set_attr "in_delay_slot" "no")])
+
  (define_insn "divsi3_i1"
    [(set (match_operand:SI 0 "register_operand" "=z")
         (div:SI (reg:SI R4_REG) (reg:SI R5_REG)))
@@ -1545,6 +1459,13 @@
        else
         last = gen_divsi3_i4 (operands[0], operands[3]);
      }
+  else if (TARGET_SH2A)
+    {
+      operands[1] = force_reg (SImode, operands[1]);
+      operands[2] = force_reg (SImode, operands[2]);
+      emit_insn (gen_divsi3_sh2a (operands[0], operands[1], operands[2]));
+      DONE;
+    }
    else if (TARGET_SHMEDIA_FPU)
      {
        operands[1] = force_reg (SImode, operands[1]);
@@ -1703,6 +1624,14 @@
    "TARGET_SH1"
    "")
  
+(define_insn "mul_r"
+  [(set (match_operand:SI 0 "arith_reg_operand" "=r")
+       (mult:SI (match_operand:SI 1 "arith_reg_operand" "0")
+                (match_operand:SI 2 "arith_reg_operand" "z")))]
+  "TARGET_SH2A"
+  "mulr        %2,%0"
+  [(set_attr "type" "dmpy")])
+
  (define_insn "mul_l"
    [(set (reg:SI MACL_REG)
         (mult:SI (match_operand:SI 0 "arith_reg_operand" "r")
@@ -2252,6 +2181,15 @@
  ;;
  ;; shift left
  
+(define_insn "ashlsi3_sh2a"
+  [(set (match_operand:SI 0 "arith_reg_operand" "=r")
+       (ashift:SI (match_operand:SI 1 "arith_reg_operand" "0")
+                  (match_operand:SI 2 "arith_reg_operand" "r")))]
+  "TARGET_SH2A"
+  "shad        %2,%0"
+  [(set_attr "type" "arith")
+   (set_attr "length" "4")])
+
  ;; This pattern is used by init_expmed for computing the costs of shift
  ;; insns.
  
@@ -2387,6 +2325,15 @@
  ; arithmetic shift right
  ;
  
+(define_insn "ashrsi3_sh2a"
+  [(set (match_operand:SI 0 "arith_reg_operand" "=r")
+       (ashiftrt:SI (match_operand:SI 1 "arith_reg_operand" "0")
+                  (neg:SI (match_operand:SI 2 "arith_reg_operand" "r"))))]
+  "TARGET_SH2A"
+  "shad        %2,%0"
+  [(set_attr "type" "dyn_shift")
+   (set_attr "length" "4")])
+
  (define_insn "ashrsi3_k"
    [(set (match_operand:SI 0 "arith_reg_operand" "=r")
         (ashiftrt:SI (match_operand:SI 1 "arith_reg_operand" "0")
@@ -2506,6 +2453,15 @@
  
  ;; logical shift right
  
+(define_insn "lshrsi3_sh2a"
+  [(set (match_operand:SI 0 "arith_reg_operand" "=r")
+       (lshiftrt:SI (match_operand:SI 1 "arith_reg_operand" "0")
+                    (neg:SI (match_operand:SI 2 "arith_reg_operand" "r"))))]
+  "TARGET_SH2A"
+  "shld        %2,%0"
+  [(set_attr "type" "dyn_shift")
+   (set_attr "length" "4")])
+
  (define_insn "lshrsi3_d"
    [(set (match_operand:SI 0 "arith_reg_operand" "=r")
         (lshiftrt:SI (match_operand:SI 1 "arith_reg_operand" "0")
@@ -3425,6 +3381,7 @@
          "Q,rI08,r,mr,x,l,t,r,x,l,r,r,>,>,i"))]
    "TARGET_SH1
     && ! TARGET_SH2E
+   && ! TARGET_SH2A
     && (register_operand (operands[0], SImode)
         || register_operand (operands[1], SImode))"
    "@
@@ -3454,15 +3411,16 @@
  ;; TARGET_FMOVD is in effect, and mode switching is done before reload.
  (define_insn "movsi_ie"
    [(set (match_operand:SI 0 "general_movdst_operand"
-           "=r,r,t,r,r,r,r,m,<,<,x,l,x,l,y,<,r,y,r,*f,y,*f,y")
+           "=r,r,r,t,r,r,r,r,m,<,<,x,l,x,l,y,<,r,y,r,*f,y,*f,y")
         (match_operand:SI 1 "general_movsrc_operand"
-        "Q,rI08,r,mr,x,l,t,r,x,l,r,r,>,>,>,y,i,r,y,y,*f,*f,y"))]
-  "TARGET_SH2E
+        "Q,rI08,I20,r,mr,x,l,t,r,x,l,r,r,>,>,>,y,i,r,y,y,*f,*f,y"))]
+  "(TARGET_SH2E || TARGET_SH2A)
     && (register_operand (operands[0], SImode)
         || register_operand (operands[1], SImode))"
    "@
         mov.l   %1,%0
         mov     %1,%0
+       movi20  %1,%0
         cmp/pl  %1
         mov.l   %1,%0
         sts     %1,%0
@@ -3484,9 +3442,9 @@
         flds    %1,fpul
         fmov    %1,%0
         ! move optimized away"
-  [(set_attr "type" "pcload_si,move,*,load_si,mac_gp,prget,move,store,store,pstore,move,prset,load,pload,load,store,pcload_si,gp_fpul,fpul_gp,fmove,fmove,fmove,nil")
-   (set_attr "late_fp_use" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,yes,*,*,yes,*,*,*,*")
-   (set_attr "length" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,0")])
+  [(set_attr "type" "pcload_si,move,move,*,load_si,mac_gp,prget,move,store,store,pstore,move,prset,load,pload,load,store,pcload_si,gp_fpul,fpul_gp,fmove,fmove,fmove,nil")
+   (set_attr "late_fp_use" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,yes,*,*,yes,*,*,*,*")
+   (set_attr "length" "*,*,4,*,4,*,*,*,4,*,*,*,*,*,*,*,*,*,*,*,*,*,*,0")])
  
  (define_insn "movsi_i_lowpart"
    [(set (strict_low_part (match_operand:SI 0 "general_movdst_operand" "+r,r,r,r,r,r,m,r"))
@@ -3510,7 +3468,7 @@
         (unspec:SI [(match_operand 1 "register_operand" "")] UNSPEC_RA))]
    "TARGET_SH1"
    "#"
-  "&& ! rtx_equal_function_value_matters"
+  "&& ! currently_expanding_to_rtl"
    [(set (match_dup 0) (match_dup 1))]
    "
  {
@@ -3609,6 +3567,11 @@
        emit_insn (gen_ic_invalidate_line_compact (operands[0], operands[1]));
        DONE;
      }
+  else if (TARGET_SH4A_ARCH)
+    {
+      emit_insn (gen_ic_invalidate_line_sh4a (operands[0]));
+      DONE;
+    }
    operands[0] = force_reg (Pmode, operands[0]);
    operands[1] = force_reg (Pmode, GEN_INT (trunc_int_for_mode (0xf0000008,
                                                                Pmode)));
@@ -3629,6 +3592,14 @@
    [(set_attr "length" "8")
     (set_attr "type" "cwb")])
  
+(define_insn "ic_invalidate_line_sh4a"
+  [(unspec_volatile [(match_operand:SI 0 "register_operand" "r")]
+                   UNSPEC_ICACHE)]
+  "TARGET_SH4A_ARCH"
+  "ocbwb\\t@%0\;synco\;icbi\\t@%0"
+  [(set_attr "length" "16")
+   (set_attr "type" "cwb")])
+
  ;; ??? could make arg 0 an offsettable memory operand to allow to save
  ;; an add in the code that calculates the address.
  (define_insn "ic_invalidate_line_media"
@@ -4068,7 +4039,8 @@
             }
           /* Try movi / mshflo.l  */
           val2 = (HOST_WIDE_INT) val >> 32;
-         if (val2 == trunc_int_for_mode (val, SImode))
+         if (val2 == ((unsigned HOST_WIDE_INT)
+                       trunc_int_for_mode (val, SImode)))
             {
               operands[1] = gen_mshflo_l_di (operands[0], operands[0],
                                              operands[0]);
@@ -4213,7 +4185,7 @@
    [(set (match_operand:DF 0 "general_movdst_operand" "=r,r,r,m")
         (match_operand:DF 1 "general_movsrc_operand" "r,FQ,m,r"))]
    "TARGET_SH1
-   && (! TARGET_SH4 || reload_completed
+   && (! (TARGET_SH4 || TARGET_SH2A_DOUBLE) || reload_completed
         /* ??? We provide some insn so that direct_{load,store}[DFmode] get set */
         || (GET_CODE (operands[0]) == REG && REGNO (operands[0]) == 3)
         || (GET_CODE (operands[1]) == REG && REGNO (operands[1]) == 3))
@@ -4234,7 +4206,7 @@
         (match_operand:DF 1 "general_movsrc_operand" "d,r,F,m,d,FQ,m,r,d,r"))
     (use (match_operand:PSI 2 "fpscr_operand" "c,c,c,c,c,c,c,c,c,c"))
     (clobber (match_scratch:SI 3 "=X,X,&z,X,X,X,X,X,X,X"))]
-  "TARGET_SH4
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE)
     && (arith_reg_operand (operands[0], DFmode)
         || arith_reg_operand (operands[1], DFmode))"
    "@
@@ -4252,8 +4224,8 @@
       [(if_then_else (eq_attr "fmovd" "yes") (const_int 2) (const_int 4))
        (const_int 4)
        (if_then_else (eq_attr "fmovd" "yes") (const_int 4) (const_int 6))
-      (if_then_else (eq_attr "fmovd" "yes") (const_int 2) (const_int 6))
-      (if_then_else (eq_attr "fmovd" "yes") (const_int 2) (const_int 6))
+      (if_then_else (eq_attr "fmovd" "yes") (const_int 4) (const_int 6))
+      (if_then_else (eq_attr "fmovd" "yes") (const_int 4) (const_int 6))
        (const_int 4)
        (const_int 8) (const_int 8) ;; these need only 8 bytes for @(r0,rn)
        ;; We can't use 4-byte push/pop on SHcompact, so we have to
@@ -4282,7 +4254,7 @@
         (match_operand:DF 1 "register_operand" ""))
     (use (match_operand:PSI 2 "fpscr_operand" ""))
     (clobber (match_scratch:SI 3 "=X"))]
-  "TARGET_SH4 && reload_completed
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE) && reload_completed
     && (true_regnum (operands[0]) < 16) != (true_regnum (operands[1]) < 16)"
    [(const_int 0)]
    "
@@ -4321,7 +4293,7 @@
         (match_operand:DF 1 "general_movsrc_operand"  ""))
     (use (match_operand:PSI 2 "fpscr_operand" ""))
     (clobber (match_scratch:SI 3 ""))]
-  "TARGET_SH4
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE)
     && reload_completed
     && true_regnum (operands[0]) < 16
     && true_regnum (operands[1]) < 16"
@@ -4394,7 +4366,7 @@
         (match_operand:DF 1 "memory_operand"  ""))
     (use (match_operand:PSI 2 "fpscr_operand" ""))
     (clobber (reg:SI R0_REG))]
-  "TARGET_SH4 && reload_completed"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE) && reload_completed"
    [(parallel [(set (match_dup 0) (match_dup 1))
               (use (match_dup 2))
               (clobber (scratch:SI))])]
@@ -4451,7 +4423,7 @@
         (mem:DF (match_operand:SI 1 "register_operand" "")))
     (use (match_operand:PSI 2 "fpscr_operand" ""))
     (clobber (match_scratch:SI 3 ""))]
-  "TARGET_SH4 && ! TARGET_FMOVD && reload_completed
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE) && ! TARGET_FMOVD && reload_completed
     && FP_OR_XD_REGISTER_P (true_regnum (operands[0]))
     && find_regno_note (insn, REG_DEAD, true_regnum (operands[1]))"
    [(const_int 0)]
@@ -4477,7 +4449,7 @@
         (match_operand:DF 1 "memory_operand" ""))
     (use (match_operand:PSI 2 "fpscr_operand" ""))
     (clobber (match_scratch:SI 3 ""))]
-  "TARGET_SH4 && ! TARGET_FMOVD && reload_completed
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE) && ! TARGET_FMOVD && reload_completed
     && FP_OR_XD_REGISTER_P (true_regnum (operands[0]))"
    [(const_int 0)]
    "
@@ -4518,7 +4490,7 @@
         (match_operand:DF 1 "register_operand" ""))
     (use (match_operand:PSI 2 "fpscr_operand" ""))
     (clobber (match_scratch:SI 3 ""))]
-  "TARGET_SH4 && ! TARGET_FMOVD && reload_completed
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE) && ! TARGET_FMOVD && reload_completed
     && FP_OR_XD_REGISTER_P (true_regnum (operands[1]))"
    [(const_int 0)]
    "
@@ -4672,7 +4644,7 @@
         emit_insn (gen_movdf_media_nofpu (operands[0], operands[1]));
        DONE;
      }
-  if (TARGET_SH4)
+  if (TARGET_SH4 || TARGET_SH2A_DOUBLE)
      {
        emit_df_insn (gen_movdf_i4 (operands[0], operands[1], get_fpscr_rtx ()));
        DONE;
@@ -4969,7 +4941,7 @@
         ! move optimized away"
    [(set_attr "type" "fmove,move,fmove,fmove,pcfload,fload,store,pcload,load,store,fmove,fmove,load,*,fpul_gp,gp_fpul,store,load,nil")
     (set_attr "late_fp_use" "*,*,*,*,*,*,yes,*,*,*,*,*,*,*,yes,*,yes,*,*")
-   (set_attr "length" "*,*,*,*,4,*,*,*,*,*,2,2,2,4,2,2,2,2,0")
+   (set_attr "length" "*,*,*,*,4,4,4,*,*,*,2,2,2,4,2,2,2,2,0")
     (set (attr "fp_mode") (if_then_else (eq_attr "fmovd" "yes")
                                            (const_string "single")
                                            (const_string "none")))])
@@ -5830,11 +5802,19 @@
               if (GET_MODE (operands[0]) != DImode)
                 operands[0] = gen_rtx_SUBREG (DImode, operands[0], 0);
             }
-         else
+         else if (TARGET_SHMEDIA64)
             {
               operands[0] = shallow_copy_rtx (operands[0]);
               PUT_MODE (operands[0], DImode);
             }
+         else
+           {
+             rtx reg = gen_reg_rtx (DImode);
+
+             operands[0] = copy_to_mode_reg (SImode, operands[0]);
+             emit_insn (gen_extendsidi2 (reg, operands[0]));
+             operands[0] = reg;
+           }
         }
        if (! target_reg_operand (operands[0], DImode))
         operands[0] = copy_to_mode_reg (DImode, operands[0]);
@@ -6055,11 +6035,19 @@
               if (GET_MODE (operands[1]) != DImode)
                 operands[1] = gen_rtx_SUBREG (DImode, operands[1], 0);
             }
-         else
+         else if (TARGET_SHMEDIA64)
             {
               operands[1] = shallow_copy_rtx (operands[1]);
               PUT_MODE (operands[1], DImode);
             }
+         else
+           {
+             rtx reg = gen_reg_rtx (DImode);
+
+             operands[1] = copy_to_mode_reg (SImode, operands[1]);
+             emit_insn (gen_extendsidi2 (reg, operands[1]));
+             operands[1] = reg;
+           }
         }
        if (! target_reg_operand (operands[1], DImode))
         operands[1] = copy_to_mode_reg (DImode, operands[1]);
@@ -6577,7 +6565,7 @@
                     (const_int 0))
               (match_operand 1 "" "")
               (match_operand 2 "" "")])]
-  "TARGET_SH2E || TARGET_SHMEDIA"
+  "(TARGET_SH2E || TARGET_SH2A) || TARGET_SHMEDIA"
    "
  {
    int i;
@@ -6799,7 +6787,7 @@
    PUT_MODE (gotsym, Pmode);
    insn = emit_insn (gen_symGOT_load (operands[0], gotsym));
  
-  RTX_UNCHANGING_P (SET_SRC (PATTERN (insn))) = 1;
+  MEM_READONLY_P (SET_SRC (PATTERN (insn))) = 1;
  
    DONE;
  }")
@@ -6883,8 +6871,8 @@
  
  (define_insn "tls_global_dynamic"
    [(set (match_operand:SI 0 "register_operand" "=&z")
-       (call (unspec:SI [(match_operand:SI 1 "" "")]
-                         UNSPEC_TLSGD)
+       (call (mem:SI (unspec:SI [(match_operand:SI 1 "" "")]
+                                 UNSPEC_TLSGD))
               (const_int 0)))
     (use (reg:PSI FPSCR_REG))
     (use (reg:SI PIC_REG))
@@ -6912,8 +6900,8 @@ mov.l\\t1f,r4\\n\\
  
  (define_insn "tls_local_dynamic"
    [(set (match_operand:SI 0 "register_operand" "=&z")
-       (call (unspec:SI [(match_operand:SI 1 "" "")]
-                         UNSPEC_TLSLDM)
+       (call (mem:SI (unspec:SI [(match_operand:SI 1 "" "")]
+                                 UNSPEC_TLSLDM))
               (const_int 0)))
     (use (reg:PSI FPSCR_REG))
     (use (reg:SI PIC_REG))
@@ -7171,7 +7159,7 @@ mov.l\\t1f,r0\\n\\
    "*
  {
    rtx diff_vec = PATTERN (next_real_insn (operands[2]));
-  char *load;
+  const char *load;
  
    if (GET_CODE (diff_vec) != ADDR_DIFF_VEC)
      abort ();
@@ -7344,7 +7332,7 @@ mov.l\\t1f,r0\\n\\
      {
        rtx r18 = gen_rtx_REG (DImode, PR_MEDIA_REG);
  
-      if (! call_used_regs[TR0_REG] || fixed_regs[TR0_REG])
+      if (! call_really_used_regs[TR0_REG] || fixed_regs[TR0_REG])
         abort ();
        tr_regno = TR0_REG;
        tr = gen_rtx_REG (DImode, tr_regno);
@@ -7415,7 +7403,7 @@ mov.l\\t1f,r0\\n\\
    [(use (match_operand 0 "register_operand" ""))]
    ""
  {
-  rtx tmp, ra = operands[0];
+  rtx ra = operands[0];
  
    if (TARGET_SHMEDIA64)
      emit_insn (gen_eh_set_ra_di (ra));
@@ -7513,7 +7501,7 @@ mov.l\\t1f,r0\\n\\
      }
    if (sh_expand_t_scc (EQ, operands[0]))
      DONE;
-  if (! rtx_equal_function_value_matters)
+  if (! currently_expanding_to_rtl)
      FAIL;
    operands[1] = prepare_scc_operands (EQ);
  }")
@@ -7561,7 +7549,7 @@ mov.l\\t1f,r0\\n\\
         }
        DONE;
      }
-  if (! rtx_equal_function_value_matters)
+  if (! currently_expanding_to_rtl)
      FAIL;
    operands[1] = prepare_scc_operands (LT);
  }")
@@ -7665,7 +7653,7 @@ mov.l\\t1f,r0\\n\\
         }
        DONE;
      }
-  if (! rtx_equal_function_value_matters)
+  if (! currently_expanding_to_rtl)
      FAIL;
    operands[1] = prepare_scc_operands (GT);
  }")
@@ -7719,7 +7707,7 @@ mov.l\\t1f,r0\\n\\
        DONE;
      }
  
-  if (! rtx_equal_function_value_matters)
+  if (! currently_expanding_to_rtl)
      FAIL;
    if (GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT)
      {
@@ -7760,7 +7748,7 @@ mov.l\\t1f,r0\\n\\
                                      sh_compare_op0, sh_compare_op1));
        DONE;
      }
-  if (! rtx_equal_function_value_matters)
+  if (! currently_expanding_to_rtl)
      FAIL;
    operands[1] = prepare_scc_operands (GTU);
  }")
@@ -7786,7 +7774,7 @@ mov.l\\t1f,r0\\n\\
                                      sh_compare_op1, sh_compare_op0));
        DONE;
      }
-  if (! rtx_equal_function_value_matters)
+  if (! currently_expanding_to_rtl)
      FAIL;
    operands[1] = prepare_scc_operands (LTU);
  }")
@@ -7817,7 +7805,7 @@ mov.l\\t1f,r0\\n\\
  
        DONE;
      }
-  if (! rtx_equal_function_value_matters)
+  if (! currently_expanding_to_rtl)
      FAIL;
    operands[1] = prepare_scc_operands (LEU);
  }")
@@ -7849,7 +7837,7 @@ mov.l\\t1f,r0\\n\\
        DONE;
      }
  
-  if (! rtx_equal_function_value_matters)
+  if (! currently_expanding_to_rtl)
      FAIL;
    operands[1] = prepare_scc_operands (GEU);
  }")
@@ -7900,7 +7888,7 @@ mov.l\\t1f,r0\\n\\
  
    if (sh_expand_t_scc (NE, operands[0]))
      DONE;
-  if (! rtx_equal_function_value_matters)
+  if (! currently_expanding_to_rtl)
      FAIL;
    operands[1] = prepare_scc_operands (EQ);
    operands[2] = gen_reg_rtx (SImode);
@@ -8173,7 +8161,7 @@ mov.l\\t1f,r0\\n\\
  (define_expand "fpu_switch0"
    [(set (match_operand:SI 0 "" "") (match_dup 2))
     (set (match_dup 1) (mem:PSI (match_dup 0)))]
-  "TARGET_SH4"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE)"
    "
  {
    operands[1] = get_fpscr_rtx ();
@@ -8187,7 +8175,7 @@ mov.l\\t1f,r0\\n\\
    [(set (match_operand:SI 0 "" "") (match_dup 2))
     (set (match_dup 3) (plus:SI (match_dup 0) (const_int 4)))
     (set (match_dup 1) (mem:PSI (match_dup 3)))]
-  "TARGET_SH4"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE)"
    "
  {
    operands[1] = get_fpscr_rtx ();
@@ -8201,7 +8189,7 @@ mov.l\\t1f,r0\\n\\
  (define_expand "movpsi"
    [(set (match_operand:PSI 0 "register_operand" "")
         (match_operand:PSI 1 "general_movsrc_operand" ""))]
-  "TARGET_SH4"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE)"
    "")
  
  ;; The c / m alternative is a fake to guide reload to load directly into
@@ -8235,7 +8223,7 @@ mov.l\\t1f,r0\\n\\
  (define_split
    [(set (reg:PSI FPSCR_REG)
         (mem:PSI (match_operand:SI 0 "register_operand" "")))]
-  "TARGET_SH4 && find_regno_note (insn, REG_DEAD, true_regnum (operands[0]))"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE) && find_regno_note (insn, REG_DEAD, true_regnum (operands[0]))"
    [(set (match_dup 0) (match_dup 0))]
    "
  {
@@ -8249,7 +8237,7 @@ mov.l\\t1f,r0\\n\\
  (define_split
    [(set (reg:PSI FPSCR_REG)
         (mem:PSI (match_operand:SI 0 "register_operand" "")))]
-  "TARGET_SH4"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE)"
    [(set (match_dup 0) (plus:SI (match_dup 0) (const_int -4)))]
    "
  {
@@ -8269,9 +8257,21 @@ mov.l\\t1f,r0\\n\\
  (define_insn "toggle_sz"
    [(set (reg:PSI FPSCR_REG)
         (xor:PSI (reg:PSI FPSCR_REG) (const_int 1048576)))]
-  "TARGET_SH4"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE)"
    "fschg"
-  [(set_attr "fp_set" "unknown")])
+  [(set_attr "type" "fp") (set_attr "fp_set" "unknown")])
+
+;; There's no way we can use it today, since optimize mode switching
+;; doesn't enable us to know from which mode we're switching to the
+;; mode it requests, to tell whether we can use a relative mode switch
+;; (like toggle_pr) or an absolute switch (like loading fpscr from
+;; memory).
+(define_insn "toggle_pr"
+  [(set (reg:PSI FPSCR_REG)
+       (xor:PSI (reg:PSI FPSCR_REG) (const_int 524288)))]
+  "TARGET_SH4A_FP && ! TARGET_FPU_SINGLE"
+  "fpchg"
+  [(set_attr "type" "fp")])
  
  (define_expand "addsf3"
    [(set (match_operand:SF 0 "arith_reg_operand" "")
@@ -8415,7 +8415,7 @@ mov.l\\t1f,r0\\n\\
    "TARGET_SH2E || TARGET_SHMEDIA_FPU"
    "
  {
-  if (TARGET_SH4)
+  if (TARGET_SH4 || TARGET_SH2A_SINGLE)
      expand_sf_binop (&gen_mulsf3_i4, operands);
    else if (TARGET_SH2E)
      emit_insn (gen_mulsf3_ie (operands[0], operands[1], operands[2]));
@@ -8445,7 +8445,7 @@ mov.l\\t1f,r0\\n\\
    [(set (match_operand:SF 0 "fp_arith_reg_operand" "=f")
         (mult:SF (match_operand:SF 1 "fp_arith_reg_operand" "%0")
                  (match_operand:SF 2 "fp_arith_reg_operand" "f")))]
-  "TARGET_SH2E && ! TARGET_SH4"
+  "TARGET_SH2E && ! (TARGET_SH4 || TARGET_SH2A_SINGLE)"
    "fmul        %2,%0"
    [(set_attr "type" "fp")])
  
@@ -8514,7 +8514,7 @@ mov.l\\t1f,r0\\n\\
    "TARGET_SH2E || TARGET_SHMEDIA_FPU"
    "
  {
-  if (TARGET_SH4)
+  if (TARGET_SH4 || TARGET_SH2A_SINGLE)
      {
        emit_sf_insn (gen_floatsisf2_i4 (operands[0], operands[1], get_fpscr_rtx ()));
        DONE;
@@ -8532,7 +8532,7 @@ mov.l\\t1f,r0\\n\\
    [(set (match_operand:SF 0 "fp_arith_reg_operand" "=f")
         (float:SF (match_operand:SI 1 "fpul_operand" "y")))
     (use (match_operand:PSI 2 "fpscr_operand" "c"))]
-  "TARGET_SH4"
+  "(TARGET_SH4 || TARGET_SH2A_SINGLE)"
    "float       %1,%0"
    [(set_attr "type" "fp")
     (set_attr "fp_mode" "single")])
@@ -8540,7 +8540,7 @@ mov.l\\t1f,r0\\n\\
  (define_insn "*floatsisf2_ie"
    [(set (match_operand:SF 0 "fp_arith_reg_operand" "=f")
         (float:SF (match_operand:SI 1 "fpul_operand" "y")))]
-  "TARGET_SH2E && ! TARGET_SH4"
+  "TARGET_SH2E && ! (TARGET_SH4 || TARGET_SH2A_SINGLE)"
    "float       %1,%0"
    [(set_attr "type" "fp")])
  
@@ -8557,7 +8557,7 @@ mov.l\\t1f,r0\\n\\
    "TARGET_SH2E || TARGET_SHMEDIA_FPU"
    "
  {
-  if (TARGET_SH4)
+  if (TARGET_SH4 || TARGET_SH2A_SINGLE)
      {
        emit_sf_insn (gen_fix_truncsfsi2_i4 (operands[0], operands[1], get_fpscr_rtx ()));
        DONE;
@@ -8575,7 +8575,7 @@ mov.l\\t1f,r0\\n\\
    [(set (match_operand:SI 0 "fpul_operand" "=y")
         (fix:SI (match_operand:SF 1 "fp_arith_reg_operand" "f")))
     (use (match_operand:PSI 2 "fpscr_operand" "c"))]
-  "TARGET_SH4"
+  "(TARGET_SH4 || TARGET_SH2A_SINGLE)"
    "ftrc        %1,%0"
    [(set_attr "type" "ftrc_s")
     (set_attr "fp_mode" "single")])
@@ -8605,7 +8605,7 @@ mov.l\\t1f,r0\\n\\
  (define_insn "*fixsfsi"
    [(set (match_operand:SI 0 "fpul_operand" "=y")
         (fix:SI (match_operand:SF 1 "fp_arith_reg_operand" "f")))]
-  "TARGET_SH2E && ! TARGET_SH4"
+  "TARGET_SH2E && ! (TARGET_SH4 || TARGET_SH2A_SINGLE)"
    "ftrc        %1,%0"
    [(set_attr "type" "fp")])
  
@@ -8613,7 +8613,7 @@ mov.l\\t1f,r0\\n\\
    [(set (reg:SI T_REG)
         (gt:SI (match_operand:SF 0 "fp_arith_reg_operand" "f")
                (match_operand:SF 1 "fp_arith_reg_operand" "f")))]
-  "TARGET_SH2E && ! TARGET_SH4"
+  "TARGET_SH2E && ! (TARGET_SH4 || TARGET_SH2A_SINGLE)"
    "fcmp/gt     %1,%0"
    [(set_attr "type" "fp")
     (set_attr "fp_mode" "single")])
@@ -8622,7 +8622,7 @@ mov.l\\t1f,r0\\n\\
    [(set (reg:SI T_REG)
         (eq:SI (match_operand:SF 0 "fp_arith_reg_operand" "f")
                (match_operand:SF 1 "fp_arith_reg_operand" "f")))]
-  "TARGET_SH2E && ! TARGET_SH4"
+  "TARGET_SH2E && ! (TARGET_SH4 || TARGET_SH2A_SINGLE)"
    "fcmp/eq     %1,%0"
    [(set_attr "type" "fp")
     (set_attr "fp_mode" "single")])
@@ -8632,7 +8632,7 @@ mov.l\\t1f,r0\\n\\
         (ior:SI (reg:SI T_REG)
                 (eq:SI (match_operand:SF 0 "fp_arith_reg_operand" "f")
                        (match_operand:SF 1 "fp_arith_reg_operand" "f"))))]
-  "TARGET_SH2E && TARGET_IEEE && ! TARGET_SH4"
+  "TARGET_SH2E && TARGET_IEEE && ! (TARGET_SH4 || TARGET_SH2A_SINGLE)"
    "* return output_ieee_ccmpeq (insn, operands);"
    [(set_attr "length" "4")])
  
@@ -8642,7 +8642,7 @@ mov.l\\t1f,r0\\n\\
         (gt:SI (match_operand:SF 0 "fp_arith_reg_operand" "f")
                (match_operand:SF 1 "fp_arith_reg_operand" "f")))
     (use (match_operand:PSI 2 "fpscr_operand" "c"))]
-  "TARGET_SH4"
+  "(TARGET_SH4 || TARGET_SH2A_SINGLE)"
    "fcmp/gt     %1,%0"
    [(set_attr "type" "fp")
     (set_attr "fp_mode" "single")])
@@ -8652,7 +8652,7 @@ mov.l\\t1f,r0\\n\\
         (eq:SI (match_operand:SF 0 "fp_arith_reg_operand" "f")
                (match_operand:SF 1 "fp_arith_reg_operand" "f")))
     (use (match_operand:PSI 2 "fpscr_operand" "c"))]
-  "TARGET_SH4"
+  "(TARGET_SH4 || TARGET_SH2A_SINGLE)"
    "fcmp/eq     %1,%0"
    [(set_attr "type" "fp")
     (set_attr "fp_mode" "single")])
@@ -8663,7 +8663,7 @@ mov.l\\t1f,r0\\n\\
                 (eq:SI (match_operand:SF 0 "fp_arith_reg_operand" "f")
                        (match_operand:SF 1 "fp_arith_reg_operand" "f"))))
     (use (match_operand:PSI 2 "fpscr_operand" "c"))]
-  "TARGET_IEEE && TARGET_SH4"
+  "TARGET_IEEE && (TARGET_SH4 || TARGET_SH2A_SINGLE)"
    "* return output_ieee_ccmpeq (insn, operands);"
    [(set_attr "length" "4")
     (set_attr "fp_mode" "single")])
@@ -8770,6 +8770,117 @@ mov.l\\t1f,r0\\n\\
    [(set_attr "type" "fdiv")
     (set_attr "fp_mode" "single")])
  
+(define_insn "rsqrtsf2"
+  [(set (match_operand:SF 0 "register_operand" "=f")
+       (div:SF (match_operand:SF 1 "immediate_operand" "i")
+               (sqrt:SF (match_operand:SF 2 "register_operand" "0"))))
+   (use (match_operand:PSI 3 "fpscr_operand" "c"))]
+  "TARGET_SH4A_FP && flag_unsafe_math_optimizations
+   && operands[1] == CONST1_RTX (SFmode)"
+  "fsrra       %0"
+  [(set_attr "type" "fsrra")
+   (set_attr "fp_mode" "single")])
+
+(define_insn "fsca"
+  [(set (match_operand:V2SF 0 "fp_arith_reg_operand" "=f")
+       (vec_concat:V2SF
+        (unspec:SF [(mult:SF
+                     (float:SF (match_operand:SI 1 "fpul_operand" "y"))
+                     (match_operand:SF 2 "immediate_operand" "i"))
+                   ] UNSPEC_FSINA)
+        (unspec:SF [(mult:SF (float:SF (match_dup 1)) (match_dup 2))
+                   ] UNSPEC_FCOSA)))
+   (use (match_operand:PSI 3 "fpscr_operand" "c"))]
+  "TARGET_SH4A_FP && flag_unsafe_math_optimizations
+   && operands[2] == sh_fsca_int2sf ()"
+  "fsca        fpul,%d0"
+  [(set_attr "type" "fsca")
+   (set_attr "fp_mode" "single")])
+
+(define_expand "sinsf2"
+  [(set (match_operand:SF 0 "nonimmediate_operand" "")
+       (unspec:SF [(match_operand:SF 1 "fp_arith_reg_operand" "")]
+                  UNSPEC_FSINA))]
+  "TARGET_SH4A_FP && flag_unsafe_math_optimizations"
+  "
+{
+  rtx scaled = gen_reg_rtx (SFmode);
+  rtx truncated = gen_reg_rtx (SImode);
+  rtx fsca = gen_reg_rtx (V2SFmode);
+  rtx scale_reg = force_reg (SFmode, sh_fsca_sf2int ());
+
+  emit_sf_insn (gen_mulsf3 (scaled, operands[1], scale_reg));
+  emit_sf_insn (gen_fix_truncsfsi2 (truncated, scaled));
+  emit_sf_insn (gen_fsca (fsca, truncated, sh_fsca_int2sf (),
+                         get_fpscr_rtx ()));
+  emit_move_insn (operands[0], gen_rtx_SUBREG (SFmode, fsca, 0));
+  DONE;
+}")
+
+(define_expand "cossf2"
+  [(set (match_operand:SF 0 "nonimmediate_operand" "")
+       (unspec:SF [(match_operand:SF 1 "fp_arith_reg_operand" "")]
+                  UNSPEC_FCOSA))]
+  "TARGET_SH4A_FP && flag_unsafe_math_optimizations"
+  "
+{
+  rtx scaled = gen_reg_rtx (SFmode);
+  rtx truncated = gen_reg_rtx (SImode);
+  rtx fsca = gen_reg_rtx (V2SFmode);
+  rtx scale_reg = force_reg (SFmode, sh_fsca_sf2int ());
+
+  emit_sf_insn (gen_mulsf3 (scaled, operands[1], scale_reg));
+  emit_sf_insn (gen_fix_truncsfsi2 (truncated, scaled));
+  emit_sf_insn (gen_fsca (fsca, truncated, sh_fsca_int2sf (),
+                         get_fpscr_rtx ()));
+  emit_move_insn (operands[0], gen_rtx_SUBREG (SFmode, fsca, 4));
+  DONE;
+}")
+
+(define_expand "sindf2"
+  [(set (match_operand:DF 0 "fp_arith_reg_operand" "")
+       (unspec:DF [(match_operand:DF 1 "fp_arith_reg_operand" "")]
+                  UNSPEC_FSINA))]
+  "TARGET_SH4A_FP && ! TARGET_FPU_SINGLE && flag_unsafe_math_optimizations"
+  "
+{
+  rtx scaled = gen_reg_rtx (DFmode);
+  rtx truncated = gen_reg_rtx (SImode);
+  rtx fsca = gen_reg_rtx (V2SFmode);
+  rtx scale_reg = force_reg (DFmode, sh_fsca_df2int ());
+  rtx sfresult = gen_reg_rtx (SFmode);
+
+  emit_df_insn (gen_muldf3 (scaled, operands[1], scale_reg));
+  emit_df_insn (gen_fix_truncdfsi2 (truncated, scaled));
+  emit_sf_insn (gen_fsca (fsca, truncated, sh_fsca_int2sf (),
+                         get_fpscr_rtx ()));
+  emit_move_insn (sfresult, gen_rtx_SUBREG (SFmode, fsca, 0));
+  emit_df_insn (gen_extendsfdf2 (operands[0], sfresult));
+  DONE;
+}")
+
+(define_expand "cosdf2"
+  [(set (match_operand:DF 0 "fp_arith_reg_operand" "")
+       (unspec:DF [(match_operand:DF 1 "fp_arith_reg_operand" "")]
+                  UNSPEC_FCOSA))]
+  "TARGET_SH4A_FP && ! TARGET_FPU_SINGLE && flag_unsafe_math_optimizations"
+  "
+{
+  rtx scaled = gen_reg_rtx (DFmode);
+  rtx truncated = gen_reg_rtx (SImode);
+  rtx fsca = gen_reg_rtx (V2SFmode);
+  rtx scale_reg = force_reg (DFmode, sh_fsca_df2int ());
+  rtx sfresult = gen_reg_rtx (SFmode);
+
+  emit_df_insn (gen_muldf3 (scaled, operands[1], scale_reg));
+  emit_df_insn (gen_fix_truncdfsi2 (truncated, scaled));
+  emit_sf_insn (gen_fsca (fsca, truncated, sh_fsca_int2sf (),
+                         get_fpscr_rtx ()));
+  emit_move_insn (sfresult, gen_rtx_SUBREG (SFmode, fsca, 4));
+  emit_df_insn (gen_extendsfdf2 (operands[0], sfresult));
+  DONE;
+}")
+
  (define_expand "abssf2"
    [(set (match_operand:SF 0 "fp_arith_reg_operand" "")
         (abs:SF (match_operand:SF 1 "fp_arith_reg_operand" "")))]
@@ -8803,10 +8914,10 @@ mov.l\\t1f,r0\\n\\
    [(set (match_operand:DF 0 "fp_arith_reg_operand" "")
         (plus:DF (match_operand:DF 1 "fp_arith_reg_operand" "")
                  (match_operand:DF 2 "fp_arith_reg_operand" "")))]
-  "TARGET_SH4 || TARGET_SHMEDIA_FPU"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE) || TARGET_SHMEDIA_FPU"
    "
  {
-  if (TARGET_SH4)
+  if (TARGET_SH4 || TARGET_SH2A_DOUBLE)
      {
        expand_df_binop (&gen_adddf3_i, operands);
        DONE;
@@ -8826,7 +8937,7 @@ mov.l\\t1f,r0\\n\\
         (plus:DF (match_operand:DF 1 "fp_arith_reg_operand" "%0")
                  (match_operand:DF 2 "fp_arith_reg_operand" "f")))
     (use (match_operand:PSI 3 "fpscr_operand" "c"))]
-  "TARGET_SH4"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE)"
    "fadd        %2,%0"
    [(set_attr "type" "dfp_arith")
     (set_attr "fp_mode" "double")])
@@ -8835,10 +8946,10 @@ mov.l\\t1f,r0\\n\\
    [(set (match_operand:DF 0 "fp_arith_reg_operand" "")
         (minus:DF (match_operand:DF 1 "fp_arith_reg_operand" "")
                   (match_operand:DF 2 "fp_arith_reg_operand" "")))]
-  "TARGET_SH4 || TARGET_SHMEDIA_FPU"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE) || TARGET_SHMEDIA_FPU"
    "
  {
-  if (TARGET_SH4)
+  if (TARGET_SH4 || TARGET_SH2A_DOUBLE)
      {
        expand_df_binop (&gen_subdf3_i, operands);
        DONE;
@@ -8858,7 +8969,7 @@ mov.l\\t1f,r0\\n\\
         (minus:DF (match_operand:DF 1 "fp_arith_reg_operand" "0")
                   (match_operand:DF 2 "fp_arith_reg_operand" "f")))
     (use (match_operand:PSI 3 "fpscr_operand" "c"))]
-  "TARGET_SH4"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE)"
    "fsub        %2,%0"
    [(set_attr "type" "dfp_arith")
     (set_attr "fp_mode" "double")])
@@ -8867,10 +8978,10 @@ mov.l\\t1f,r0\\n\\
    [(set (match_operand:DF 0 "fp_arith_reg_operand" "")
         (mult:DF (match_operand:DF 1 "fp_arith_reg_operand" "")
                  (match_operand:DF 2 "fp_arith_reg_operand" "")))]
-  "TARGET_SH4 || TARGET_SHMEDIA_FPU"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE) || TARGET_SHMEDIA_FPU"
    "
  {
-  if (TARGET_SH4)
+  if (TARGET_SH4 || TARGET_SH2A_DOUBLE)
      {
        expand_df_binop (&gen_muldf3_i, operands);
        DONE;
@@ -8890,7 +9001,7 @@ mov.l\\t1f,r0\\n\\
         (mult:DF (match_operand:DF 1 "fp_arith_reg_operand" "%0")
                  (match_operand:DF 2 "fp_arith_reg_operand" "f")))
     (use (match_operand:PSI 3 "fpscr_operand" "c"))]
-  "TARGET_SH4"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE)"
    "fmul        %2,%0"
    [(set_attr "type" "dfp_arith")
     (set_attr "fp_mode" "double")])
@@ -8899,10 +9010,10 @@ mov.l\\t1f,r0\\n\\
    [(set (match_operand:DF 0 "fp_arith_reg_operand" "")
         (div:DF (match_operand:DF 1 "fp_arith_reg_operand" "")
                 (match_operand:DF 2 "fp_arith_reg_operand" "")))]
-  "TARGET_SH4 || TARGET_SHMEDIA_FPU"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE) || TARGET_SHMEDIA_FPU"
    "
  {
-  if (TARGET_SH4)
+  if (TARGET_SH4 || TARGET_SH2A_DOUBLE)
      {
        expand_df_binop (&gen_divdf3_i, operands);
        DONE;
@@ -8922,7 +9033,7 @@ mov.l\\t1f,r0\\n\\
         (div:DF (match_operand:DF 1 "fp_arith_reg_operand" "0")
                 (match_operand:DF 2 "fp_arith_reg_operand" "f")))
     (use (match_operand:PSI 3 "fpscr_operand" "c"))]
-  "TARGET_SH4"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE)"
    "fdiv        %2,%0"
    [(set_attr "type" "dfdiv")
     (set_attr "fp_mode" "double")])
@@ -8937,10 +9048,10 @@ mov.l\\t1f,r0\\n\\
  (define_expand "floatsidf2"
    [(set (match_operand:DF 0 "fp_arith_reg_operand" "")
         (float:DF (match_operand:SI 1 "fpul_operand" "")))]
-  "TARGET_SH4 || TARGET_SHMEDIA_FPU"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE) || TARGET_SHMEDIA_FPU"
    "
  {
-  if (TARGET_SH4)
+  if (TARGET_SH4 || TARGET_SH2A_DOUBLE)
      {
        emit_df_insn (gen_floatsidf2_i (operands[0], operands[1],
                                       get_fpscr_rtx ()));
@@ -8959,7 +9070,7 @@ mov.l\\t1f,r0\\n\\
    [(set (match_operand:DF 0 "fp_arith_reg_operand" "=f")
         (float:DF (match_operand:SI 1 "fpul_operand" "y")))
     (use (match_operand:PSI 2 "fpscr_operand" "c"))]
-  "TARGET_SH4"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE)"
    "float       %1,%0"
    [(set_attr "type" "dfp_conv")
     (set_attr "fp_mode" "double")])
@@ -8974,10 +9085,10 @@ mov.l\\t1f,r0\\n\\
  (define_expand "fix_truncdfsi2"
    [(set (match_operand:SI 0 "fpul_operand" "")
         (fix:SI (match_operand:DF 1 "fp_arith_reg_operand" "")))]
-  "TARGET_SH4 || TARGET_SHMEDIA_FPU"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE) || TARGET_SHMEDIA_FPU"
    "
  {
-  if (TARGET_SH4)
+  if (TARGET_SH4 || TARGET_SH2A_DOUBLE)
      {
        emit_df_insn (gen_fix_truncdfsi2_i (operands[0], operands[1],
                                           get_fpscr_rtx ()));
@@ -8996,7 +9107,7 @@ mov.l\\t1f,r0\\n\\
    [(set (match_operand:SI 0 "fpul_operand" "=y")
         (fix:SI (match_operand:DF 1 "fp_arith_reg_operand" "f")))
     (use (match_operand:PSI 2 "fpscr_operand" "c"))]
-  "TARGET_SH4"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE)"
    "ftrc        %1,%0"
    [(set_attr "type" "dfp_conv")
     (set_attr "dfp_comp" "no")
@@ -9029,7 +9140,7 @@ mov.l\\t1f,r0\\n\\
         (gt:SI (match_operand:DF 0 "arith_reg_operand" "f")
                (match_operand:DF 1 "arith_reg_operand" "f")))
     (use (match_operand:PSI 2 "fpscr_operand" "c"))]
-  "TARGET_SH4"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE)"
    "fcmp/gt     %1,%0"
    [(set_attr "type" "dfp_cmp")
     (set_attr "fp_mode" "double")])
@@ -9039,7 +9150,7 @@ mov.l\\t1f,r0\\n\\
         (eq:SI (match_operand:DF 0 "arith_reg_operand" "f")
                (match_operand:DF 1 "arith_reg_operand" "f")))
     (use (match_operand:PSI 2 "fpscr_operand" "c"))]
-  "TARGET_SH4"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE)"
    "fcmp/eq     %1,%0"
    [(set_attr "type" "dfp_cmp")
     (set_attr "fp_mode" "double")])
@@ -9050,7 +9161,7 @@ mov.l\\t1f,r0\\n\\
                 (eq:SI (match_operand:DF 0 "arith_reg_operand" "f")
                        (match_operand:DF 1 "arith_reg_operand" "f"))))
     (use (match_operand:PSI 2 "fpscr_operand" "c"))]
-  "TARGET_IEEE && TARGET_SH4"
+  "TARGET_IEEE && (TARGET_SH4 || TARGET_SH2A_DOUBLE)"
    "* return output_ieee_ccmpeq (insn, operands);"
    [(set_attr "length" "4")
     (set_attr "fp_mode" "double")])
@@ -9091,7 +9202,7 @@ mov.l\\t1f,r0\\n\\
    [(set (reg:SI T_REG)
         (compare (match_operand:DF 0 "arith_operand" "")
                  (match_operand:DF 1 "arith_operand" "")))]
-  "TARGET_SH4 || TARGET_SHMEDIA_FPU"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE) || TARGET_SHMEDIA_FPU"
    "
  {
    sh_compare_op0 = operands[0];
@@ -9102,10 +9213,10 @@ mov.l\\t1f,r0\\n\\
  (define_expand "negdf2"
    [(set (match_operand:DF 0 "arith_reg_operand" "")
         (neg:DF (match_operand:DF 1 "arith_reg_operand" "")))]
-  "TARGET_SH4 || TARGET_SHMEDIA_FPU"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE) || TARGET_SHMEDIA_FPU"
    "
  {
-  if (TARGET_SH4)
+  if (TARGET_SH4 || TARGET_SH2A_DOUBLE)
      {
        expand_df_unop (&gen_negdf2_i, operands);
        DONE;
@@ -9123,7 +9234,7 @@ mov.l\\t1f,r0\\n\\
    [(set (match_operand:DF 0 "arith_reg_operand" "=f")
         (neg:DF (match_operand:DF 1 "arith_reg_operand" "0")))
     (use (match_operand:PSI 2 "fpscr_operand" "c"))]
-  "TARGET_SH4"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE)"
    "fneg        %0"
    [(set_attr "type" "fmove")
     (set_attr "fp_mode" "double")])
@@ -9131,10 +9242,10 @@ mov.l\\t1f,r0\\n\\
  (define_expand "sqrtdf2"
    [(set (match_operand:DF 0 "arith_reg_operand" "")
         (sqrt:DF (match_operand:DF 1 "arith_reg_operand" "")))]
-  "TARGET_SH4 || TARGET_SHMEDIA_FPU"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE) || TARGET_SHMEDIA_FPU"
    "
  {
-  if (TARGET_SH4)
+  if (TARGET_SH4 || TARGET_SH2A_DOUBLE)
      {
        expand_df_unop (&gen_sqrtdf2_i, operands);
        DONE;
@@ -9152,7 +9263,7 @@ mov.l\\t1f,r0\\n\\
    [(set (match_operand:DF 0 "arith_reg_operand" "=f")
         (sqrt:DF (match_operand:DF 1 "arith_reg_operand" "0")))
     (use (match_operand:PSI 2 "fpscr_operand" "c"))]
-  "TARGET_SH4"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE)"
    "fsqrt       %0"
    [(set_attr "type" "dfdiv")
     (set_attr "fp_mode" "double")])
@@ -9160,10 +9271,10 @@ mov.l\\t1f,r0\\n\\
  (define_expand "absdf2"
    [(set (match_operand:DF 0 "arith_reg_operand" "")
         (abs:DF (match_operand:DF 1 "arith_reg_operand" "")))]
-  "TARGET_SH4 || TARGET_SHMEDIA_FPU"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE) || TARGET_SHMEDIA_FPU"
    "
  {
-  if (TARGET_SH4)
+  if (TARGET_SH4 || TARGET_SH2A_DOUBLE)
      {
        expand_df_unop (&gen_absdf2_i, operands);
        DONE;
@@ -9181,7 +9292,7 @@ mov.l\\t1f,r0\\n\\
    [(set (match_operand:DF 0 "arith_reg_operand" "=f")
         (abs:DF (match_operand:DF 1 "arith_reg_operand" "0")))
     (use (match_operand:PSI 2 "fpscr_operand" "c"))]
-  "TARGET_SH4"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE)"
    "fabs        %0"
    [(set_attr "type" "fmove")
     (set_attr "fp_mode" "double")])
@@ -9189,10 +9300,10 @@ mov.l\\t1f,r0\\n\\
  (define_expand "extendsfdf2"
    [(set (match_operand:DF 0 "fp_arith_reg_operand" "")
         (float_extend:DF (match_operand:SF 1 "fpul_operand" "")))]
-  "TARGET_SH4 || TARGET_SHMEDIA_FPU"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE) || TARGET_SHMEDIA_FPU"
    "
  {
-  if (TARGET_SH4)
+  if (TARGET_SH4 || TARGET_SH2A_DOUBLE)
      {
        emit_df_insn (gen_extendsfdf2_i4 (operands[0], operands[1],
                                         get_fpscr_rtx ()));
@@ -9211,7 +9322,7 @@ mov.l\\t1f,r0\\n\\
    [(set (match_operand:DF 0 "fp_arith_reg_operand" "=f")
         (float_extend:DF (match_operand:SF 1 "fpul_operand" "y")))
     (use (match_operand:PSI 2 "fpscr_operand" "c"))]
-  "TARGET_SH4"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE)"
    "fcnvsd  %1,%0"
    [(set_attr "type" "fp")
     (set_attr "fp_mode" "double")])
@@ -9219,10 +9330,10 @@ mov.l\\t1f,r0\\n\\
  (define_expand "truncdfsf2"
    [(set (match_operand:SF 0 "fpul_operand" "")
         (float_truncate:SF (match_operand:DF 1 "fp_arith_reg_operand" "")))]
-  "TARGET_SH4 || TARGET_SHMEDIA_FPU"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE) || TARGET_SHMEDIA_FPU"
    "
  {
-  if (TARGET_SH4)
+  if (TARGET_SH4 || TARGET_SH2A_DOUBLE)
      {
        emit_df_insn (gen_truncdfsf2_i4 (operands[0], operands[1],
                                        get_fpscr_rtx ()));
@@ -9241,7 +9352,7 @@ mov.l\\t1f,r0\\n\\
    [(set (match_operand:SF 0 "fpul_operand" "=y")
         (float_truncate:SF (match_operand:DF 1 "fp_arith_reg_operand" "f")))
     (use (match_operand:PSI 2 "fpscr_operand" "c"))]
-  "TARGET_SH4"
+  "(TARGET_SH4 || TARGET_SH2A_DOUBLE)"
    "fcnvds  %1,%0"
    [(set_attr "type" "fp")
     (set_attr "fp_mode" "double")])
@@ -9258,7 +9369,7 @@ mov.l\\t1f,r0\\n\\
    "
  {
    rtx addr_target, orig_address, shift_reg, qi_val;
-  HOST_WIDE_INT bitsize, size, v;
+  HOST_WIDE_INT bitsize, size, v = 0;
    rtx x = operands[3];
  
    /* ??? expmed doesn't care for non-register predicates.  */
@@ -9308,6 +9419,71 @@ mov.l\\t1f,r0\\n\\
  
    DONE;
  }")
+
+(define_insn "movua"
+  [(set (match_operand:SI 0 "register_operand" "=z")
+       (sign_extract:SI (match_operand:SI 1 "unaligned_load_operand" "Sua>")
+                        (const_int 32) (const_int 0)))]
+  "TARGET_SH4A_ARCH"
+  "movua.l     %1,%0"
+  [(set_attr "type" "movua")])
+
+;; We shouldn't need this, but cse replaces increments with references
+;; to other regs before flow has a chance to create post_inc
+;; addressing modes, and only postreload's cse_move2add brings the
+;; increments back to a usable form.
+(define_peephole2
+  [(set (match_operand:SI 0 "register_operand" "")
+       (sign_extract:SI (mem:SI (match_operand:SI 1 "register_operand" ""))
+                        (const_int 32) (const_int 0)))
+   (set (match_dup 1) (plus:SI (match_dup 1) (const_int 4)))]
+  "TARGET_SH4A_ARCH && REGNO (operands[0]) != REGNO (operands[1])"
+  [(set (match_operand:SI 0 "register_operand" "")
+       (sign_extract:SI (mem:SI (post_inc:SI
+                                 (match_operand:SI 1 "register_operand" "")))
+                        (const_int 32) (const_int 0)))]
+  "")
+
+(define_expand "extv"
+  [(set (match_operand:SI 0 "register_operand" "")
+       (sign_extract:SI (match_operand:QI 1 "unaligned_load_operand" "")
+                        (match_operand 2 "const_int_operand" "")
+                        (match_operand 3 "const_int_operand" "")))]
+  "TARGET_SH4A_ARCH"
+{
+  if (TARGET_SH4A_ARCH
+      && INTVAL (operands[2]) == 32
+      && INTVAL (operands[3]) == -24 * (BITS_BIG_ENDIAN != BYTES_BIG_ENDIAN)
+      && GET_CODE (operands[1]) == MEM && MEM_ALIGN (operands[1]) < 32)
+    {
+      emit_insn (gen_movua (operands[0],
+                           adjust_address (operands[1], SImode, 0)));
+      DONE;
+    }
+
+  FAIL;
+})
+
+(define_expand "extzv"
+  [(set (match_operand:SI 0 "register_operand" "")
+       (zero_extract:SI (match_operand:QI 1 "unaligned_load_operand" "")
+                        (match_operand 2 "const_int_operand" "")
+                        (match_operand 3 "const_int_operand" "")))]
+  "TARGET_SH4A_ARCH"
+{
+  if (TARGET_SH4A_ARCH
+      && INTVAL (operands[2]) == 32
+      && INTVAL (operands[3]) == -24 * (BITS_BIG_ENDIAN != BYTES_BIG_ENDIAN)
+      && GET_CODE (operands[1]) == MEM && MEM_ALIGN (operands[1]) < 32)
+    {
+      emit_insn (gen_movua (operands[0],
+                           adjust_address (operands[1], SImode, 0)));
+      DONE;
+    }
+
+  FAIL;
+})
+
  \f
  ;; -------------------------------------------------------------------------
  ;; Peepholes
@@ -9511,7 +9687,7 @@ mov.l\\t1f,r0\\n\\
         (match_operand 1 "sh_rep_vec" ""))]
    "TARGET_SHMEDIA && reload_completed
     && GET_MODE (operands[0]) == GET_MODE (operands[1])
-   && VECTOR_MODE_SUPPORTED_P (GET_MODE (operands[0]))
+   && sh_vector_mode_supported_p (GET_MODE (operands[0]))
     && GET_MODE_SIZE (GET_MODE (operands[0])) == 8
     && (XVECEXP (operands[1], 0, 0) != const0_rtx
         || XVECEXP (operands[1], 0, 1) != const0_rtx)
@@ -9555,7 +9731,7 @@ mov.l\\t1f,r0\\n\\
         (match_operand 1 "sh_const_vec" ""))]
    "TARGET_SHMEDIA && reload_completed
     && GET_MODE (operands[0]) == GET_MODE (operands[1])
-   && VECTOR_MODE_SUPPORTED_P (GET_MODE (operands[0]))
+   && sh_vector_mode_supported_p (GET_MODE (operands[0]))
     && operands[1] != CONST0_RTX (GET_MODE (operands[1]))"
    [(set (match_dup 0) (match_dup 1))]
    "
@@ -10777,7 +10953,7 @@ mov.l\\t1f,r0\\n\\
    "byterev     %1, %0"
    [(set_attr "type" "arith_media")])
  
-(define_insn "prefetch"
+(define_insn "prefetch_media"
    [(prefetch (match_operand:QI 0 "address_operand" "p")
               (match_operand:SI 1 "const_int_operand" "n")
               (match_operand:SI 2 "const_int_operand" "n"))]
@@ -10790,451 +10966,32 @@ mov.l\\t1f,r0\\n\\
  }"
    [(set_attr "type" "other")])
  
-;; The following description  models the
-;; SH4 pipeline using the DFA based scheduler.
-;; The DFA based description is better way to model
-;; a superscalar pipeline as compared to function unit
-;; reservation model.
-;; 1. The function unit based model is oriented to describe at most one
-;;    unit reservation by each insn. It is difficult to model unit reservations in multiple
-;;    pipeline units by same insn. This can be done using DFA based description.
-;; 2. The execution performance of DFA based scheduler does not depend on processor complexity.
-;; 3. Writing all unit reservations for an instruction class is more natural description
-;;    of the pipeline and makes interface of the hazard recognizer simpler than the
-;;    old function unit based model.
-;; 4. The DFA model is richer and is a part of greater overall framework of RCSP.
-
-
-;; Two automata are defined to reduce number of states
-;; which a single large automaton will have.(Factoring)
-
-(define_automaton "inst_pipeline,fpu_pipe")
-
-;; This unit is basically the decode unit of the processor.
-;; Since SH4 is a dual issue machine,it is as if there are two
-;; units so that any insn can be processed by either one
-;; of the decoding unit.
-
-(define_cpu_unit "pipe_01,pipe_02" "inst_pipeline")
-
-
-;; The fixed point arithmetic calculator(?? EX Unit).
-
-(define_cpu_unit  "int" "inst_pipeline")
-
-;; f1_1 and f1_2 are floating point units.Actually there is
-;; a f1 unit which can overlap with other f1 unit but
-;; not another F1 unit.It is as though there were two
-;; f1 units.
-
-(define_cpu_unit "f1_1,f1_2" "fpu_pipe")
-
-;; The floating point units (except FS - F2 always precedes it.)
-
-(define_cpu_unit "F0,F1,F2,F3" "fpu_pipe")
-
-;; This is basically the MA unit of SH4
-;; used in LOAD/STORE pipeline.
-
-(define_cpu_unit "memory" "inst_pipeline")
-
-;; However, there are LS group insns that don't use it, even ones that
-;; complete in 0 cycles.  So we use an extra unit for the issue of LS insns.
-(define_cpu_unit "load_store" "inst_pipeline")
-
-;; The address calculator used for branch instructions.
-;; This will be reserved after "issue" of branch instructions
-;; and this is to make sure that no two branch instructions
-;; can be issued in parallel.
-
-(define_cpu_unit "pcr_addrcalc" "inst_pipeline")
-
-;; ----------------------------------------------------
-;; This reservation is to simplify the dual issue description.
-
-(define_reservation  "issue"  "pipe_01|pipe_02")
-
-;; This is to express the locking of D stage.
-;; Note that the issue of a CO group insn also effectively locks the D stage.
-
-(define_reservation  "d_lock" "pipe_01+pipe_02")
-
-;; Every FE instruction but fipr / ftrv starts with issue and this.
-(define_reservation "F01" "F0+F1")
-
-;; This is to simplify description where F1,F2,FS
-;; are used simultaneously.
-
-(define_reservation "fpu" "F1+F2")
-
-;; This is to highlight the fact that f1
-;; cannot overlap with F1.
-
-(exclusion_set  "f1_1,f1_2" "F1")
-
-(define_insn_reservation "nil" 0 (eq_attr "type" "nil") "nothing")
-
-;; Although reg moves have a latency of zero
-;; we need to highlight that they use D stage
-;; for one cycle.
-
-;; Group:      MT
-
-(define_insn_reservation "reg_mov" 0
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "move"))
-  "issue")
-
-;; Group:      LS
+(define_insn "prefetch_i4"
+  [(prefetch (match_operand:SI 0 "register_operand" "r")
+             (match_operand:SI 1 "const_int_operand" "n")
+             (match_operand:SI 2 "const_int_operand" "n"))]
+  "TARGET_HARD_SH4"
+  "*
+{
+  return \"pref @%0\";
+}"
+  [(set_attr "type" "other")])
  
-(define_insn_reservation "freg_mov" 0
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "fmove"))
-  "issue+load_store")
-
-;; We don't model all pipeline stages; we model the issue ('D') stage
-;; inasmuch as we allow only two instructions to issue simultaneously,
-;; and CO instructions prevent any simultaneous issue of another instruction.
-;; (This uses pipe_01 and pipe_02).
-;; Double issue of EX insns is prevented by using the int unit in the EX stage.
-;; Double issue of EX / BR insns is prevented by using the int unit /
-;; pcr_addrcalc unit in the EX stage.
-;; Double issue of BR / LS instructions is prevented by using the
-;; pcr_addrcalc / load_store unit in the issue cycle.
-;; Double issue of FE instructions is prevented by using F0 in the first
-;; pipeline stage after the first D stage.
-;; There is no need to describe the [ES]X / [MN]A / S stages after a D stage
-;; (except in the cases outlined above), nor to describe the FS stage after
-;; the F2 stage.
-
-;; Other MT  group instructions(1 step operations)
-;; Group:      MT
-;; Latency:    1
-;; Issue Rate:         1
-
-(define_insn_reservation "mt" 1
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "mt_group"))
-  "issue")
-
-;; Fixed Point Arithmetic Instructions(1 step operations)
-;; Group:      EX
-;; Latency:    1
-;; Issue Rate:         1
-
-(define_insn_reservation "sh4_simple_arith" 1
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "insn_class" "ex_group"))
-  "issue,int")
-
-;; Load and store instructions have no alignment peculiarities for the SH4,
-;; but they use the load-store unit, which they share with the fmove type
-;; insns (fldi[01]; fmov frn,frm; flds; fsts; fabs; fneg) .
-;; Loads have a latency of two.
-;; However, call insns can only paired with a preceding insn, and have
-;; a delay slot, so that we want two more insns to be scheduled between the
-;; load of the function address and the call.  This is equivalent to a
-;; latency of three.
-;; ADJUST_COST can only properly handle reductions of the cost, so we
-;; use a latency of three here, which gets multiplied by 10 to yield 30.
-;; We only do this for SImode loads of general registers, to make the work
-;; for ADJUST_COST easier.
-
-;; Load Store instructions. (MOV.[BWL]@(d,GBR)
-;; Group:      LS
-;; Latency:    2
-;; Issue Rate:         1
-
-(define_insn_reservation "sh4_load" 2
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "load,pcload"))
-  "issue+load_store,nothing,memory")
-
-;; calls / sfuncs need an extra instruction for their delay slot.
-;; Moreover, estimating the latency for SImode loads as 3 will also allow
-;; adjust_cost to meaningfully bump it back up to 3 if they load the shift
-;; count of a dynamic shift.
-(define_insn_reservation "sh4_load_si" 3
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "load_si,pcload_si"))
-  "issue+load_store,nothing,memory")
-
-;; (define_bypass 2 "sh4_load_si" "!sh4_call")
-
-;; The load latency is upped to three higher if the dependent insn does
-;; double precision computation.  We want the 'default' latency to reflect
-;; that increased latency because otherwise the insn priorities won't
-;; allow proper scheduling.
-(define_insn_reservation "sh4_fload" 3
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "fload,pcfload"))
-  "issue+load_store,nothing,memory")
-
-;; (define_bypass 2 "sh4_fload" "!")
-
-(define_insn_reservation "sh4_store" 1
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "store"))
-  "issue+load_store,nothing,memory")
-
-;; Load Store instructions.
-;; Group:      LS
-;; Latency:    1
-;; Issue Rate:         1
-
-(define_insn_reservation "sh4_gp_fpul" 1
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "gp_fpul"))
-  "issue+load_store")
-
-;; Load Store instructions.
-;; Group:      LS
-;; Latency:    3
-;; Issue Rate:         1
-
-(define_insn_reservation "sh4_fpul_gp" 3
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "fpul_gp"))
-  "issue+load_store")
-
-;; Branch (BF,BF/S,BT,BT/S,BRA)
-;; Group:      BR
-;; Latency when taken:         2 (or 1)
-;; Issue Rate:         1
-;; The latency is 1 when displacement is 0.
-;; We can't really do much with the latency, even if we could express it,
-;; but the pairing restrictions are useful to take into account.
-;; ??? If the branch is likely, we might want to fill the delay slot;
-;; if the branch is likely, but not very likely, should we pretend to use
-;; a resource that CO instructions use, to get a pairable delay slot insn?
-
-(define_insn_reservation "sh4_branch"  1
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "cbranch,jump"))
-  "issue+pcr_addrcalc")
-
-;; Branch Far (JMP,RTS,BRAF)
-;; Group:      CO
-;; Latency:    3
-;; Issue Rate:         2
-;; ??? Scheduling happens before branch shortening, and hence jmp and braf
-;; can't be distinguished from bra for the "jump" pattern.
-
-(define_insn_reservation "sh4_return" 3
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "return,jump_ind"))
-         "d_lock*2")
-
-;; RTE
-;; Group:      CO
-;; Latency:    5
-;; Issue Rate:         5
-;; this instruction can be executed in any of the pipelines
-;; and blocks the pipeline for next 4 stages.
-
-(define_insn_reservation "sh4_return_from_exp" 5
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "rte"))
-  "d_lock*5")
-
-;; OCBP, OCBWB
-;; Group:      CO
-;; Latency:    1-5
-;; Issue Rate:         1
-
-;; cwb is used for the sequence ocbwb @%0; extu.w %0,%2; or %1,%2; mov.l %0,@%2
-;; ocbwb on its own would be "d_lock,nothing,memory*5"
-(define_insn_reservation "ocbwb"  6
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "cwb"))
-  "d_lock*2,(d_lock+memory)*3,issue+load_store+memory,memory*2")
-
-;; LDS to PR,JSR
-;; Group:      CO
-;; Latency:    3
-;; Issue Rate:         2
-;; The SX stage is blocked for last 2 cycles.
-;; OTOH, the only time that has an effect for insns generated by the compiler
-;; is when lds to PR is followed by sts from PR - and that is highly unlikely -
-;; or when we are doing a function call - and we don't do inter-function
-;; scheduling.  For the function call case, it's really best that we end with
-;; something that models an rts.
-
-(define_insn_reservation "sh4_lds_to_pr" 3
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "prset") )
-  "d_lock*2")
-
-;; calls introduce a longisch delay that is likely to flush the pipelines
-;; of the caller's instructions.  Ordinary functions tend to end with a
-;; load to restore a register (in the delay slot of rts), while sfuncs
-;; tend to end with an EX or MT insn.  But that is not actually relevant,
-;; since there are no instructions that contend for memory access early.
-;; We could, of course, provide exact scheduling information for specific
-;; sfuncs, if that should prove useful.
-
-(define_insn_reservation "sh4_call" 16
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "call,sfunc"))
-  "d_lock*16")
-
-;; LDS.L to PR
-;; Group:      CO
-;; Latency:    3
-;; Issue Rate:         2
-;; The SX unit is blocked for last 2 cycles.
-
-(define_insn_reservation "ldsmem_to_pr"  3
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "pload"))
-  "d_lock*2")
-
-;; STS from PR
-;; Group:      CO
-;; Latency:    2
-;; Issue Rate:         2
-;; The SX unit in second and third cycles.
-
-(define_insn_reservation "sts_from_pr" 2
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "prget"))
-  "d_lock*2")
-
-;; STS.L from PR
-;; Group:      CO
-;; Latency:    2
-;; Issue Rate:         2
-
-(define_insn_reservation "sh4_prstore_mem" 2
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "pstore"))
-  "d_lock*2,nothing,memory")
-
-;; LDS to FPSCR
-;; Group:      CO
-;; Latency:    4
-;; Issue Rate:         1
-;; F1 is blocked for last three cycles.
-
-(define_insn_reservation "fpscr_load" 4
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "gp_fpscr"))
-  "d_lock,nothing,F1*3")
-
-;; LDS.L to FPSCR
-;; Group:      CO
-;; Latency:    1 / 4
-;; Latency to update Rn is 1 and latency to update FPSCR is 4
-;; Issue Rate:         1
-;; F1 is blocked for last three cycles.
-
-(define_insn_reservation "fpscr_load_mem" 4
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type"  "mem_fpscr"))
-  "d_lock,nothing,(F1+memory),F1*2")
+(define_expand "prefetch"
+  [(prefetch (match_operand:QI 0 "address_operand" "p")
+             (match_operand:SI 1 "const_int_operand" "n")
+             (match_operand:SI 2 "const_int_operand" "n"))]
+  "TARGET_SHMEDIA || TARGET_HARD_SH4"
+  "
+{
+  if (TARGET_HARD_SH4 && ! register_operand (operands[0], SImode))
+    {
+      rtx reg = gen_reg_rtx (SImode);
+      emit_move_insn (reg, operands[0]);
+      operands[0] = reg;
+    }
  
-\f
-;; Fixed point multiplication (DMULS.L DMULU.L MUL.L MULS.W,MULU.W)
-;; Group:      CO
-;; Latency:    4 / 4
-;; Issue Rate:         1
-
-(define_insn_reservation "multi" 4
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "smpy,dmpy"))
-  "d_lock,(d_lock+f1_1),(f1_1|f1_2)*3,F2")
-
-;; Fixed STS from MACL / MACH
-;; Group:      CO
-;; Latency:    3
-;; Issue Rate:         1
-
-(define_insn_reservation "sh4_mac_gp" 3
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "mac_gp"))
-  "d_lock")
-
-
-;; Single precision floating point computation FCMP/EQ,
-;; FCMP/GT, FADD, FLOAT, FMAC, FMUL, FSUB, FTRC, FRVHG, FSCHG
-;; Group:      FE
-;; Latency:    3/4
-;; Issue Rate:         1
-
-(define_insn_reservation "fp_arith"  3
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "fp"))
-  "issue,F01,F2")
-
-(define_insn_reservation "fp_arith_ftrc"  3
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "ftrc_s"))
-  "issue,F01,F2")
-
-(define_bypass 1 "fp_arith_ftrc" "sh4_fpul_gp")
-
-;; Single Precision FDIV/SQRT
-;; Group:      FE
-;; Latency:    12/13 (FDIV); 11/12 (FSQRT)
-;; Issue Rate:         1
-;; We describe fdiv here; fsqrt is actually one cycle faster.
-
-(define_insn_reservation "fp_div" 12
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "fdiv"))
-  "issue,F01+F3,F2+F3,F3*7,F1+F3,F2")
-
-;; Double Precision floating point computation
-;; (FCNVDS, FCNVSD, FLOAT, FTRC)
-;; Group:      FE
-;; Latency:    (3,4)/5
-;; Issue Rate:         1
-
-(define_insn_reservation "dp_float" 4
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "dfp_conv"))
-  "issue,F01,F1+F2,F2")
-
-;; Double-precision floating-point (FADD,FMUL,FSUB)
-;; Group:      FE
-;; Latency:    (7,8)/9
-;; Issue Rate:         1
-
-(define_insn_reservation "fp_double_arith" 8
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "dfp_arith"))
-  "issue,F01,F1+F2,fpu*4,F2")
-
-;; Double-precision FCMP (FCMP/EQ,FCMP/GT)
-;; Group:      CO
-;; Latency:    3/5
-;; Issue Rate:         2
-
-(define_insn_reservation "fp_double_cmp" 3
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "dfp_cmp"))
-  "d_lock,(d_lock+F01),F1+F2,F2")
-
-;; Double precision FDIV/SQRT
-;; Group:      FE
-;; Latency:    (24,25)/26
-;; Issue Rate:         1
-
-(define_insn_reservation "dp_div" 25
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "dfdiv"))
-  "issue,F01+F3,F1+F2+F3,F2+F3,F3*16,F1+F3,(fpu+F3)*2,F2")
-
-
-;; Use the branch-not-taken case to model arith3 insns.  For the branch taken
-;; case, we'd get a d_lock instead of issue at the end.
-(define_insn_reservation "arith3" 3
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "arith3"))
-  "issue,d_lock+pcr_addrcalc,issue")
-
-;; arith3b insns schedule the same no matter if the branch is taken or not.
-(define_insn_reservation "arith3b" 2
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "arith3"))
-  "issue,d_lock+pcr_addrcalc")
+  emit_insn ((TARGET_SHMEDIA ? gen_prefetch_media : gen_prefetch_i4)
+            (operands[0], operands[1], operands[2]));
+  DONE;
+}")