* sh.h (OVERRIDE_OPTIONS): Set default values for align_loops

[pf3gnuchains/gcc-fork.git] / gcc / config / sh / sh.md
diff --git a/gcc/config/sh/sh.md b/gcc/config/sh/sh.md

index aa64bf5..65dd696 100644 (file)
--- a/gcc/config/sh/sh.md
+++ b/gcc/config/sh/sh.md
@@ -1,5 +1,5 @@
  ;;- Machine description for Hitachi / SuperH SH.
-;;  Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002
+;;  Copyright (C) 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003
  ;;  Free Software Foundation, Inc.
  ;;  Contributed by Steve Chamberlain (sac@cygnus.com).
  ;;  Improved by Jim Wilson (wilson@cygnus.com).
@@ -135,6 +135,13 @@
    (UNSPEC_FSINA                16)
    (UNSPEC_NSB          17)
    (UNSPEC_ALLOCO       18)
+  (UNSPEC_EH_RETURN    19)
+  (UNSPEC_TLSGD                20)
+  (UNSPEC_TLSLDM       21)
+  (UNSPEC_TLSIE                22)
+  (UNSPEC_DTPOFF       23)
+  (UNSPEC_GOTTPOFF     24)
+  (UNSPEC_TPOFF                25)
  
    ;; These are used with unspec_volatile.
    (UNSPECV_BLOCKAGE    0)
@@ -153,7 +160,7 @@
  ;; Target CPU.
  
  (define_attr "cpu"
- "sh1,sh2,sh3,sh3e,sh4,sh5"
+ "sh1,sh2,sh2e,sh3,sh3e,sh4,sh5"
    (const (symbol_ref "sh_cpu_attr")))
  
  (define_attr "endian" "big,little"
@@ -183,8 +190,10 @@
  ;; arith3b     like above, but might end with a redirected branch
  ;; load                from memory
  ;; load_si     Likewise, SImode variant for general register.
+;; fload       Likewise, but load to fp register.
  ;; store       to memory
-;; move                register to register
+;; move                general purpose register to register
+;; mt_group    other sh4 mt instructions
  ;; fmove       register to register, floating point
  ;; smpy                word precision integer multiply
  ;; dmpy                longword or doublelongword precision integer multiply
@@ -194,15 +203,21 @@
  ;; pstore      store of pr reg, which can't be put into delay slot of jsr
  ;; prget       copy pr to register, ditto
  ;; pcload      pc relative load of constant value
+;; pcfload     Likewise, but load to fp register.
  ;; pcload_si   Likewise, SImode variant for general register.
  ;; rte         return from exception
  ;; sfunc       special function call with known used registers
  ;; call                function call
  ;; fp          floating point
  ;; fdiv                floating point divide (or square root)
-;; gp_fpul     move between general purpose register and fpul
+;; gp_fpul     move from general purpose register to fpul
+;; fpul_gp     move from fpul to general purpose register
+;; mac_gp      move from mac[lh] to general purpose register
  ;; dfp_arith, dfp_cmp,dfp_conv
+;; ftrc_s      fix_truncsfsi2_i4
  ;; dfdiv       double precision floating point divide (or square root)
+;; cwb         ic_invalidate_line_i
+;; tls_load     load TLS-related address
  ;; arith_media SHmedia arithmetic, logical, and shift instructions
  ;; cbranch_media SHmedia conditional branch instructions
  ;; cmp_media   SHmedia compare instructions
@@ -219,7 +234,7 @@
  ;; fpconv_media        SHmedia single precision floating point conversions
  ;; fstore_media        SHmedia floating point register store instructions
  ;; gettr_media SHmedia gettr instruction
-;; invalidate_line_media SHmedia invaldiate_line sequence
+;; invalidate_line_media SHmedia invalidate_line sequence
  ;; jump_media  SHmedia unconditional branch instructions
  ;; load_media  SHmedia general register load instructions
  ;; pt_media    SHmedia pt instruction (expanded by assembler)
@@ -228,35 +243,37 @@
  ;; mcmp_media  SHmedia multimedia compare, absolute, saturating ops
  ;; mac_media   SHmedia mac-style fixed point operations
  ;; d2mpy_media SHmedia: two 32 bit integer multiplies
-;; atrans      SHmedia approximate transcendential functions
+;; atrans      SHmedia approximate transcendental functions
  ;; ustore_media        SHmedia unaligned stores
  ;; nil         no-op move, will be deleted.
  
  (define_attr "type"
- "cbranch,jump,jump_ind,arith,arith3,arith3b,dyn_shift,load,load_si,store,move,fmove,smpy,dmpy,return,pload,prset,pstore,prget,pcload,pcload_si,rte,sfunc,call,fp,fdiv,dfp_arith,dfp_cmp,dfp_conv,dfdiv,gp_fpul,arith_media,cbranch_media,cmp_media,dfdiv_media,dfmul_media,dfparith_media,dfpconv_media,dmpy_media,fcmp_media,fdiv_media,fload_media,fmove_media,fparith_media,fpconv_media,fstore_media,gettr_media,invalidate_line_media,jump_media,load_media,pt_media,ptabs_media,store_media,mcmp_media,mac_media,d2mpy_media,atrans_media,ustore_media,nil,other"
+ "mt_group,cbranch,jump,jump_ind,arith,arith3,arith3b,dyn_shift,load,load_si,fload,store,move,fmove,smpy,dmpy,return,pload,prset,pstore,prget,pcload,pcload_si,pcfload,rte,sfunc,call,fp,fdiv,ftrc_s,dfp_arith,dfp_cmp,dfp_conv,dfdiv,gp_fpul,fpul_gp,mac_gp,mem_fpscr,gp_fpscr,cwb,tls_load,arith_media,cbranch_media,cmp_media,dfdiv_media,dfmul_media,dfparith_media,dfpconv_media,dmpy_media,fcmp_media,fdiv_media,fload_media,fmove_media,fparith_media,fpconv_media,fstore_media,gettr_media,invalidate_line_media,jump_media,load_media,pt_media,ptabs_media,store_media,mcmp_media,mac_media,d2mpy_media,atrans_media,ustore_media,nil,other"
    (const_string "other"))
  
  ;; We define a new attribute namely "insn_class".We use
-;; this for DFA based pipeline description.
-;; Although the "type" attribute covers almost all insn 
-;; classes,it is more convenient to define new attribute
-;; for certain reservations.
+;; this for the DFA based pipeline description.
  ;;
  ;; mt_group      SH4 "mt" group instructions.
  ;;
-;; ex_group      SH4 "ex" group instructions.They mostly
-;;               overlap with arithmetic instructions but
-;;               new attribute defined to distinguish from
-;;              mt group instructions.
+;; ex_group      SH4 "ex" group instructions.
+;;
+;; ls_group      SH4 "ls" group instructions.
  ;;
-;; lds_to_fpscr  The "type" attribute couldn't sufficiently
-;;               distinguish it from others.It is part of 
-;;               new attribute.Similar case with ldsmem_to_fpscr
-;;              and cwb. 
  
  (define_attr "insn_class"
-            "mt_group,ex_group,lds_to_fpscr,ldsmem_to_fpscr,cwb,none"
-            (const_string "none"))
+  "mt_group,ex_group,ls_group,br_group,fe_group,co_group,none"
+  (cond [(eq_attr "type" "move,mt_group") (const_string "mt_group")
+         (eq_attr "type" "arith,dyn_shift") (const_string "ex_group")
+        (eq_attr "type" "fmove,load,pcload,load_si,pcload_si,fload,pcfload,store,gp_fpul,fpul_gp") (const_string "ls_group")
+        (eq_attr "type" "cbranch,jump") (const_string "br_group")
+        (eq_attr "type" "fp,fdiv,ftrc_s,dfp_arith,dfp_conv,dfdiv")
+          (const_string "fe_group")
+        (eq_attr "type" "jump_ind,smpy,dmpy,mac_gp,return,pload,prset,pstore,prget,rte,sfunc,call,dfp_cmp,mem_fpscr,gp_fpscr,cwb") (const_string "co_group")]
+       (const_string "none")))
+;; nil are zero instructions, and arith3 / arith3b are multiple instructions,
+;; so these do not belong in an insn group, although they are modeled
+;; with their own define_insn_reservations.
  
  ;; Indicate what precision must be selected in fpscr for this insn, if any.
  
@@ -374,9 +391,9 @@
          (eq_attr "type" "jump")
          (cond [(eq_attr "med_branch_p" "yes")
                 (const_int 2)
-               (and (eq (symbol_ref "GET_CODE (PREV_INSN (insn))")
+               (and (eq (symbol_ref "GET_CODE (prev_nonnote_insn (insn))")
                          (symbol_ref "INSN"))
-                    (eq (symbol_ref "INSN_CODE (PREV_INSN (insn))")
+                    (eq (symbol_ref "INSN_CODE (prev_nonnote_insn (insn))")
                          (symbol_ref "code_for_indirect_jump_scratch")))
                 (if_then_else (eq_attr "braf_branch_p" "yes")
                               (const_int 6)
@@ -445,178 +462,6 @@
    (and (eq_attr "pipe_model" "sh1") (eq_attr "type" "fdiv")) 13 12)
  
  
-;; SH4 scheduling
-;; The SH4 is a dual-issue implementation, thus we have to multiply all
-;; costs by at least two.
-;; There will be single increments of the modeled that don't correspond
-;; to the actual target ;; whenever two insns to be issued depend one a
-;; single resource, and the scheduler picks to be the first one.
-;; If we multiplied the costs just by two, just two of these single
-;; increments would amount to an actual cycle.  By picking a larger
-;; factor, we can ameliorate the effect; However, we then have to make sure
-;; that only two insns are modeled as issued per actual cycle.
-;; Moreover, we need a way to specify the latency of insns that don't
-;; use an actual function unit.
-;; We use an 'issue' function unit to do that, and a cost factor of 10.
-
-(define_function_unit "issue" 2 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "!nil,arith3"))
-  10 10)
-
-(define_function_unit "issue" 2 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "arith3"))
-  30 30)
-
-;; There is no point in providing exact scheduling information about branches,
-;; because they are at the starts / ends of basic blocks anyways.
-
-;; Some insns cannot be issued before/after another insn in the same cycle,
-;; irrespective of the type of the other insn.
-
-;; default is dual-issue, but can't be paired with an insn that
-;; uses multiple function units.
-(define_function_unit "single_issue"     1 0
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "!smpy,dmpy,pload,pstore,dfp_cmp,gp_fpul,call,sfunc,arith3,arith3b"))
-  1 10
-  [(eq_attr "type" "smpy,dmpy,pload,pstore,dfp_cmp,gp_fpul")])
-
-(define_function_unit "single_issue"     1 0
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "smpy,dmpy,pload,pstore,dfp_cmp,gp_fpul"))
-  10 10
-  [(const_int 1)])
-
-;; arith3 insns are always pairable at the start, but not inecessarily at
-;; the end; however, there doesn't seem to be a way to express that.
-(define_function_unit "single_issue"     1 0
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "arith3"))
-  30 20
-  [(const_int 1)])
-
-;; arith3b insn are pairable at the end and have latency that prevents pairing
-;; with the following branch, but we don't want this latency be respected;
-;; When the following branch is immediately adjacent, we can redirect the
-;; internal branch, which is likly to be a larger win.
-(define_function_unit "single_issue"     1 0
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "arith3b"))
-  20 20
-  [(const_int 1)])
-
-;; calls introduce a longisch delay that is likely to flush the pipelines.
-(define_function_unit "single_issue"     1 0
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "call,sfunc"))
-  160 160
-  [(eq_attr "type" "!call") (eq_attr "type" "call")])
-
-;; Load and store instructions have no alignment peculiarities for the SH4,
-;; but they use the load-store unit, which they share with the fmove type
-;; insns (fldi[01]; fmov frn,frm; flds; fsts; fabs; fneg) .
-;; Loads have a latency of two.
-;; However, call insns can only paired with a preceding insn, and have
-;; a delay slot, so that we want two more insns to be scheduled between the
-;; load of the function address and the call.  This is equivalent to a
-;; latency of three.
-;; We cannot use a conflict list for this, because we need to distinguish
-;; between the actual call address and the function arguments.
-;; ADJUST_COST can only properly handle reductions of the cost, so we
-;; use a latency of three here, which gets multiplied by 10 to yield 30.
-;; We only do this for SImode loads of general registers, to make the work
-;; for ADJUST_COST easier.
-
-;; When specifying different latencies for different insns using the
-;; the same function unit, genattrtab.c assumes a 'FIFO constraint'
-;; so that the blockage is at least READY-COST (E) + 1 - READY-COST (C)
-;; for an executing insn E and a candidate insn C.
-;; Therefore, we define three different function units for load_store:
-;; load_store, load and load_si.
-
-(define_function_unit "load_si" 1 0
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "load_si,pcload_si")) 30 10)
-(define_function_unit "load" 1 0
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "load,pcload,pload")) 20 10)
-(define_function_unit "load_store" 1 0
-  (and (eq_attr "pipe_model" "sh4")
-       (eq_attr "type" "load_si,pcload_si,load,pcload,pload,store,pstore,fmove"))
-  10 10)
-
-(define_function_unit "int"    1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "arith,dyn_shift")) 10 10)
-
-;; Again, we have to pretend a lower latency for the "int" unit to avoid a
-;; spurious FIFO constraint; the multiply instructions use the "int"
-;; unit actually only for two cycles.
-(define_function_unit "int"    1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "smpy,dmpy")) 20 20)
-
-;; We use a fictous "mpy" unit to express the actual latency.
-(define_function_unit "mpy"    1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "smpy,dmpy")) 40 20)
-
-;; Again, we have to pretend a lower latency for the "int" unit to avoid a
-;; spurious FIFO constraint.
-(define_function_unit "int"     1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "gp_fpul")) 10 10)
-
-;; We use a fictous "gp_fpul" unit to express the actual latency.
-(define_function_unit "gp_fpul"     1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "gp_fpul")) 20 10)
-
-;; ??? multiply uses the floating point unit, but with a two cycle delay.
-;; Thus, a simple single-precision fp operation could finish if issued in
-;; the very next cycle, but stalls when issued two or three cycles later.
-;; Similarily, a divide / sqrt can work without stalls if issued in
-;; the very next cycle, while it would have to block if issued two or
-;; three cycles later.
-;; There is no way to model this with gcc's function units.  This problem is
-;; actually mentioned in md.texi.  Tackling this problem requires first that
-;; it is possible to speak about the target in an open discussion.
-;;
-;; However, simple double-precision operations always conflict.
-
-(define_function_unit "fp"    1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "smpy,dmpy")) 40 40
-  [(eq_attr "type" "dfp_cmp,dfp_conv,dfp_arith")])
-
-;; The "fp" unit is for pipeline stages F1 and F2.
-
-(define_function_unit "fp"     1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "fp")) 30 10)
-
-;; Again, we have to pretend a lower latency for the "fp" unit to avoid a
-;; spurious FIFO constraint; the bulk of the fdiv type insns executes in
-;; the F3 stage.
-(define_function_unit "fp"     1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "fdiv")) 30 10)
-
-;; The "fdiv" function unit models the aggregate effect of the F1, F2 and F3
-;; pipeline stages on the pipelining of fdiv/fsqrt insns.
-;; We also use it to give the actual latency here.
-;; fsqrt is actually one cycle faster than fdiv (and the value used here),
-;; but that will hardly matter in practice for scheduling.
-(define_function_unit "fdiv"     1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "fdiv")) 120 100)
-
-;; There is again a late use of the "fp" unit by [d]fdiv type insns
-;; that we can't express.
-
-(define_function_unit "fp"     1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "dfp_cmp,dfp_conv")) 40 20)
-
-(define_function_unit "fp"     1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "dfp_arith")) 80 60)
-
-(define_function_unit "fp"     1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "dfdiv")) 230 10)
-
-(define_function_unit "fdiv"     1 0
-  (and (eq_attr "pipe_model" "sh4") (eq_attr "type" "dfdiv")) 230 210)
-
  ;; SH-5 SHmedia scheduling
  ;; When executing SHmedia code, the SH-5 is a fairly straightforward
  ;; single-issue machine.  It has four pipelines, the branch unit (br),
@@ -696,12 +541,45 @@
          (eq_attr "length" "2") (const_string "yes")
          ] (const_string "no")))
  
+(define_attr "cond_delay_slot" "yes,no"
+  (cond [(eq_attr "in_delay_slot" "yes") (const_string "yes")
+        ] (const_string "no")))
+
  (define_attr "is_sfunc" ""
    (if_then_else (eq_attr "type" "sfunc") (const_int 1) (const_int 0)))
  
  (define_attr "is_mac_media" ""
    (if_then_else (eq_attr "type" "mac_media") (const_int 1) (const_int 0)))
  
+(define_attr "branch_zero" "yes,no"
+  (cond [(eq_attr "type" "!cbranch") (const_string "no")
+        (ne (symbol_ref "(next_active_insn (insn)\
+                          == (prev_active_insn\
+                              (XEXP (SET_SRC (PATTERN (insn)), 1))))\
+                         && get_attr_length (next_active_insn (insn)) == 2")
+            (const_int 0))
+        (const_string "yes")]
+       (const_string "no")))
+
+;; SH4 Double-precision computation with double-precision result -
+;; the two halves are ready at different times.
+(define_attr "dfp_comp" "yes,no"
+  (cond [(eq_attr "type" "dfp_arith,dfp_conv,dfdiv") (const_string "yes")]
+       (const_string "no")))
+
+;; Insns for which the latency of a preceding fp insn is decreased by one.
+(define_attr "late_fp_use" "yes,no" (const_string "no"))
+;; And feeding insns for which this is relevant.
+(define_attr "any_fp_comp" "yes,no"
+  (cond [(eq_attr "type" "fp,fdiv,ftrc_s,dfp_arith,dfp_conv,dfdiv")
+        (const_string "yes")]
+       (const_string "no")))
+
+(define_attr "any_int_load" "yes,no"
+  (cond [(eq_attr "type" "load,load_si,pcload,pcload_si")
+        (const_string "yes")]
+       (const_string "no")))
+
  (define_delay
    (eq_attr "needs_delay_slot" "yes")
    [(eq_attr "in_delay_slot" "yes") (nil) (nil)])
@@ -738,7 +616,10 @@
  (define_delay
    (and (eq_attr "type" "cbranch")
         (ne (symbol_ref "TARGET_SH2") (const_int 0)))
-  [(eq_attr "in_delay_slot" "yes") (eq_attr "in_delay_slot" "yes") (nil)])
+  ;; SH2e has a hardware bug that pretty much prohibits the use of
+  ;; annulled delay slots.
+  [(eq_attr "in_delay_slot" "yes") (and (eq_attr "cond_delay_slot" "yes")
+                                       (not (eq_attr "cpu" "sh2e"))) (nil)])
  \f
  ;; -------------------------------------------------------------------------
  ;; SImode signed integer comparisons
@@ -751,7 +632,7 @@
                (const_int 0)))]
    "TARGET_SH1"
    "tst %1,%0"
-  [(set_attr "insn_class" "mt_group")])
+  [(set_attr "type" "mt_group")])
  
  ;; ??? Perhaps should only accept reg/constant if the register is reg 0.
  ;; That would still allow reload to create cmpi instructions, but would
@@ -768,7 +649,7 @@
         tst     %0,%0
         cmp/eq  %1,%0
         cmp/eq  %1,%0"
-   [(set_attr "insn_class" "mt_group,mt_group,mt_group")])
+   [(set_attr "type" "mt_group")])
  
  (define_insn "cmpgtsi_t"
    [(set (reg:SI T_REG)
@@ -778,7 +659,7 @@
    "@
         cmp/gt  %1,%0
         cmp/pl  %0"
-   [(set_attr "insn_class" "mt_group,mt_group")])
+   [(set_attr "type" "mt_group")])
  
  (define_insn "cmpgesi_t"
    [(set (reg:SI T_REG)
@@ -788,7 +669,7 @@
    "@
         cmp/ge  %1,%0
         cmp/pz  %0"
-   [(set_attr "insn_class" "mt_group,mt_group")])
+   [(set_attr "type" "mt_group")])
  
  ;; -------------------------------------------------------------------------
  ;; SImode unsigned integer comparisons
@@ -800,7 +681,7 @@
                 (match_operand:SI 1 "arith_reg_operand" "r")))]
    "TARGET_SH1"
    "cmp/hs      %1,%0"
-   [(set_attr "insn_class" "mt_group")])
+   [(set_attr "type" "mt_group")])
  
  (define_insn "cmpgtusi_t"
    [(set (reg:SI T_REG)
@@ -808,7 +689,7 @@
                 (match_operand:SI 1 "arith_reg_operand" "r")))]
    "TARGET_SH1"
    "cmp/hi      %1,%0"
-   [(set_attr "insn_class" "mt_group")])
+   [(set_attr "type" "mt_group")])
  
  ;; We save the compare operands in the cmpxx patterns and use them when
  ;; we generate the branch.
@@ -905,7 +786,7 @@
         cmp/eq\\t%S1,%S0\;bf{.|/}s\\t%,Ldi%=\;cmp/ge\\t%S1,%S0\;cmp/hs\\t%R1,%R0\\n%,Ldi%=:
         cmp/pz\\t%S0"
    [(set_attr "length" "8,2")
-   (set_attr "type" "arith3,arith")])
+   (set_attr "type" "arith3,mt_group")])
  \f
  ;; -------------------------------------------------------------------------
  ;; DImode unsigned integer comparisons
@@ -1172,8 +1053,7 @@
         (ltu:SI (plus:SI (match_dup 1) (match_dup 2)) (match_dup 1)))]
    "TARGET_SH1"
    "addc        %2,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  (define_insn "addc1"
    [(set (match_operand:SI 0 "arith_reg_operand" "=r")
@@ -1183,8 +1063,7 @@
     (clobber (reg:SI T_REG))]
    "TARGET_SH1"
    "addc        %2,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  (define_expand "addsi3"
    [(set (match_operand:SI 0 "arith_reg_operand" "")
@@ -1213,8 +1092,7 @@
                  (match_operand:SI 2 "arith_operand" "rI")))]
    "TARGET_SH1"
    "add %2,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  ;; -------------------------------------------------------------------------
  ;; Subtraction instructions
@@ -1283,8 +1161,7 @@
         (gtu:SI (minus:SI (match_dup 1) (match_dup 2)) (match_dup 1)))]
    "TARGET_SH1"
    "subc        %2,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  (define_insn "subc1"
    [(set (match_operand:SI 0 "arith_reg_operand" "=r")
@@ -1294,8 +1171,7 @@
     (clobber (reg:SI T_REG))]
    "TARGET_SH1"
    "subc        %2,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  (define_insn "*subsi3_internal"
    [(set (match_operand:SI 0 "arith_reg_operand" "=r")
@@ -1303,8 +1179,7 @@
                   (match_operand:SI 2 "arith_reg_operand" "r")))]
    "TARGET_SH1"
    "sub %2,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  (define_insn "*subsi3_media"
    [(set (match_operand:SI 0 "arith_reg_operand" "=r")
@@ -1384,7 +1259,7 @@
  ; the udivsi3 libcall has the same name, we must consider all registers
  ; clobbered that are in the union of the registers clobbered by the
  ; shmedia and the shcompact implementation.  Note, if the shcompact
-; implemenation actually used shcompact code, we'd need to clobber
+; implementation actually used shcompact code, we'd need to clobber
  ; also r23 and fr23.
  (define_insn "udivsi3_i1_media"
    [(set (match_operand:SI 0 "register_operand" "=z")
@@ -1477,11 +1352,11 @@
    ""
    "
  {
-  rtx first = 0, last;
+  rtx first, last;
  
    operands[3] = gen_reg_rtx (Pmode);
    /* Emit the move of the address to a pseudo outside of the libcall.  */
-  if (TARGET_HARD_SH4 && TARGET_SH3E)
+  if (TARGET_HARD_SH4 && TARGET_SH2E)
      {
        emit_move_insn (operands[3],
                       gen_rtx_SYMBOL_REF (SImode, \"__udivsi3_i4\"));
@@ -1494,8 +1369,8 @@
      {
        operands[1] = force_reg (SImode, operands[1]);
        operands[2] = force_reg (SImode, operands[2]);
-      last = gen_udivsi3_i4_media (operands[0], operands[1], operands[2]);
-      first = last;
+      emit_insn (gen_udivsi3_i4_media (operands[0], operands[1], operands[2]));
+      DONE;
      }
    else if (TARGET_SH5)
      {
@@ -1522,11 +1397,8 @@
                       gen_rtx_SYMBOL_REF (SImode, \"__udivsi3\"));
        last = gen_udivsi3_i1 (operands[0], operands[3]);
      }
-  if (! first)
-    {
-      first = emit_move_insn (gen_rtx_REG (SImode, 4), operands[1]);
-      emit_move_insn (gen_rtx_REG (SImode, 5), operands[2]);
-    }
+  first = emit_move_insn (gen_rtx_REG (SImode, 4), operands[1]);
+  emit_move_insn (gen_rtx_REG (SImode, 5), operands[2]);
    last = emit_insn (last);
    /* Wrap the sequence in REG_LIBCALL / REG_RETVAL notes so that loop
       invariant code motion can move it.  */
@@ -1553,7 +1425,7 @@
  ; the sdivsi3 libcall has the same name, we must consider all registers
  ; clobbered that are in the union of the registers clobbered by the
  ; shmedia and the shcompact implementation.  Note, if the shcompact
-; implemenation actually used shcompact code, we'd need to clobber
+; implementation actually used shcompact code, we'd need to clobber
  ; also r22, r23 and fr23.
  (define_insn "divsi3_i1_media"
    [(set (match_operand:SI 0 "register_operand" "=z")
@@ -1630,11 +1502,11 @@
    ""
    "
  {
-  rtx first = 0, last;
+  rtx first, last;
  
    operands[3] = gen_reg_rtx (Pmode);
    /* Emit the move of the address to a pseudo outside of the libcall.  */
-  if (TARGET_HARD_SH4 && TARGET_SH3E)
+  if (TARGET_HARD_SH4 && TARGET_SH2E)
      {
        emit_move_insn (operands[3],
                       gen_rtx_SYMBOL_REF (SImode, \"__sdivsi3_i4\"));
@@ -1647,8 +1519,8 @@
      {
        operands[1] = force_reg (SImode, operands[1]);
        operands[2] = force_reg (SImode, operands[2]);
-      last = gen_divsi3_i4_media (operands[0], operands[1], operands[2]);
-      first = last;
+      emit_insn (gen_divsi3_i4_media (operands[0], operands[1], operands[2]));
+      DONE;
      }
    else if (TARGET_SH5)
      {
@@ -1674,11 +1546,8 @@
        emit_move_insn (operands[3], gen_rtx_SYMBOL_REF (SImode, \"__sdivsi3\"));
        last = gen_divsi3_i1 (operands[0], operands[3]);
      }
-  if (! first)
-    {
-      first = emit_move_insn (gen_rtx_REG (SImode, 4), operands[1]);
-      emit_move_insn (gen_rtx_REG (SImode, 5), operands[2]);
-    }
+  first = emit_move_insn (gen_rtx_REG (SImode, 4), operands[1]);
+  emit_move_insn (gen_rtx_REG (SImode, 5), operands[2]);
    last = emit_insn (last);
    /* Wrap the sequence in REG_LIBCALL / REG_RETVAL notes so that loop
       invariant code motion can move it.  */
@@ -1730,6 +1599,14 @@
       invariant code motion can move it.  */
    REG_NOTES (first) = gen_rtx_INSN_LIST (REG_LIBCALL, last, REG_NOTES (first));
    REG_NOTES (last) = gen_rtx_INSN_LIST (REG_RETVAL, first, REG_NOTES (last));
+  /* expand_binop can't find a suitable code in umul_widen_optab to
+     make a REG_EQUAL note from, so make one here.
+     See also smulsi3_highpart.
+     ??? Alternatively, we could put this at the calling site of expand_binop,
+     i.e. expand_expr.  */
+  REG_NOTES (last)
+    = gen_rtx_EXPR_LIST (REG_EQUAL, copy_rtx (SET_SRC (single_set (first))),
+                        REG_NOTES (last));
    DONE;
  }")
  
@@ -1752,6 +1629,14 @@
       invariant code motion can move it.  */
    REG_NOTES (first) = gen_rtx_INSN_LIST (REG_LIBCALL, last, REG_NOTES (first));
    REG_NOTES (last) = gen_rtx_INSN_LIST (REG_RETVAL, first, REG_NOTES (last));
+  /* expand_binop can't find a suitable code in umul_widen_optab to
+     make a REG_EQUAL note from, so make one here.
+     See also smulsi3_highpart.
+     ??? Alternatively, we could put this at the calling site of expand_binop,
+     i.e. expand_expr.  */
+  REG_NOTES (last)
+    = gen_rtx_EXPR_LIST (REG_EQUAL, copy_rtx (SET_SRC (single_set (first))),
+                        REG_NOTES (last));
    DONE;
  }")
  
@@ -2015,6 +1900,7 @@
    REG_NOTES (last) = gen_rtx_INSN_LIST (REG_RETVAL, first, REG_NOTES (last));
    /* expand_binop can't find a suitable code in mul_highpart_optab to
       make a REG_EQUAL note from, so make one here.
+     See also {,u}mulhisi.
       ??? Alternatively, we could put this at the calling site of expand_binop,
       i.e. expand_mult_highpart.  */
    REG_NOTES (last)
@@ -2072,10 +1958,9 @@
                 (match_operand:SI 2 "logical_operand" "r,L")))]
    "TARGET_SH1"
    "and %2,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
-;; If the constant is 255, then emit a extu.b instruction instead of an
+;; If the constant is 255, then emit an extu.b instruction instead of an
  ;; and, since that will give better code.
  
  (define_expand "andsi3"
@@ -2096,7 +1981,7 @@
  (define_insn_and_split "anddi3"
    [(set (match_operand:DI 0 "arith_reg_operand" "=r,r,r")
         (and:DI (match_operand:DI 1 "arith_reg_operand" "%r,r,r")
-               (match_operand:DI 2 "and_operand" "r,P,n")))]
+               (match_operand:DI 2 "and_operand" "r,P,Z")))]
    "TARGET_SHMEDIA"
    "@
         and     %1, %2, %0
@@ -2129,8 +2014,7 @@
                 (match_operand:SI 2 "logical_operand" "r,L")))]
    "TARGET_SH1"
    "or  %2,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  (define_insn "iordi3"
    [(set (match_operand:DI 0 "arith_reg_operand" "=r,r")
@@ -2148,8 +2032,7 @@
                 (match_operand:SI 2 "logical_operand" "L,r")))]
    "TARGET_SH1"
    "xor %2,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  (define_insn "xordi3"
    [(set (match_operand:DI 0 "arith_reg_operand" "=r,r")
@@ -2160,6 +2043,33 @@
         xor     %1, %2, %0
         xori    %1, %2, %0"
    [(set_attr "type" "arith_media")])
+
+;; Combiner bridge pattern for 2 * sign extend -> logical op -> truncate.
+;; It converts 2 * sign extend -> logical op into logical op -> sign extend.
+(define_split
+  [(set (match_operand:DI 0 "arith_reg_operand" "")
+       (sign_extend:DI (match_operator 4 "binary_logical_operator"
+                         [(match_operand 1 "any_register_operand" "")
+                          (match_operand 2 "any_register_operand" "")])))]
+  "TARGET_SHMEDIA"
+  [(set (match_dup 5) (match_dup 4))
+   (set (match_dup 0) (sign_extend:DI (match_dup 5)))]
+"
+{
+  enum machine_mode inmode = GET_MODE (operands[1]);
+  int regno, offset = 0;
+
+  if (GET_CODE (operands[0]) == SUBREG)
+    {
+      offset = SUBREG_BYTE (operands[0]);
+      operands[0] = SUBREG_REG (operands[0]);
+    }
+  if (GET_CODE (operands[0]) != REG)
+    abort ();
+  if (! TARGET_LITTLE_ENDIAN)
+    offset += 8 - GET_MODE_SIZE (inmode);
+  operands[5] = gen_rtx_SUBREG (inmode, operands[0], offset);
+}")
  \f
  ;; -------------------------------------------------------------------------
  ;; Shifts and rotates
@@ -2216,8 +2126,7 @@
         (lshiftrt:SI (match_dup 1) (const_int 31)))]
    "TARGET_SH1"
    "rotl        %0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  (define_insn "rotlsi3_31"
    [(set (match_operand:SI 0 "arith_reg_operand" "=r")
@@ -2226,8 +2135,7 @@
     (clobber (reg:SI T_REG))]
    "TARGET_SH1"
    "rotr        %0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  (define_insn "rotlsi3_16"
    [(set (match_operand:SI 0 "arith_reg_operand" "=r")
@@ -2235,8 +2143,7 @@
                    (const_int 16)))]
    "TARGET_SH1"
    "swap.w      %1,%0"
-  [(set_attr "type" "arith")
-  (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  (define_expand "rotlsi3"
    [(set (match_operand:SI 0 "arith_reg_operand" "")
@@ -2300,8 +2207,7 @@
                    (const_int 8)))]
    "TARGET_SH1"
    "swap.b      %1,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  (define_expand "rotlhi3"
    [(set (match_operand:HI 0 "arith_reg_operand" "")
@@ -2343,8 +2249,7 @@
       (clobber (match_dup 4))])]
    "operands[4] = gen_rtx_SCRATCH (SImode);"
    [(set_attr "length" "*,*,*,4")
-   (set_attr "type" "dyn_shift,arith,arith,arith")
-   (set_attr "insn_class" "ex_group,ex_group,ex_group,ex_group")])
+   (set_attr "type" "dyn_shift,arith,arith,arith")])
  
  (define_insn "ashlhi3_k"
    [(set (match_operand:HI 0 "arith_reg_operand" "=r,r")
@@ -2354,8 +2259,7 @@
    "@
         add     %0,%0
         shll%O2 %0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  (define_insn "ashlsi3_n"
    [(set (match_operand:SI 0 "arith_reg_operand" "=r")
@@ -2372,8 +2276,7 @@
                (eq (symbol_ref "shift_insns_rtx (insn)") (const_int 3))
                (const_string "6")]
               (const_string "8")))
-   (set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+   (set_attr "type" "arith")])
  
  (define_split
    [(set (match_operand:SI 0 "arith_reg_operand" "")
@@ -2462,8 +2365,7 @@
     (clobber (reg:SI T_REG))]
    "TARGET_SH1 && INTVAL (operands[2]) == 1"
    "shar        %0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  ;; We can't do HImode right shifts correctly unless we start out with an
  ;; explicit zero / sign extension; doing that would result in worse overall
@@ -2522,8 +2424,7 @@
         (lt:SI (match_dup 1) (const_int 0)))]
    "TARGET_SH1"
    "shll        %0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  (define_insn "ashrsi3_d"
    [(set (match_operand:SI 0 "arith_reg_operand" "=r")
@@ -2531,8 +2432,7 @@
                      (neg:SI (match_operand:SI 2 "arith_reg_operand" "r"))))]
    "TARGET_SH3"
    "shad        %2,%0"
-  [(set_attr "type" "dyn_shift")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "dyn_shift")])
  
  (define_insn "ashrsi3_n"
    [(set (reg:SI R4_REG)
@@ -2583,8 +2483,7 @@
                      (neg:SI (match_operand:SI 2 "arith_reg_operand" "r"))))]
    "TARGET_SH3"
    "shld        %2,%0"
-  [(set_attr "type" "dyn_shift")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "dyn_shift")])
  
  ;;  Only the single bit shift clobbers the T bit.
  
@@ -2595,8 +2494,7 @@
     (clobber (reg:SI T_REG))]
    "TARGET_SH1 && CONST_OK_FOR_M (INTVAL (operands[2]))"
    "shlr        %0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  (define_insn "lshrsi3_k"
    [(set (match_operand:SI 0 "arith_reg_operand" "=r")
@@ -2605,8 +2503,7 @@
    "TARGET_SH1 && CONST_OK_FOR_K (INTVAL (operands[2]))
     && ! CONST_OK_FOR_M (INTVAL (operands[2]))"
    "shlr%O2     %0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  (define_insn "lshrsi3_n"
    [(set (match_operand:SI 0 "arith_reg_operand" "=r")
@@ -2685,8 +2582,7 @@
    "TARGET_SH1"
    "shll        %R0\;rotcl      %S0"
    [(set_attr "length" "4")
-   (set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+   (set_attr "type" "arith")])
  
  (define_insn "ashldi3_media"
    [(set (match_operand:DI 0 "arith_reg_operand" "=r,r")
@@ -2726,8 +2622,7 @@
    "TARGET_SH1"
    "shlr        %S0\;rotcr      %R0"
    [(set_attr "length" "4")
-   (set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+   (set_attr "type" "arith")])
  
  (define_insn "lshrdi3_media"
    [(set (match_operand:DI 0 "arith_reg_operand" "=r,r")
@@ -2767,8 +2662,7 @@
    "TARGET_SH1"
    "shar        %S0\;rotcr      %R0"
    [(set_attr "length" "4")
-   (set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+   (set_attr "type" "arith")])
  
  (define_insn "ashrdi3_media"
    [(set (match_operand:DI 0 "arith_reg_operand" "=r,r")
@@ -2805,7 +2699,7 @@
         (and:SI (ashift:SI (match_operand:SI 1 "register_operand" "")
                            (match_operand:SI 2 "const_int_operand" ""))
                 (match_operand:SI 3 "const_int_operand" "")))]
-  "TARGET_SH1 && (unsigned)INTVAL (operands[2]) < 32"
+  "TARGET_SH1 && reload_completed && (unsigned)INTVAL (operands[2]) < 32"
    [(use (reg:SI R0_REG))]
    "if (gen_shl_and (operands[0], operands[2], operands[3], operands[1])) FAIL;
     DONE;")
@@ -2816,7 +2710,7 @@
                            (match_operand:SI 2 "const_int_operand" ""))
                 (match_operand:SI 3 "const_int_operand" "")))
     (clobber (reg:SI T_REG))]
-  "TARGET_SH1 && (unsigned)INTVAL (operands[2]) < 32"
+  "TARGET_SH1 && reload_completed && (unsigned)INTVAL (operands[2]) < 32"
    [(use (reg:SI R0_REG))]
    "if (gen_shl_and (operands[0], operands[2], operands[3], operands[1])) FAIL;
     DONE;")
@@ -3003,8 +2897,7 @@
                              (const_int 16))))]
    "TARGET_SH1"
    "xtrct       %1,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  (define_insn "xtrct_right"
    [(set (match_operand:SI 0 "arith_reg_operand" "=r")
@@ -3014,8 +2907,7 @@
                            (const_int 16))))]
    "TARGET_SH1"
    "xtrct       %2,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  ;; -------------------------------------------------------------------------
  ;; Unary arithmetic
@@ -3030,8 +2922,7 @@
                (const_int 0)))]
    "TARGET_SH1"
    "negc        %1,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  (define_insn "*negdi_media"
    [(set (match_operand:DI 0 "arith_reg_operand" "=r")
@@ -3069,16 +2960,14 @@
         (neg:SI (match_operand:SI 1 "arith_reg_operand" "r")))]
    "TARGET_SH1"
    "neg %1,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  (define_insn "one_cmplsi2"
    [(set (match_operand:SI 0 "arith_reg_operand" "=r")
         (not:SI (match_operand:SI 1 "arith_reg_operand" "r")))]
    "TARGET_SH1"
    "not %1,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  (define_expand "one_cmpldi2"
    [(set (match_operand:DI 0 "arith_reg_operand" "")
@@ -3153,8 +3042,7 @@
         (zero_extend:SI (match_operand:HI 1 "arith_reg_operand" "r")))]
    "TARGET_SH1"
    "extu.w      %1,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  (define_insn "*zero_extendhisi2_media"
    [(set (match_operand:SI 0 "register_operand" "=r,r")
@@ -3192,8 +3080,7 @@
         (zero_extend:SI (match_operand:QI 1 "arith_reg_operand" "r")))]
    "TARGET_SH1"
    "extu.b      %1,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  (define_insn "*zero_extendqisi2_media"
    [(set (match_operand:SI 0 "register_operand" "=r,r")
@@ -3209,8 +3096,7 @@
         (zero_extend:HI (match_operand:QI 1 "arith_reg_operand" "r")))]
    "TARGET_SH1"
    "extu.b      %1,%0"
-  [(set_attr "type" "arith")
-   (set_attr "insn_class" "ex_group")])
+  [(set_attr "type" "arith")])
  
  ;; -------------------------------------------------------------------------
  ;; Sign extension instructions
@@ -3284,8 +3170,7 @@
    "@
         exts.w  %1,%0
         mov.w   %1,%0"
-  [(set_attr "type" "arith,load")
-   (set_attr "insn_class" "ex_group,*")])
+  [(set_attr "type" "arith,load")])
  
  (define_insn "*extendhisi2_media"
    [(set (match_operand:SI 0 "register_operand" "=r,r")
@@ -3321,8 +3206,7 @@
    "@
         exts.b  %1,%0
         mov.b   %1,%0"
-  [(set_attr "type" "arith,load")
-   (set_attr "insn_class" "ex_group,*")])
+  [(set_attr "type" "arith,load")])
  
  (define_insn "*extendqisi2_media"
    [(set (match_operand:SI 0 "register_operand" "=r,r")
@@ -3352,8 +3236,7 @@
    "@
         exts.b  %1,%0
         mov.b   %1,%0"
-  [(set_attr "type" "arith,load")
-   (set_attr "insn_class" "ex_group,*")])
+  [(set_attr "type" "arith,load")])
  
  /* It would seem useful to combine the truncXi patterns into the movXi
     patterns, but unary operators are ignored when matching constraints,
@@ -3424,9 +3307,10 @@
  
  (define_insn "push_fpul"
    [(set (mem:SF (pre_dec:SI (reg:SI SP_REG))) (reg:SF FPUL_REG))]
-  "TARGET_SH3E && ! TARGET_SH5"
+  "TARGET_SH2E && ! TARGET_SH5"
    "sts.l       fpul,@-r15"
    [(set_attr "type" "store")
+   (set_attr "late_fp_use" "yes")
     (set_attr "hit_stack" "yes")])
  
  ;; DFmode pushes for sh4 require a lot of what is defined for movdf_i4,
@@ -3449,7 +3333,7 @@
  
  (define_insn "pop_fpul"
    [(set (reg:SF FPUL_REG) (mem:SF (post_inc:SI (reg:SI SP_REG))))]
-  "TARGET_SH3E && ! TARGET_SH5"
+  "TARGET_SH2E && ! TARGET_SH5"
    "lds.l       @r15+,fpul"
    [(set_attr "type" "load")
     (set_attr "hit_stack" "yes")])
@@ -3462,6 +3346,32 @@
    "TARGET_SH1 && ! TARGET_SH5"
    "")
  
+;; Push FPSCR onto the stack: emits a single fpu_switch insn that stores
+;; FPSCR to a PSImode slot addressed by the pre-decremented stack pointer.
+(define_expand "push_fpscr"
+  [(const_int 0)]
+  "TARGET_SH3E"
+  "
+{
+  /* Destination: PSImode memory at the pre-decremented stack pointer.  */
+  rtx sp_dec = gen_rtx (PRE_DEC, Pmode, stack_pointer_rtx);
+  rtx slot = gen_rtx (MEM, PSImode, sp_dec);
+  rtx insn = emit_insn (gen_fpu_switch (slot, get_fpscr_rtx ()));
+
+  /* Record the stack pointer auto-modification on the emitted insn.  */
+  REG_NOTES (insn) = gen_rtx (EXPR_LIST, REG_INC, stack_pointer_rtx, NULL_RTX);
+  DONE;
+}")
+
+;; Pop FPSCR from the stack: emits a single fpu_switch insn that loads
+;; FPSCR from a PSImode slot addressed by the post-incremented stack pointer.
+(define_expand "pop_fpscr"
+  [(const_int 0)]
+  "TARGET_SH3E"
+  "
+{
+  /* Source: PSImode memory at the post-incremented stack pointer.  */
+  rtx sp_inc = gen_rtx (POST_INC, Pmode, stack_pointer_rtx);
+  rtx slot = gen_rtx (MEM, PSImode, sp_inc);
+  rtx insn = emit_insn (gen_fpu_switch (get_fpscr_rtx (), slot));
+
+  /* Record the stack pointer auto-modification on the emitted insn.  */
+  REG_NOTES (insn) = gen_rtx (EXPR_LIST, REG_INC, stack_pointer_rtx, NULL_RTX);
+  DONE;
+}")
+
  ;; These two patterns can happen as the result of optimization, when
  ;; comparisons get simplified to a move of zero or 1 into the T reg.
  ;; They don't disappear completely, because the T reg is a fixed hard reg.
@@ -3483,7 +3393,7 @@
    [(set (match_operand:SI 0 "general_movdst_operand" "=r,r,t,r,r,r,r,m,<,<,x,l,x,l,r")
         (match_operand:SI 1 "general_movsrc_operand" "Q,rI,r,mr,x,l,t,r,x,l,r,r,>,>,i"))]
    "TARGET_SH1
-   && ! TARGET_SH3E
+   && ! TARGET_SH2E
     && (register_operand (operands[0], SImode)
         || register_operand (operands[1], SImode))"
    "@
@@ -3502,18 +3412,19 @@
         lds.l   %1,%0
         lds.l   %1,%0
         fake    %1,%0"
-  [(set_attr "type" "pcload_si,move,*,load_si,move,prget,move,store,store,pstore,move,prset,load,pload,pcload_si")
-   (set_attr "insn_class"  "*,*,mt_group,*,*,*,*,*,*,*,*,*,*,*,*")
+  [(set_attr "type" "pcload_si,move,mt_group,load_si,mac_gp,prget,move,store,store,pstore,move,prset,load,pload,pcload_si")
     (set_attr "length" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*")])
  
  ;; t/r must come after r/r, lest reload will try to reload stuff like
  ;; (subreg:SI (reg:SF FR14_REG) 0) into T (compiling stdlib/strtod.c -m3e -O2)
  ;; ??? This allows moves from macl to fpul to be recognized, but these moves
  ;; will require a reload.
+;; ??? We can't include f/f because we need the proper FPSCR setting when
+;; TARGET_FMOVD is in effect, and mode switching is done before reload.
  (define_insn "movsi_ie"
-  [(set (match_operand:SI 0 "general_movdst_operand" "=r,r,t,r,r,r,r,m,<,<,x,l,x,l,y,<,r,y,r,y")
-       (match_operand:SI 1 "general_movsrc_operand" "Q,rI,r,mr,x,l,t,r,x,l,r,r,>,>,>,y,i,r,y,y"))]
-  "TARGET_SH3E
+  [(set (match_operand:SI 0 "general_movdst_operand" "=r,r,t,r,r,r,r,m,<,<,x,l,x,l,y,<,r,y,r,*f,y,*f,y")
+       (match_operand:SI 1 "general_movsrc_operand" "Q,rI,r,mr,x,l,t,r,x,l,r,r,>,>,>,y,i,r,y,y,*f,*f,y"))]
+  "TARGET_SH2E
     && (register_operand (operands[0], SImode)
         || register_operand (operands[1], SImode))"
    "@
@@ -3536,9 +3447,13 @@
         fake    %1,%0
         lds     %1,%0
         sts     %1,%0
+       fsts    fpul,%0
+       flds    %1,fpul
+       fmov    %1,%0
         ! move optimized away"
-  [(set_attr "type" "pcload_si,move,*,load_si,move,prget,move,store,store,pstore,move,prset,load,pload,load,store,pcload_si,gp_fpul,gp_fpul,nil")
-   (set_attr "length" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,0")])
+  [(set_attr "type" "pcload_si,move,*,load_si,mac_gp,prget,move,store,store,pstore,move,prset,load,pload,load,store,pcload_si,gp_fpul,fpul_gp,fmove,fmove,fmove,nil")
+   (set_attr "late_fp_use" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,yes,*,*,yes,*,*,*,*")
+   (set_attr "length" "*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,0")])
  
  (define_insn "movsi_i_lowpart"
    [(set (strict_low_part (match_operand:SI 0 "general_movdst_operand" "+r,r,r,r,r,r,m,r"))
@@ -3559,16 +3474,16 @@
  
  (define_insn "*movsi_media"
    [(set (match_operand:SI 0 "general_movdst_operand" "=r,r,r,r,m,f,m,f,r,f,*b,r,b")
-       (match_operand:SI 1 "general_movsrc_operand" "r,JS,ns,m,r,m,f,rU,f,f,r,*b,T"))]
+       (match_operand:SI 1 "general_movsrc_operand" "r,JS,ns,m,rU,m,f,rU,f,f,r,*b,T"))]
    "TARGET_SHMEDIA_FPU
     && (register_operand (operands[0], SImode)
-       || register_operand (operands[1], SImode))"
+       || sh_register_operand (operands[1], SImode))"
    "@
         add.l   %1, r63, %0
         movi    %1, %0
         #
         ld%M1.l %m1, %0
-       st%M0.l %m0, %1
+       st%M0.l %m0, %N1
         fld%M1.s        %m1, %0
         fst%M0.s        %m0, %1
         fmov.ls %N1, %0
@@ -3582,16 +3497,16 @@
  
  (define_insn "*movsi_media_nofpu"
    [(set (match_operand:SI 0 "general_movdst_operand" "=r,r,r,r,m,*b,r,b")
-       (match_operand:SI 1 "general_movsrc_operand" "r,JS,ns,m,r,r,*b,T"))]
+       (match_operand:SI 1 "general_movsrc_operand" "r,JS,ns,m,rU,r,*b,T"))]
    "TARGET_SHMEDIA
     && (register_operand (operands[0], SImode)
-       || register_operand (operands[1], SImode))"
+       || sh_register_operand (operands[1], SImode))"
    "@
         add.l   %1, r63, %0
         movi    %1, %0
         #
         ld%M1.l %m1, %0
-       st%M0.l %m0, %1
+       st%M0.l %m0, %N1
         ptabs   %1, %0
         gettr   %1, %0
         pt      %1, %0"
@@ -3662,7 +3577,7 @@
    "TARGET_HARD_SH4"
    "ocbwb\\t@%0\;extu.w\\t%0,%2\;or\\t%1,%2\;mov.l\\t%0,@%2"
    [(set_attr "length" "8")
-   (set_attr "insn_class" "cwb")])
+   (set_attr "type" "cwb")])
  
  ;; ??? could make arg 0 an offsettable memory operand to allow to save
  ;; an add in the code that calculates the address.
@@ -3693,9 +3608,8 @@
  {
    rtx sfun, tramp;
  
+  tramp = force_reg (Pmode, operands[0]);
    sfun = force_reg (Pmode, gen_rtx_SYMBOL_REF (Pmode, \"__init_trampoline\"));
-  tramp = gen_rtx_REG (SImode, R0_REG);
-  emit_move_insn (tramp, operands[0]);
    emit_move_insn (gen_rtx_REG (SImode, R2_REG), operands[1]);
    emit_move_insn (gen_rtx_REG (SImode, R3_REG), operands[2]);
  
@@ -3732,15 +3646,15 @@
  
  (define_insn "*movqi_media"
    [(set (match_operand:QI 0 "general_movdst_operand" "=r,r,r,m")
-       (match_operand:QI 1 "general_movsrc_operand" "r,JS,m,r"))]
+       (match_operand:QI 1 "general_movsrc_operand" "r,JS,m,rU"))]
    "TARGET_SHMEDIA
     && (arith_reg_operand (operands[0], QImode)
-       || arith_reg_operand (operands[1], QImode))"
+       || arith_reg_or_0_operand (operands[1], QImode))"
    "@
         add.l   %1, r63, %0
         movi    %1, %0
         ld%M1.ub        %m1, %0
-       st%M0.b %m0, %1"
+       st%M0.b %m0, %N1"
    [(set_attr "type" "arith_media,arith_media,load_media,store_media")])
  
  (define_expand "movqi"
@@ -3784,16 +3698,16 @@
  
  (define_insn "*movhi_media"
    [(set (match_operand:HI 0 "general_movdst_operand" "=r,r,r,r,m")
-       (match_operand:HI 1 "general_movsrc_operand" "r,JS,n,m,r"))]
+       (match_operand:HI 1 "general_movsrc_operand" "r,JS,n,m,rU"))]
    "TARGET_SHMEDIA
     && (arith_reg_operand (operands[0], HImode)
-       || arith_reg_operand (operands[1], HImode))"
+       || arith_reg_or_0_operand (operands[1], HImode))"
    "@
         add.l   %1, r63, %0
         movi    %1, %0
         #
         ld%M1.w %m1, %0
-       st%M0.w %m0, %1"
+       st%M0.w %m0, %N1"
    [(set_attr "type" "arith_media,arith_media,*,load_media,store_media")])
  
  (define_split
@@ -3825,11 +3739,9 @@
    operands[3] = gen_rtx_REG (DImode, REGNO (operands[2]));
  }")
  
-;; ??? This should be a define expand.
-
  ;; x/r can be created by inlining/cse, e.g. for execute/961213-1.c
  ;; compiled with -m2 -ml -O3 -funroll-loops
-(define_insn ""
+(define_insn "*movdi_i"
    [(set (match_operand:DI 0 "general_movdst_operand" "=r,r,r,m,r,r,r,*!x")
         (match_operand:DI 1 "general_movsrc_operand" "Q,r,m,r,I,i,x,r"))]
    "TARGET_SH1
@@ -3890,16 +3802,16 @@
  
  (define_insn "*movdi_media"
    [(set (match_operand:DI 0 "general_movdst_operand" "=r,r,r,rl,m,f,m,f,r,f,*b,r,b")
-       (match_operand:DI 1 "general_movsrc_operand" "r,JS,iF,m,rl,m,f,rU,f,f,r,*b,T"))]
+       (match_operand:DI 1 "general_movsrc_operand" "r,JS,iF,m,rlU,m,f,rU,f,f,r,*b,T"))]
    "TARGET_SHMEDIA_FPU
     && (register_operand (operands[0], DImode)
-       || register_operand (operands[1], DImode))"
+       || sh_register_operand (operands[1], DImode))"
    "@
         add     %1, r63, %0
         movi    %1, %0
         #
         ld%M1.q %m1, %0
-       st%M0.q %m0, %1
+       st%M0.q %m0, %N1
         fld%M1.d        %m1, %0
         fst%M0.d        %m0, %1
         fmov.qd %N1, %0
@@ -3913,16 +3825,16 @@
  
  (define_insn "*movdi_media_nofpu"
    [(set (match_operand:DI 0 "general_movdst_operand" "=r,r,r,rl,m,*b,r,b")
-       (match_operand:DI 1 "general_movsrc_operand" "r,JS,iF,m,rl,r,*b,T"))]
+       (match_operand:DI 1 "general_movsrc_operand" "r,JS,iF,m,rlU,r,*b,T"))]
    "TARGET_SHMEDIA
     && (register_operand (operands[0], DImode)
-       || register_operand (operands[1], DImode))"
+       || sh_register_operand (operands[1], DImode))"
    "@
         add     %1, r63, %0
         movi    %1, %0
         #
         ld%M1.q %m1, %0
-       st%M0.q %m0, %1
+       st%M0.q %m0, %N1
         ptabs   %1, %0
         gettr   %1, %0
         pt      %1, %0"
@@ -3989,14 +3901,7 @@
     && MOVI_SHORI_BASE_OPERAND_P (operands[1])"
    "
  {
-  if (GET_CODE (operands[1]) == LABEL_REF
-      && GET_CODE (XEXP (operands[1], 0)) == CODE_LABEL)
-    LABEL_NUSES (XEXP (operands[1], 0)) += 4;
-  else if (GOTOFF_P (operands[1])
-          && GET_CODE (XVECEXP (XEXP (operands[1], 0), 0, 0)) == LABEL_REF
-          && (GET_CODE (XEXP (XVECEXP (XEXP (operands[1], 0), 0, 0), 0))
-              == CODE_LABEL))
-    LABEL_NUSES (XEXP (XVECEXP (XEXP (operands[1], 0), 0, 0), 0)) += 4;
+  sh_mark_label (operands[1], 4);
  }")
  
  (define_expand "movdi_const_32bit"
@@ -4018,14 +3923,7 @@
     && MOVI_SHORI_BASE_OPERAND_P (operands[1])"
    "
  {
-  if (GET_CODE (operands[1]) == LABEL_REF
-      && GET_CODE (XEXP (operands[1], 0)) == CODE_LABEL)
-    LABEL_NUSES (XEXP (operands[1], 0)) += 2;
-  else if (GOTOFF_P (operands[1])
-          && GET_CODE (XVECEXP (XEXP (operands[1], 0), 0, 0)) == LABEL_REF
-          && (GET_CODE (XEXP (XVECEXP (XEXP (operands[1], 0), 0, 0), 0))
-              == CODE_LABEL))
-    LABEL_NUSES (XEXP (XVECEXP (XEXP (operands[1], 0), 0, 0), 0)) += 2;
+  sh_mark_label (operands[1], 2);
  }")
  
  (define_expand "movdi_const_16bit"
@@ -4196,10 +4094,10 @@
  
  (define_insn "movdf_media"
    [(set (match_operand:DF 0 "general_movdst_operand" "=f,f,r,r,r,f,m,r,m")
-       (match_operand:DF 1 "general_movsrc_operand" "f,rU,f,r,F,m,f,m,r"))]
+       (match_operand:DF 1 "general_movsrc_operand" "f,rU,f,r,F,m,f,m,rU"))]
    "TARGET_SHMEDIA_FPU
     && (register_operand (operands[0], DFmode)
-       || register_operand (operands[1], DFmode))"
+       || sh_register_operand (operands[1], DFmode))"
    "@
         fmov.d  %1, %0
         fmov.qd %N1, %0
@@ -4209,20 +4107,20 @@
         fld%M1.d        %m1, %0
         fst%M0.d        %m0, %1
         ld%M1.q %m1, %0
-       st%M0.q %m0, %1"
+       st%M0.q %m0, %N1"
    [(set_attr "type" "fmove_media,fload_media,dfpconv_media,arith_media,*,fload_media,fstore_media,load_media,store_media")])
  
  (define_insn "movdf_media_nofpu"
    [(set (match_operand:DF 0 "general_movdst_operand" "=r,r,r,m")
-       (match_operand:DF 1 "general_movsrc_operand" "r,F,m,r"))]
+       (match_operand:DF 1 "general_movsrc_operand" "r,F,m,rU"))]
    "TARGET_SHMEDIA
     && (register_operand (operands[0], DFmode)
-       || register_operand (operands[1], DFmode))"
+       || sh_register_operand (operands[1], DFmode))"
    "@
         add     %1, r63, %0
         #
         ld%M1.q %m1, %0
-       st%M0.q %m0, %1"
+       st%M0.q %m0, %N1"
    [(set_attr "type" "arith_media,*,load_media,store_media")])
  
  (define_split
@@ -4309,7 +4207,8 @@
        (if_then_else
         (ne (symbol_ref "TARGET_SHCOMPACT") (const_int 0))
         (const_int 10) (const_int 8))])
-   (set_attr "type" "fmove,move,pcload,load,store,pcload,load,store,load,load")
+   (set_attr "type" "fmove,move,pcfload,fload,store,pcload,load,store,load,fload")
+   (set_attr "late_fp_use" "*,*,*,*,yes,*,*,*,*,*")
     (set (attr "fp_mode") (if_then_else (eq_attr "fmovd" "yes")
                                            (const_string "double")
                                            (const_string "none")))])
@@ -4465,7 +4364,7 @@
         (match_operand:SF 1 "register_operand" ""))
     (use (match_operand:PSI 2 "fpscr_operand" ""))
     (clobber (match_scratch:SI 3 "X"))]
-  "TARGET_SH3E && reload_completed
+  "TARGET_SH2E && reload_completed
     && true_regnum (operands[0]) == true_regnum (operands[1])"
    [(set (match_dup 0) (match_dup 0))]
    "")
@@ -4805,7 +4704,7 @@
  
  (define_insn_and_split "*movv4sf_i"
    [(set (match_operand:V4SF 0 "nonimmediate_operand" "=f,f,m")
-       (match_operand:V4SF 1 "general_operand" "fU,m,f"))]
+       (match_operand:V4SF 1 "general_operand" "fU,m,fU"))]
    "TARGET_SHMEDIA_FPU"
    "#"
    "&& reload_completed"
@@ -4903,10 +4802,10 @@
  
  (define_insn "movsf_media"
    [(set (match_operand:SF 0 "general_movdst_operand" "=f,f,r,r,r,f,m,r,m")
-       (match_operand:SF 1 "general_movsrc_operand" "f,rU,f,r,F,m,f,m,r"))]
+       (match_operand:SF 1 "general_movsrc_operand" "f,rU,f,r,F,m,f,m,rU"))]
    "TARGET_SHMEDIA_FPU
     && (register_operand (operands[0], SFmode)
-       || register_operand (operands[1], SFmode))"
+       || sh_register_operand (operands[1], SFmode))"
    "@
         fmov.s  %1, %0
         fmov.ls %N1, %0
@@ -4916,20 +4815,20 @@
         fld%M1.s        %m1, %0
         fst%M0.s        %m0, %1
         ld%M1.l %m1, %0
-       st%M0.l %m0, %1"
+       st%M0.l %m0, %N1"
    [(set_attr "type" "fmove_media,fload_media,fpconv_media,arith_media,*,fload_media,fstore_media,load_media,store_media")])
  
  (define_insn "movsf_media_nofpu"
    [(set (match_operand:SF 0 "general_movdst_operand" "=r,r,r,m")
-       (match_operand:SF 1 "general_movsrc_operand" "r,F,m,r"))]
+       (match_operand:SF 1 "general_movsrc_operand" "r,F,m,rU"))]
    "TARGET_SHMEDIA
     && (register_operand (operands[0], SFmode)
-       || register_operand (operands[1], SFmode))"
+       || sh_register_operand (operands[1], SFmode))"
    "@
         add.l   %1, r63, %0
         #
         ld%M1.l %m1, %0
-       st%M0.l %m0, %1"
+       st%M0.l %m0, %N1"
    [(set_attr "type" "arith_media,*,load_media,store_media")])
  
  (define_split
@@ -4954,7 +4853,7 @@
    [(set (match_operand:SF 0 "general_movdst_operand" "=r,r,r,r,m,l,r")
         (match_operand:SF 1 "general_movsrc_operand"  "r,I,FQ,mr,r,r,l"))]
    "TARGET_SH1
-   && (! TARGET_SH3E
+   && (! TARGET_SH2E
         /* ??? We provide some insn so that direct_{load,store}[SFmode] get set */
         || (GET_CODE (operands[0]) == REG && REGNO (operands[0]) == 3)
         || (GET_CODE (operands[1]) == REG && REGNO (operands[1]) == 3))
@@ -4981,7 +4880,7 @@
     (use (match_operand:PSI 2 "fpscr_operand" "c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c"))
     (clobber (match_scratch:SI 3 "=X,X,X,X,&z,X,X,X,X,X,X,X,X,y,X,X,X,X,X"))]
  
-  "TARGET_SH3E
+  "TARGET_SH2E
     && (arith_reg_operand (operands[0], SFmode)
         || arith_reg_operand (operands[1], SFmode)
         || arith_reg_operand (operands[3], SImode)
@@ -5011,7 +4910,8 @@
         sts.l   %1,%0
         lds.l   %1,%0
         ! move optimized away"
-  [(set_attr "type" "fmove,move,fmove,fmove,pcload,load,store,pcload,load,store,fmove,fmove,load,*,gp_fpul,gp_fpul,store,load,nil")
+  [(set_attr "type" "fmove,move,fmove,fmove,pcfload,fload,store,pcload,load,store,fmove,fmove,load,*,fpul_gp,gp_fpul,store,load,nil")
+   (set_attr "late_fp_use" "*,*,*,*,*,*,yes,*,*,*,*,*,*,*,yes,*,yes,*,*")
     (set_attr "length" "*,*,*,*,4,*,*,*,*,*,2,2,2,4,2,2,2,2,0")
     (set (attr "fp_mode") (if_then_else (eq_attr "fmovd" "yes")
                                            (const_string "single")
@@ -5047,7 +4947,7 @@
         emit_insn (gen_movsf_media_nofpu (operands[0], operands[1]));
        DONE;
      }
-  if (TARGET_SH3E)
+  if (TARGET_SH2E)
      {
        emit_sf_insn (gen_movsf_ie (operands[0], operands[1], get_fpscr_rtx ()));
        DONE;
@@ -5056,7 +4956,7 @@
  
  (define_insn "mov_nop"
    [(set (match_operand 0 "any_register_operand" "") (match_dup 0))]
-  "TARGET_SH3E"
+  "TARGET_SH2E"
    ""
    [(set_attr "length" "0")
     (set_attr "type" "nil")])
@@ -5080,7 +4980,7 @@
    [(set (match_operand:SI 0 "register_operand" "=y,y")
         (match_operand:SI 1 "immediate_operand" "Qi,I"))
     (clobber (match_scratch:SI 2 "=&z,r"))]
-  "TARGET_SH3E
+  "TARGET_SH2E
     && (reload_in_progress || reload_completed)"
    "#"
    [(set_attr "length" "4")
@@ -5135,12 +5035,29 @@
  
  ;; This one has the additional purpose to record a possible scratch register
  ;; for the following branch.
+;; ??? Unfortunately, just setting the scratch register is not good enough,
+;; because the insn then might be deemed dead and deleted.  And we can't
+;; make the use in the jump insn explicit because that would disable
+;; delay slot scheduling from the target.
  (define_insn "indirect_jump_scratch"
    [(set (match_operand:SI 0 "register_operand" "=r")
-       (unspec:SI [(match_operand 1 "const_int_operand" "")] UNSPEC_BBR))]
+       (unspec:SI [(match_operand 1 "const_int_operand" "")] UNSPEC_BBR)) 
+   (set (pc) (unspec [(const_int 0)] UNSPEC_BBR))]
    "TARGET_SH1"
    ""
    [(set_attr "length" "0")])
+
+;; This one is used to preempt an insn from beyond the bra / braf / jmp
+;; being pulled into the delay slot of a condbranch that has been made to
+;; jump around the unconditional jump because it was out of range.
+;; The pattern emits no assembly: the output template is empty and the
+;; length attribute is 0.  Both operands are const_ints.  The RTL is written
+;; as a set of pc (to an UNSPEC_BBR unspec of operand 0 and pc) together
+;; with a set of the T register to operand 1, and the insn is tagged with
+;; cond_delay_slot "yes".
+(define_insn "stuff_delay_slot"
+  [(set (pc)
+       (unspec [(match_operand 0 "const_int_operand" "") (pc)] UNSPEC_BBR))
+   (set (reg:SI T_REG) (match_operand 1 "const_int_operand" ""))]
+  "TARGET_SH1"
+  ""
+  [(set_attr "length" "0")
+   (set_attr "cond_delay_slot" "yes")])
  \f
  ;; Conditional branch insns
  
@@ -5388,7 +5305,7 @@
        DONE;
      }
  
-  if (TARGET_SH3E
+  if (TARGET_SH2E
        && TARGET_IEEE
        && GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT)
      {
@@ -5429,7 +5346,7 @@
        DONE;
      }
  
-  if (TARGET_SH3E
+  if (TARGET_SH2E
        && ! TARGET_IEEE
        && GET_MODE_CLASS (GET_MODE (sh_compare_op0)) == MODE_FLOAT)
      {
@@ -5563,6 +5480,19 @@
    [(set_attr "type" "jump")
     (set_attr "needs_delay_slot" "yes")])
  
+;; ??? It would be much saner to explicitly use the scratch register
+;; in the jump insn, and have indirect_jump_scratch only set it,
+;; but fill_simple_delay_slots would refuse to do delay slot filling
+;; from the target then, as it uses simplejump_p.
+;;(define_insn "jump_compact_far"
+;;  [(set (pc)
+;;     (label_ref (match_operand 0 "" "")))
+;;   (use (match_operand 1 "register_operand" "r"))]
+;;  "TARGET_SH1"
+;;  "* return output_far_jump(insn, operands[0], operands[1]);"
+;;  [(set_attr "type" "jump")
+;;   (set_attr "needs_delay_slot" "yes")])
+
  (define_insn "jump_media"
    [(set (pc)
         (match_operand:DI 0 "target_operand" "b"))]
@@ -6585,7 +6515,7 @@
                     (const_int 0))
               (match_operand 1 "" "")
               (match_operand 2 "" "")])]
-  "TARGET_SH3E || TARGET_SHMEDIA"
+  "TARGET_SH2E || TARGET_SHMEDIA"
    "
  {
    int i;
@@ -6896,6 +6826,142 @@
    ""
    "")
  
+;; TLS code generation.
+;; ??? this should be a define_insn_and_split
+;; See the thread [PATCH/RFA] SH TLS support on gcc-patches
+;; <http://gcc.gnu.org/ml/gcc-patches/2003-02/msg01898.html>
+;; for details.
+
+;; Global-dynamic TLS access.  Operand 1 is the symbol emitted with the
+;; @TLSGD suffix; operand 0 (constraint "=&z") receives the unspec result.
+;; The output template hard-codes its registers: it loads the @TLSGD
+;; literal into r4, builds the address of __tls_get_addr@PLT in r1
+;; (mova of label 2 into r0, literal load into r1, add), calls it with jsr
+;; with "add r12,r4" in the delay slot, then branches over the two inline
+;; literals.  Length 26 = eight 2-byte insns, up to 2 bytes of .align
+;; padding, and two 4-byte literals.
+;; NOTE(review): the template writes r1 and r4, but the pattern only names
+;; PR and an anonymous scratch as clobbers -- confirm this is accounted for
+;; elsewhere.
+(define_insn "tls_global_dynamic"
+  [(set (match_operand:SI 0 "register_operand" "=&z")
+       (unspec:SI [(match_operand:SI 1 "" "")]
+                   UNSPEC_TLSGD))
+   (use (reg:PSI FPSCR_REG))
+   (use (reg:SI PIC_REG))
+   (clobber (reg:SI PR_REG))
+   (clobber (scratch:SI))]
+  "TARGET_SH1"
+  "*
+{
+  return \"\\
+mov.l\\t1f,r4\\n\\
+\\tmova\\t2f,r0\\n\\
+\\tmov.l\\t2f,r1\\n\\
+\\tadd\\tr0,r1\\n\\
+\\tjsr\\t@r1\\n\\
+\\tadd\\tr12,r4\\n\\
+\\tbra\\t3f\\n\\
+\\tnop\\n\\
+\\t.align\\t2\\n\\
+1:\\t.long\\t%a1@TLSGD\\n\\
+2:\\t.long\\t__tls_get_addr@PLT\\n\\
+3:\";
+}"
+  [(set_attr "type" "tls_load")
+   (set_attr "length" "26")])
+
+;; Local-dynamic TLS access.  Operand 1 is the symbol emitted with the
+;; @TLSLDM suffix; operand 0 (constraint "=&z") receives the unspec result.
+;; The output template hard-codes its registers: it loads the @TLSLDM
+;; literal into r4, builds the address of __tls_get_addr@PLT in r1
+;; (mova of label 2 into r0, literal load into r1, add), calls it with jsr
+;; with "add r12,r4" in the delay slot, then branches over the two inline
+;; literals.  Length 26 = eight 2-byte insns, up to 2 bytes of .align
+;; padding, and two 4-byte literals.
+;; NOTE(review): the template writes r1 and r4, but the pattern only names
+;; PR and an anonymous scratch as clobbers -- confirm this is accounted for
+;; elsewhere.
+(define_insn "tls_local_dynamic"
+  [(set (match_operand:SI 0 "register_operand" "=&z")
+       (unspec:SI [(match_operand:SI 1 "" "")]
+                   UNSPEC_TLSLDM))
+   (use (reg:PSI FPSCR_REG))
+   (use (reg:SI PIC_REG))
+   (clobber (reg:SI PR_REG))
+   (clobber (scratch:SI))]
+  "TARGET_SH1"
+  "*
+{
+  return \"\\
+mov.l\\t1f,r4\\n\\
+\\tmova\\t2f,r0\\n\\
+\\tmov.l\\t2f,r1\\n\\
+\\tadd\\tr0,r1\\n\\
+\\tjsr\\t@r1\\n\\
+\\tadd\\tr12,r4\\n\\
+\\tbra\\t3f\\n\\
+\\tnop\\n\\
+\\t.align\\t2\\n\\
+1:\\t.long\\t%a1@TLSLDM\\n\\
+2:\\t.long\\t__tls_get_addr@PLT\\n\\
+3:\";
+}"
+  [(set_attr "type" "tls_load")
+   (set_attr "length" "26")])
+
+;; Wrap operand 0 in (const (unspec [...] UNSPEC_DTPOFF)).  The expander has
+;; no condition and no preparation code; it only builds that RTL.
+(define_expand "sym2DTPOFF"
+  [(const (unspec [(match_operand 0 "" "")] UNSPEC_DTPOFF))]
+  ""
+  "")
+
+;; Set operand 0 to the DTPOFF form of symbol operand 1 plus operand 2.
+;; The DTPOFF constant is first moved into a temporary (a fresh pseudo, or
+;; operand 0 itself when no_new_pseudos is set), and then the sum of that
+;; temporary and operand 2 is moved into operand 0.
+(define_expand "symDTPOFF2reg"
+  [(match_operand 0 "" "") (match_operand 1 "" "") (match_operand 2 "" "")]
+  ""
+  "
+{
+  rtx dtpoffsym;
+  rtx t = no_new_pseudos ? operands[0] : gen_reg_rtx (GET_MODE (operands[0]));
+
+  /* gen_sym2DTPOFF builds the (const (unspec ...)) wrapper; give it Pmode
+     before using it as a move source.  */
+  dtpoffsym = gen_sym2DTPOFF (operands[1]);
+  PUT_MODE (dtpoffsym, Pmode);
+  emit_move_insn (t, dtpoffsym);
+  emit_move_insn (operands[0], gen_rtx_PLUS (Pmode, t, operands[2]));
+  DONE;
+}")
+
+;; Wrap operand 0 in (const (unspec [...] UNSPEC_GOTTPOFF)).  The expander
+;; has no condition and no preparation code; it only builds that RTL.
+(define_expand "sym2GOTTPOFF"
+  [(const (unspec [(match_operand 0 "" "")] UNSPEC_GOTTPOFF))]
+  ""
+  "")
+
+;; Initial-exec TLS access.  The template loads the inline literal for
+;; operand 1 into r0, copies GBR into operand 0 with stc, loads from
+;; @(r0,r12) back into r0, and branches over the literal with
+;; "add r0,%0" in the delay slot, so operand 0 ends up as GBR plus the
+;; loaded value.  r0 is declared clobbered; operand 0 is an earlyclobber
+;; general register ("=&r").
+;; Length 16 = five 2-byte insns, up to 2 bytes of .align padding, and one
+;; 4-byte literal.
+;; NOTE(review): the literal is printed as plain %a1 with no relocation
+;; suffix -- presumably operand 1 already carries it; confirm against the
+;; caller.  The insn condition is empty, unlike the TARGET_SH1 condition on
+;; tls_global_dynamic / tls_local_dynamic.
+(define_insn "tls_initial_exec"
+  [(set (match_operand:SI 0 "register_operand" "=&r")
+       (unspec:SI [(match_operand:SI 1 "" "")]
+                   UNSPEC_TLSIE))
+   (use (reg:SI GBR_REG))
+   (use (reg:SI PIC_REG))
+   (clobber (reg:SI R0_REG))]
+  ""
+  "*
+{
+  return \"\\
+mov.l\\t1f,r0\\n\\
+\\tstc\\tgbr,%0\\n\\
+\\tmov.l\\t@(r0,r12),r0\\n\\
+\\tbra\\t2f\\n\\
+\\tadd\\tr0,%0\\n\\
+\\t.align\\t2\\n\\
+1:\\t.long\\t%a1\\n\\
+2:\";
+}"
+  [(set_attr "type" "tls_load")
+   (set_attr "length" "16")])
+
+;; Wrap operand 0 in (const (unspec [...] UNSPEC_TPOFF)).  The expander has
+;; no condition and no preparation code; it only builds that RTL.
+(define_expand "sym2TPOFF"
+  [(const (unspec [(match_operand 0 "" "")] UNSPEC_TPOFF))]
+  ""
+  "")
+
+;; Move the TPOFF form of symbol operand 1 into operand 0.
+(define_expand "symTPOFF2reg"
+  [(match_operand 0 "" "") (match_operand 1 "" "")]
+  ""
+  "
+{
+  rtx tpoffsym;
+
+  /* gen_sym2TPOFF builds the (const (unspec ...)) wrapper; give it Pmode
+     before using it as a move source.  */
+  tpoffsym = gen_sym2TPOFF (operands[1]);
+  PUT_MODE (tpoffsym, Pmode);
+  emit_move_insn (operands[0], tpoffsym);
+  DONE;
+}")
+
+;; Copy GBR into operand 0 with "stc gbr,%0".
+;; Operand 0 is the destination of the set, so it carries an output
+;; constraint restricted to general registers ("=r"); an empty constraint
+;; would neither mark it as written nor limit the register class chosen
+;; for the stc destination.
+(define_insn "load_gbr"
+  [(set (match_operand:SI 0 "register_operand" "=r") (reg:SI GBR_REG))
+   (use (reg:SI GBR_REG))]
+  ""
+  "stc gbr,%0"
+  [(set_attr "type" "tls_load")])
+
  ;; case instruction for switch statements.
  
  ;; Operand 0 is index
@@ -7244,6 +7310,48 @@
    DONE;
  }")
  
+;; Expand an exception-handler return.  Operand 0 is the stack adjustment
+;; and operand 1 the handler return address (named sa / ra below).
+(define_expand "eh_return"
+  [(use (match_operand 0 "register_operand" ""))
+   (use (match_operand 1 "register_operand" ""))]
+  ""
+{
+  rtx sa = operands[0], ra = operands[1];
+
+  /* Emit the placeholder insn that overwrites the saved return address;
+     the DImode variant is used only for TARGET_SHMEDIA64.  */
+  if (TARGET_SHMEDIA64)
+    emit_insn (gen_eh_set_ra_di (ra));
+  else
+    emit_insn (gen_eh_set_ra_si (ra));
+
+  emit_move_insn (EH_RETURN_STACKADJ_RTX, sa);
+  DONE;
+})
+
+;; Clobber the return address on the stack.  We can't expand this
+;; until we know where it will be put in the stack frame.
+
+;; SImode placeholder, used when TARGET_SHMEDIA64 is not set.  Operand 0 is
+;; the new return address in a general register; operand 1 is an
+;; earlyclobber SImode scratch.  The "#" output template means this insn is
+;; never output directly and must be split.
+(define_insn "eh_set_ra_si"
+  [(unspec [(match_operand:SI 0 "register_operand" "r")] UNSPEC_EH_RETURN)
+   (clobber (match_scratch:SI 1 "=&r"))]
+  "! TARGET_SHMEDIA64"
+  "#")
+
+;; DImode placeholder, used when TARGET_SHMEDIA64 is set.  Operand 0 is the
+;; new return address in a general register; operand 1 is an earlyclobber
+;; DImode scratch.  The "#" output template means this insn is never output
+;; directly and must be split.
+(define_insn "eh_set_ra_di"
+  [(unspec [(match_operand:DI 0 "register_operand" "r")] UNSPEC_EH_RETURN)
+   (clobber (match_scratch:DI 1 "=&r"))]
+  "TARGET_SHMEDIA64"
+  "#")
+
+;; After reload, replace an UNSPEC_EH_RETURN placeholder (either mode: the
+;; operands here are modeless) by whatever sh_set_return_address emits for
+;; the return-address register (operand 0) and the scratch (operand 1).
+(define_split
+  [(unspec [(match_operand 0 "register_operand" "")] UNSPEC_EH_RETURN)
+   (clobber (match_scratch 1 ""))]
+  "reload_completed"
+  [(const_int 0)]
+  "
+{
+  sh_set_return_address (operands[0], operands[1]);
+  DONE;
+}")
+
  (define_insn "blockage"
    [(unspec_volatile [(const_int 0)] UNSPECV_BLOCKAGE)]
    ""
@@ -7980,12 +8088,12 @@
  ;; GO_IF_LEGITIMATE_ADDRESS guards about bogus addresses before reload,
  ;; SECONDARY_INPUT_RELOAD_CLASS does this during reload, and the insn's
  ;; predicate after reload.
-;; The gp_fpul type for r/!c might look a bit odd, but it actually schedules
-;; like a gpr <-> fpul move.
+;; The mac_gp type for r/!c might look a bit odd, but it actually schedules
+;; like a mac -> gpr move.
  (define_insn "fpu_switch"
-  [(set (match_operand:PSI 0 "register_operand" "=c,c,r,c,c,r,m,r")
-       (match_operand:PSI 1 "general_movsrc_operand" "c,>,m,m,r,r,r,!c"))]
-  "TARGET_SH4
+  [(set (match_operand:PSI 0 "general_movdst_operand" "=c,c,r,c,c,r,m,r,<")
+       (match_operand:PSI 1 "general_movsrc_operand" "c,>,m,m,r,r,r,!c,c"))]
+  "TARGET_SH3E
     && (! reload_completed
         || true_regnum (operands[0]) != FPSCR_REG
         || GET_CODE (operands[1]) != MEM
@@ -7998,10 +8106,10 @@
         lds     %1,fpscr
         mov     %1,%0
         mov.l   %1,%0
-       sts     fpscr,%0"
-  [(set_attr "length" "0,2,2,4,2,2,2,2")
-   (set_attr "type" "dfp_conv,dfp_conv,load,dfp_conv,dfp_conv,move,store,gp_fpul")
-   (set_attr "insn_class" "ldsmem_to_fpscr,*,*,lds_to_fpscr,*,*,*,*")])
+       sts     fpscr,%0
+       sts.l   fpscr,%0"
+  [(set_attr "length" "0,2,2,4,2,2,2,2,2")
+   (set_attr "type" "nil,mem_fpscr,load,mem_fpscr,gp_fpscr,move,store,mac_gp,store")])
  
  (define_split
    [(set (reg:PSI FPSCR_REG)
@@ -8047,10 +8155,10 @@
    [(set (match_operand:SF 0 "arith_reg_operand" "")
         (plus:SF (match_operand:SF 1 "arith_reg_operand" "")
                  (match_operand:SF 2 "arith_reg_operand" "")))]
-  "TARGET_SH3E || TARGET_SHMEDIA_FPU"
+  "TARGET_SH2E || TARGET_SHMEDIA_FPU"
    "
  {
-  if (TARGET_SH3E)
+  if (TARGET_SH2E)
      {
        expand_sf_binop (&gen_addsf3_i, operands);
        DONE;
@@ -8101,7 +8209,7 @@
          (vec_concat:V2SF
           (vec_select:SF
            (match_dup 0)
-          (parallel [(not:BI (match_operand 4 "const_int_operand" "n"))]))
+          (parallel [(match_operand 7 "const_int_operand" "n")]))
           (match_operator:SF 3 "binary_float_operator"
             [(vec_select:SF (match_operand:V2SF 1 "fp_arith_reg_operand" "f")
                             (parallel [(match_operand 5
@@ -8109,11 +8217,11 @@
              (vec_select:SF (match_operand:V2SF 2 "fp_arith_reg_operand" "f")
                             (parallel [(match_operand 6
                                         "const_int_operand" "n")]))]))
-        (parallel [(not:BI (match_dup 4)) (match_dup 4)])))]
-  "TARGET_SHMEDIA_FPU"
+        (parallel [(match_dup 7) (match_operand 4 "const_int_operand" "n")])))]
+  "TARGET_SHMEDIA_FPU && INTVAL (operands[4]) != INTVAL (operands[7])"
    "#"
-  "TARGET_SHMEDIA_FPU && reload_completed"
-  [(set (match_dup 7) (match_dup 8))]
+  "&& reload_completed"
+  [(set (match_dup 8) (match_dup 9))]
    "
  {
    int endian = TARGET_LITTLE_ENDIAN ? 0 : 1;
@@ -8124,10 +8232,10 @@
                          (true_regnum (operands[2])
                           + (INTVAL (operands[6]) ^ endian)));
  
-  operands[7] = gen_rtx_REG (SFmode,
+  operands[8] = gen_rtx_REG (SFmode,
                              (true_regnum (operands[0])
                               + (INTVAL (operands[4]) ^ endian)));
-  operands[8] = gen_rtx (GET_CODE (operands[3]), SFmode, op1, op2);
+  operands[9] = gen_rtx (GET_CODE (operands[3]), SFmode, op1, op2);
  }"
    [(set_attr "type" "fparith_media")])
  
@@ -8136,7 +8244,7 @@
         (plus:SF (match_operand:SF 1 "arith_reg_operand" "%0")
                  (match_operand:SF 2 "arith_reg_operand" "f")))
     (use (match_operand:PSI 3 "fpscr_operand" "c"))]
-  "TARGET_SH3E"
+  "TARGET_SH2E"
    "fadd        %2,%0"
    [(set_attr "type" "fp")
     (set_attr "fp_mode" "single")])
@@ -8145,10 +8253,10 @@
    [(set (match_operand:SF 0 "fp_arith_reg_operand" "")
         (minus:SF (match_operand:SF 1 "fp_arith_reg_operand" "")
                   (match_operand:SF 2 "fp_arith_reg_operand" "")))]
-  "TARGET_SH3E || TARGET_SHMEDIA_FPU"
+  "TARGET_SH2E || TARGET_SHMEDIA_FPU"
    "
  {
-  if (TARGET_SH3E)
+  if (TARGET_SH2E)
      {
        expand_sf_binop (&gen_subsf3_i, operands);
        DONE;
@@ -8168,7 +8276,7 @@
         (minus:SF (match_operand:SF 1 "fp_arith_reg_operand" "0")
                  (match_operand:SF 2 "fp_arith_reg_operand" "f")))
     (use (match_operand:PSI 3 "fpscr_operand" "c"))]
-  "TARGET_SH3E"
+  "TARGET_SH2E"
    "fsub        %2,%0"
    [(set_attr "type" "fp")
     (set_attr "fp_mode" "single")])
@@ -8182,12 +8290,12 @@
    [(set (match_operand:SF 0 "fp_arith_reg_operand" "")
         (mult:SF (match_operand:SF 1 "fp_arith_reg_operand" "")
                  (match_operand:SF 2 "fp_arith_reg_operand" "")))]
-  "TARGET_SH3E || TARGET_SHMEDIA_FPU"
+  "TARGET_SH2E || TARGET_SHMEDIA_FPU"
    "
  {
    if (TARGET_SH4)
      expand_sf_binop (&gen_mulsf3_i4, operands);
-  else if (TARGET_SH3E)
+  else if (TARGET_SH2E)
      emit_insn (gen_mulsf3_ie (operands[0], operands[1], operands[2]));
    if (! TARGET_SHMEDIA)
      DONE;
@@ -8206,7 +8314,7 @@
         (mult:SF (match_operand:SF 1 "fp_arith_reg_operand" "%0")
                  (match_operand:SF 2 "fp_arith_reg_operand" "f")))
     (use (match_operand:PSI 3 "fpscr_operand" "c"))]
-  "TARGET_SH3E"
+  "TARGET_SH2E"
    "fmul        %2,%0"
    [(set_attr "type" "fp")
     (set_attr "fp_mode" "single")])
@@ -8215,7 +8323,7 @@
    [(set (match_operand:SF 0 "fp_arith_reg_operand" "=f")
         (mult:SF (match_operand:SF 1 "fp_arith_reg_operand" "%0")
                  (match_operand:SF 2 "fp_arith_reg_operand" "f")))]
-  "TARGET_SH3E && ! TARGET_SH4"
+  "TARGET_SH2E && ! TARGET_SH4"
    "fmul        %2,%0"
    [(set_attr "type" "fp")])
  
@@ -8234,7 +8342,7 @@
                           (match_operand:SF 2 "fp_arith_reg_operand" "f"))
                  (match_operand:SF 3 "arith_reg_operand" "0")))
     (use (match_operand:PSI 4 "fpscr_operand" "c"))]
-  "TARGET_SH3E && ! TARGET_SH4"
+  "TARGET_SH2E && ! TARGET_SH4"
    "fmac        fr0,%2,%0"
    [(set_attr "type" "fp")
     (set_attr "fp_mode" "single")])
@@ -8243,10 +8351,10 @@
    [(set (match_operand:SF 0 "arith_reg_operand" "")
         (div:SF (match_operand:SF 1 "arith_reg_operand" "")
                 (match_operand:SF 2 "arith_reg_operand" "")))]
-  "TARGET_SH3E || TARGET_SHMEDIA_FPU"
+  "TARGET_SH2E || TARGET_SHMEDIA_FPU"
    "
  {
-  if (TARGET_SH3E)
+  if (TARGET_SH2E)
      {
        expand_sf_binop (&gen_divsf3_i, operands);
        DONE;
@@ -8266,7 +8374,7 @@
         (div:SF (match_operand:SF 1 "arith_reg_operand" "0")
                  (match_operand:SF 2 "arith_reg_operand" "f")))
     (use (match_operand:PSI 3 "fpscr_operand" "c"))]
-  "TARGET_SH3E"
+  "TARGET_SH2E"
    "fdiv        %2,%0"
    [(set_attr "type" "fdiv")
     (set_attr "fp_mode" "single")])
@@ -8281,7 +8389,7 @@
  (define_expand "floatsisf2"
    [(set (match_operand:SF 0 "fp_arith_reg_operand" "")
         (float:SF (match_operand:SI 1 "fpul_operand" "")))]
-  "TARGET_SH3E || TARGET_SHMEDIA_FPU"
+  "TARGET_SH2E || TARGET_SHMEDIA_FPU"
    "
  {
    if (TARGET_SH4)
@@ -8310,7 +8418,7 @@
  (define_insn "*floatsisf2_ie"
    [(set (match_operand:SF 0 "fp_arith_reg_operand" "=f")
         (float:SF (match_operand:SI 1 "fpul_operand" "y")))]
-  "TARGET_SH3E && ! TARGET_SH4"
+  "TARGET_SH2E && ! TARGET_SH4"
    "float       %1,%0"
    [(set_attr "type" "fp")])
  
@@ -8324,7 +8432,7 @@
  (define_expand "fix_truncsfsi2"
    [(set (match_operand:SI 0 "fpul_operand" "=y")
         (fix:SI (match_operand:SF 1 "fp_arith_reg_operand" "f")))]
-  "TARGET_SH3E || TARGET_SHMEDIA_FPU"
+  "TARGET_SH2E || TARGET_SHMEDIA_FPU"
    "
  {
    if (TARGET_SH4)
@@ -8347,7 +8455,7 @@
     (use (match_operand:PSI 2 "fpscr_operand" "c"))]
    "TARGET_SH4"
    "ftrc        %1,%0"
-  [(set_attr "type" "fp")
+  [(set_attr "type" "ftrc_s")
     (set_attr "fp_mode" "single")])
  
  ;; ??? This pattern is used nowhere.  fix_truncsfsi2 always expands to
@@ -8375,7 +8483,7 @@
  (define_insn "*fixsfsi"
    [(set (match_operand:SI 0 "fpul_operand" "=y")
         (fix:SI (match_operand:SF 1 "fp_arith_reg_operand" "f")))]
-  "TARGET_SH3E && ! TARGET_SH4"
+  "TARGET_SH2E && ! TARGET_SH4"
    "ftrc        %1,%0"
    [(set_attr "type" "fp")])
  
@@ -8383,7 +8491,7 @@
    [(set (reg:SI T_REG)
         (gt:SI (match_operand:SF 0 "fp_arith_reg_operand" "f")
                (match_operand:SF 1 "fp_arith_reg_operand" "f")))]
-  "TARGET_SH3E && ! TARGET_SH4"
+  "TARGET_SH2E && ! TARGET_SH4"
    "fcmp/gt     %1,%0"
    [(set_attr "type" "fp")
     (set_attr "fp_mode" "single")])
@@ -8392,7 +8500,7 @@
    [(set (reg:SI T_REG)
         (eq:SI (match_operand:SF 0 "fp_arith_reg_operand" "f")
                (match_operand:SF 1 "fp_arith_reg_operand" "f")))]
-  "TARGET_SH3E && ! TARGET_SH4"
+  "TARGET_SH2E && ! TARGET_SH4"
    "fcmp/eq     %1,%0"
    [(set_attr "type" "fp")
     (set_attr "fp_mode" "single")])
@@ -8402,7 +8510,7 @@
         (ior:SI (reg:SI T_REG)
                 (eq:SI (match_operand:SF 0 "fp_arith_reg_operand" "f")
                        (match_operand:SF 1 "fp_arith_reg_operand" "f"))))]
-  "TARGET_SH3E && TARGET_IEEE && ! TARGET_SH4"
+  "TARGET_SH2E && TARGET_IEEE && ! TARGET_SH4"
    "* return output_ieee_ccmpeq (insn, operands);"
    [(set_attr "length" "4")])
  
@@ -8474,7 +8582,7 @@
    [(set (reg:SI T_REG)
         (compare (match_operand:SF 0 "arith_operand" "")
                  (match_operand:SF 1 "arith_operand" "")))]
-  "TARGET_SH3E || TARGET_SHMEDIA_FPU"
+  "TARGET_SH2E || TARGET_SHMEDIA_FPU"
    "
  {
    sh_compare_op0 = operands[0];
@@ -8485,10 +8593,10 @@
  (define_expand "negsf2"
    [(set (match_operand:SF 0 "fp_arith_reg_operand" "")
         (neg:SF (match_operand:SF 1 "fp_arith_reg_operand" "")))]
-  "TARGET_SH3E || TARGET_SHMEDIA_FPU"
+  "TARGET_SH2E || TARGET_SHMEDIA_FPU"
    "
  {
-  if (TARGET_SH3E)
+  if (TARGET_SH2E)
      {
        expand_sf_unop (&gen_negsf2_i, operands);
        DONE;
@@ -8506,7 +8614,7 @@
    [(set (match_operand:SF 0 "fp_arith_reg_operand" "=f")
         (neg:SF (match_operand:SF 1 "fp_arith_reg_operand" "0")))
     (use (match_operand:PSI 2 "fpscr_operand" "c"))]
-  "TARGET_SH3E"
+  "TARGET_SH2E"
    "fneg        %0"
    [(set_attr "type" "fmove")
     (set_attr "fp_mode" "single")])
@@ -8543,10 +8651,10 @@
  (define_expand "abssf2"
    [(set (match_operand:SF 0 "fp_arith_reg_operand" "")
         (abs:SF (match_operand:SF 1 "fp_arith_reg_operand" "")))]
-  "TARGET_SH3E || TARGET_SHMEDIA_FPU"
+  "TARGET_SH2E || TARGET_SHMEDIA_FPU"
    "
  {
-  if (TARGET_SH3E)
+  if (TARGET_SH2E)
      {
        expand_sf_unop (&gen_abssf2_i, operands);
        DONE;
@@ -8564,7 +8672,7 @@
    [(set (match_operand:SF 0 "fp_arith_reg_operand" "=f")
         (abs:SF (match_operand:SF 1 "fp_arith_reg_operand" "0")))
     (use (match_operand:PSI 2 "fpscr_operand" "c"))]
-  "TARGET_SH3E"
+  "TARGET_SH2E"
    "fabs        %0"
    [(set_attr "type" "fmove")
     (set_attr "fp_mode" "single")])
@@ -8769,6 +8877,7 @@
    "TARGET_SH4"
    "ftrc        %1,%0"
    [(set_attr "type" "dfp_conv")
+   (set_attr "dfp_comp" "no")
     (set_attr "fp_mode" "double")])
  
  ;; ??? This pattern is used nowhere.  fix_truncdfsi2 always expands to
@@ -9198,7 +9307,7 @@
         (plus:SI (match_dup 0) (match_operand:SI 1 "register_operand" "r")))
     (set (mem:SF (match_dup 0))
         (match_operand:SF 2 "general_movsrc_operand" ""))]
-  "TARGET_SH3E && REGNO (operands[0]) == 0
+  "TARGET_SH2E && REGNO (operands[0]) == 0
     && ((GET_CODE (operands[2]) == REG
          && FP_OR_XD_REGISTER_P (REGNO (operands[2])))
         || (GET_CODE (operands[2]) == SUBREG
@@ -9212,7 +9321,7 @@
     (set (match_operand:SF 2 "general_movdst_operand" "")
  
         (mem:SF (match_dup 0)))]
-  "TARGET_SH3E && REGNO (operands[0]) == 0
+  "TARGET_SH2E && REGNO (operands[0]) == 0
     && ((GET_CODE (operands[2]) == REG
         && FP_OR_XD_REGISTER_P (REGNO (operands[2])))
         || (GET_CODE (operands[2]) == SUBREG
@@ -9253,16 +9362,16 @@
  
  (define_insn "movv8qi_i"
    [(set (match_operand:V8QI 0 "general_movdst_operand" "=r,r,r,rl,m")
-       (match_operand:V8QI 1 "general_movsrc_operand" "r,JSU,nW,m,rl"))]
+       (match_operand:V8QI 1 "general_movsrc_operand" "r,JSU,nW,m,rlU"))]
    "TARGET_SHMEDIA
     && (register_operand (operands[0], V8QImode)
-       || register_operand (operands[1], V8QImode))"
+       || sh_register_operand (operands[1], V8QImode))"
    "@
         add     %1, r63, %0
         movi    %1, %0
         #
         ld%M1.q %m1, %0
-       st%M0.q %m0, %1"
+       st%M0.q %m0, %N1"
    [(set_attr "type"   "arith_media,arith_media,*,load_media,store_media")
     (set_attr "length" "4,4,16,4,4")])
  
@@ -9283,7 +9392,9 @@
     && VECTOR_MODE_SUPPORTED_P (GET_MODE (operands[0]))
     && GET_MODE_SIZE (GET_MODE (operands[0])) == 8
     && (XVECEXP (operands[1], 0, 0) != const0_rtx
-       || XVECEXP (operands[1], 0, 1) != const0_rtx)"
+       || XVECEXP (operands[1], 0, 1) != const0_rtx)
+   && (XVECEXP (operands[1], 0, 0) != constm1_rtx
+       || XVECEXP (operands[1], 0, 1) != constm1_rtx)"
    [(set (match_dup 0) (match_dup 1))
     (match_dup 2)]
    "
@@ -9294,7 +9405,11 @@
    if (unit_size > 2)
      operands[2] = gen_mshflo_l (operands[0], operands[0], operands[0]);
    else
-    operands[2] = gen_mperm_w0 (operands[0], operands[0]);
+    {
+      if (unit_size < 2)
+       operands[0] = gen_rtx_REG (V4HImode, true_regnum (operands[0]));
+      operands[2] = gen_mperm_w0 (operands[0], operands[0]);
+    }
    operands[0] = gen_rtx_REG (DImode, true_regnum (operands[0]));
    operands[1] = XVECEXP (operands[1], 0, 0);
    if (unit_size < 2)
@@ -9339,16 +9454,16 @@
  
  (define_insn "movv2hi_i"
    [(set (match_operand:V2HI 0 "general_movdst_operand" "=r,r,r,rl,m")
-       (match_operand:V2HI 1 "general_movsrc_operand" "r,JSU,nW,m,rl"))]
+       (match_operand:V2HI 1 "general_movsrc_operand" "r,JSU,nW,m,rlU"))]
    "TARGET_SHMEDIA
     && (register_operand (operands[0], V2HImode)
-       || register_operand (operands[1], V2HImode))"
+       || sh_register_operand (operands[1], V2HImode))"
    "@
         addz.l  %1, r63, %0
         movi    %1, %0
         #
         ld%M1.l %m1, %0
-       st%M0.l %m0, %1"
+       st%M0.l %m0, %N1"
    [(set_attr "type"   "arith_media,arith_media,*,load_media,store_media")
     (set_attr "length" "4,4,16,4,4")])
  
@@ -9360,16 +9475,16 @@
  
  (define_insn "movv4hi_i"
    [(set (match_operand:V4HI 0 "general_movdst_operand" "=r,r,r,rl,m")
-       (match_operand:V4HI 1 "general_movsrc_operand" "r,JSU,nW,m,rl"))]
+       (match_operand:V4HI 1 "general_movsrc_operand" "r,JSU,nW,m,rlU"))]
    "TARGET_SHMEDIA
     && (register_operand (operands[0], V4HImode)
-       || register_operand (operands[1], V4HImode))"
+       || sh_register_operand (operands[1], V4HImode))"
    "@
         add     %1, r63, %0
         movi    %1, %0
         #
         ld%M1.q %m1, %0
-       st%M0.q %m0, %1"
+       st%M0.q %m0, %N1"
    [(set_attr "type"   "arith_media,arith_media,*,load_media,store_media")
     (set_attr "length" "4,4,16,4,4")])
  
@@ -9381,16 +9496,16 @@
  
  (define_insn "movv2si_i"
    [(set (match_operand:V2SI 0 "general_movdst_operand" "=r,r,r,rl,m")
-       (match_operand:V2SI 1 "general_movsrc_operand" "r,JSU,nW,m,rl"))]
+       (match_operand:V2SI 1 "general_movsrc_operand" "r,JSU,nW,m,rlU"))]
    "TARGET_SHMEDIA
     && (register_operand (operands[0], V2SImode)
-       || register_operand (operands[1], V2SImode))"
+       || sh_register_operand (operands[1], V2SImode))"
    "@
         add     %1, r63, %0
         #
         #
         ld%M1.q %m1, %0
-       st%M0.q %m0, %1"
+       st%M0.q %m0, %N1"
    [(set_attr "type"   "arith_media,arith_media,*,load_media,store_media")
     (set_attr "length" "4,4,16,4,4")])
  
@@ -9861,6 +9976,7 @@
  {
    emit_insn ((TARGET_LITTLE_ENDIAN ? gen_mperm_w_little : gen_mperm_w_big)
              (operands[0], operands[1], operands[2]));
+  DONE;
  }")
  
  ; This use of vec_select isn't exactly correct according to rtl.texi
@@ -9870,11 +9986,11 @@
         (vec_select:V4HI
          (match_operand:V4HI 1 "arith_reg_operand" "r")
          (parallel
-         [(zero_extract (match_operand:QI 2 "extend_reg_or_0_operand" "rU")
-                        (const_int 2) (const_int 0))
-          (zero_extract (match_dup 2) (const_int 2) (const_int 2))
-          (zero_extract (match_dup 2) (const_int 2) (const_int 4))
-          (zero_extract (match_dup 2) (const_int 2) (const_int 6))])))]
+         [(zero_extract:QI (match_operand:QI 2 "extend_reg_or_0_operand" "rU")
+                           (const_int 2) (const_int 0))
+          (zero_extract:QI (match_dup 2) (const_int 2) (const_int 2))
+          (zero_extract:QI (match_dup 2) (const_int 2) (const_int 4))
+          (zero_extract:QI (match_dup 2) (const_int 2) (const_int 6))])))]
    "TARGET_SHMEDIA && TARGET_LITTLE_ENDIAN"
    "mperm.w     %1, %N2, %0"
    [(set_attr "type" "arith_media")])
@@ -9884,12 +10000,13 @@
         (vec_select:V4HI
          (match_operand:V4HI 1 "arith_reg_operand" "r")
          (parallel
-         [(zero_extract (not:QI (match_operand:QI 2
-                                 "extend_reg_or_0_operand" "rU"))
-                        (const_int 2) (const_int 0))
-          (zero_extract (not:QI (match_dup 2)) (const_int 2) (const_int 2))
-          (zero_extract (not:QI (match_dup 2)) (const_int 2) (const_int 4))
-          (zero_extract (not:QI (match_dup 2)) (const_int 2) (const_int 6))])))]
+         [(zero_extract:QI (not:QI (match_operand:QI 2
+                                    "extend_reg_or_0_operand" "rU"))
+                           (const_int 2) (const_int 0))
+          (zero_extract:QI (not:QI (match_dup 2)) (const_int 2) (const_int 2))
+          (zero_extract:QI (not:QI (match_dup 2)) (const_int 2) (const_int 4))
+          (zero_extract:QI (not:QI (match_dup 2))
+                           (const_int 2) (const_int 6))])))]
    "TARGET_SHMEDIA && ! TARGET_LITTLE_ENDIAN"
    "mperm.w     %1, %N2, %0"
    [(set_attr "type" "arith_media")])
@@ -10158,7 +10275,7 @@
         (vec_select:V4HI
          (vec_concat:V4HI (match_operand:V2HI 1 "extend_reg_or_0_operand" "rU")
                           (match_operand:V2HI 2 "extend_reg_or_0_operand" "rU"))
-        (parallel [(const_int 0) (const_int 2) (const_int 1) (const_int 3)])))]
+        (parallel [(const_int 2) (const_int 0) (const_int 3) (const_int 1)])))]
    "TARGET_SHMEDIA"
    "mshflo.w    %N1, %N2, %0"
    [(set_attr "type" "arith_media")])
@@ -10512,14 +10629,15 @@
    rtx discratch = gen_reg_rtx (DImode);
    rtx last;
  
-  emit_insn (gen_adddi3z_media (discratch, operands[1],
-                               force_reg (SImode, GEN_INT (-1))));
-  emit_insn (gen_andcdi3 (discratch, discratch,
-                         simplify_gen_subreg (DImode, operands[1],
-                                              SImode, 0)));
+  emit_insn (gen_adddi3 (discratch,
+                        simplify_gen_subreg (DImode, operands[1], SImode, 0),
+                        GEN_INT (-1)));
+  emit_insn (gen_andcdi3 (discratch,
+                         simplify_gen_subreg (DImode, operands[1], SImode, 0),
+                         discratch));
    emit_insn (gen_nsbsi (scratch, discratch));
    last = emit_insn (gen_subsi3 (operands[0],
-                               force_reg (SImode, GEN_INT (-64)), scratch));
+                               force_reg (SImode, GEN_INT (63)), scratch));
    REG_NOTES (last)
      = gen_rtx_EXPR_LIST (REG_EQUAL,
                          gen_rtx_FFS (SImode, operands[0]), REG_NOTES (last));
@@ -10575,18 +10693,22 @@
  
  (define_cpu_unit "f1_1,f1_2" "fpu_pipe")
  
-;; The floating point units.
+;; The floating point units (except FS - F2 always precedes it.)
  
-(define_cpu_unit "F1,F2,F3,FS" "fpu_pipe")
+(define_cpu_unit "F0,F1,F2,F3" "fpu_pipe")
  
  ;; This is basically the MA unit of SH4
  ;; used in LOAD/STORE pipeline.
  
  (define_cpu_unit "memory" "inst_pipeline")
  
+;; However, there are LS group insns that don't use it, even ones that
+;; complete in 0 cycles.  So we use an extra unit for the issue of LS insns.
+(define_cpu_unit "load_store" "inst_pipeline")
+
  ;; The address calculator used for branch instructions.
-;; This will be reserved with "issue" of branch instructions
-;; and this is to make sure that  no two branch instructions 
+;; This will be reserved after "issue" of branch instructions
+;; and this is to make sure that no two branch instructions 
  ;; can be issued in parallel. 
  
  (define_cpu_unit "pcr_addrcalc" "inst_pipeline")
@@ -10597,115 +10719,228 @@
  (define_reservation  "issue"  "pipe_01|pipe_02")
  
  ;; This is to express the locking of D stage.
+;; Note that the issue of a CO group insn also effectively locks the D stage.
  
  (define_reservation  "d_lock" "pipe_01+pipe_02")
  
+;; Every FE instruction but fipr / ftrv starts with issue and this.
+(define_reservation "F01" "F0+F1")
+
  ;; This is to simplify description where F1,F2,FS
  ;; are used simultaneously.
  
-(define_reservation "fpu" "F1+F2+FS")
+(define_reservation "fpu" "F1+F2")
  
  ;; This is to highlight the fact that f1 
  ;; cannot overlap with F1.
  
  (exclusion_set  "f1_1,f1_2" "F1")
  
+(define_insn_reservation "nil" 0 (eq_attr "type" "nil") "nothing")
+
  ;; Although reg moves have a latency of zero 
  ;; we need to highlight that they use D stage
  ;; for one cycle.
  
+;; Group:      MT
+
  (define_insn_reservation "reg_mov" 0
-               (eq_attr "type" "move,fmove")
-              "issue")
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "move"))
+  "issue")
+
+;; Group:      LS
  
-;; Other MT  group intructions(1 step operations)
+(define_insn_reservation "freg_mov" 0
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "fmove"))
+  "issue+load_store")
+
+;; We don't model all pipeline stages; we model the issue ('D') stage
+;; inasmuch as we allow only two instructions to issue simultaneously,
+;; and CO instructions prevent any simultaneous issue of another instruction.
+;; (This uses pipe_01 and pipe_02).
+;; Double issue of EX insns is prevented by using the int unit in the EX stage.
+;; Double issue of EX / BR insns is prevented by using the int unit /
+;; pcr_addrcalc unit in the EX stage.
+;; Double issue of BR / LS instructions is prevented by using the
+;; pcr_addrcalc / load_store unit in the issue cycle.
+;; Double issue of FE instructions is prevented by using F0 in the first
+;; pipeline stage after the first D stage.
+;; There is no need to describe the [ES]X / [MN]A / S stages after a D stage
+;; (except in the cases outlined above), nor to describe the FS stage after
+;; the F2 stage.
+
+;; Other MT  group instructions(1 step operations)
  ;; Group:      MT
  ;; Latency:    1
  ;; Issue Rate:         1
  
  (define_insn_reservation "mt" 1
-                      (eq_attr "insn_class" "mt_group")
-                      "issue,nothing")
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "mt_group"))
+  "issue")
  
  ;; Fixed Point Arithmetic Instructions(1 step operations)
  ;; Group:      EX
  ;; Latency:    1
  ;; Issue Rate:         1
  
-(define_insn_reservation "simple_arith" 1 
-            (eq_attr "insn_class" "ex_group")
-            "issue,int")
+(define_insn_reservation "sh4_simple_arith" 1 
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "insn_class" "ex_group"))
+  "issue,int")
+
+;; Load and store instructions have no alignment peculiarities for the SH4,
+;; but they use the load-store unit, which they share with the fmove type
+;; insns (fldi[01]; fmov frn,frm; flds; fsts; fabs; fneg) .
+;; Loads have a latency of two.
+;; However, call insns can only be paired with a preceding insn, and have
+;; a delay slot, so that we want two more insns to be scheduled between the
+;; load of the function address and the call.  This is equivalent to a
+;; latency of three.
+;; ADJUST_COST can only properly handle reductions of the cost, so we
+;; use a latency of three here, which gets multiplied by 10 to yield 30.
+;; We only do this for SImode loads of general registers, to make the work
+;; for ADJUST_COST easier.
  
  ;; Load Store instructions. (MOV.[BWL]@(d,GBR)
  ;; Group:      LS
  ;; Latency:    2
  ;; Issue Rate:         1
  
-(define_insn_reservation "load_store" 2
-       (eq_attr "type" "load,load_si,pcload,pcload_si,store")
-       "issue,memory*2")
+(define_insn_reservation "sh4_load" 2
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "load,pcload"))
+  "issue+load_store,nothing,memory")
+
+;; calls / sfuncs need an extra instruction for their delay slot.
+;; Moreover, estimating the latency for SImode loads as 3 will also allow
+;; adjust_cost to meaningfully bump it back up to 3 if they load the shift
+;; count of a dynamic shift.
+(define_insn_reservation "sh4_load_si" 3
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "load_si,pcload_si"))
+  "issue+load_store,nothing,memory")
+
+;; (define_bypass 2 "sh4_load_si" "!sh4_call")
+
+;; The load latency is raised to three if the dependent insn does
+;; double precision computation.  We want the 'default' latency to reflect
+;; that increased latency because otherwise the insn priorities won't
+;; allow proper scheduling.
+(define_insn_reservation "sh4_fload" 3
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "fload,pcfload"))
+  "issue+load_store,nothing,memory")
+
+;; (define_bypass 2 "sh4_fload" "!")
+
+(define_insn_reservation "sh4_store" 1
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "store"))
+  "issue+load_store,nothing,memory")
+
+;; Load Store instructions.
+;; Group:      LS
+;; Latency:    1
+;; Issue Rate:         1
+
+(define_insn_reservation "sh4_gp_fpul" 1
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "gp_fpul"))
+  "issue+load_store")
+
+;; Load Store instructions.
+;; Group:      LS
+;; Latency:    3
+;; Issue Rate:         1
+
+(define_insn_reservation "sh4_fpul_gp" 3
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "fpul_gp"))
+  "issue+load_store")
  
  ;; Branch (BF,BF/S,BT,BT/S,BRA)
  ;; Group:      BR
-;; Latency:    2 (or 1) Actually Observed to be 5/7
+;; Latency when taken:         2 (or 1)
  ;; Issue Rate:         1
  ;; The latency is 1 when displacement is 0.
-;; This reservation can be further broken into 2
-;;    1. branch_zero : One with latency 1 and in the TEST 
-;;       part it also checks for 0 (ZERO) displacement 
-;;    2. branch: Latency 2.
+;; We can't really do much with the latency, even if we could express it,
+;; but the pairing restrictions are useful to take into account.
+;; ??? If the branch is likely, we might want to fill the delay slot;
+;; if the branch is likely, but not very likely, should we pretend to use
+;; a resource that CO instructions use, to get a pairable delay slot insn?
  
-(define_insn_reservation "branch_zero"  5
-             (and (eq_attr "type" "cbranch")
-                 (eq_attr "length" "2"))
-             "(issue+pcr_addrcalc),pcr_addrcalc,nothing")
-
-(define_insn_reservation "branch"  7
-             (eq_attr "type" "cbranch")
-             "(issue+pcr_addrcalc),pcr_addrcalc,nothing")
+(define_insn_reservation "sh4_branch"  1
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "cbranch,jump"))
+  "issue+pcr_addrcalc")
  
  ;; Branch Far (JMP,RTS,BRAF)
  ;; Group:      CO
  ;; Latency:    3
  ;; Issue Rate:         2
-;;    Since issue stage (D stage) is blocked for 2nd cycle, 
-;;    cpu_unit  int  is reserved since it might be required for far
-;;    address calculation.
+;; ??? Scheduling happens before branch shortening, and hence jmp and braf
+;; can't be distinguished from bra for the "jump" pattern.
  
-(define_insn_reservation "branch_far" 12
-         (and (eq_attr "type" "jump,return")
-             (eq_attr "length" "6"))
-         "d_lock*2,int+pcr_addrcalc,pcr_addrcalc")
+(define_insn_reservation "sh4_return" 3
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "return,jump_ind"))
+         "d_lock*2")
  
  ;; RTE
  ;; Group:      CO
-;; atency:     5
+;; Latency:    5
  ;; Issue Rate:         5
  ;; this instruction can be executed in any of the pipelines 
  ;; and blocks the pipeline for next 4 stages.
  
-(define_insn_reservation "return_from_exp" 5
-          (eq_attr "type" "rte")
-         "(issue+pcr_addrcalc),d_lock*4,int+pcr_addrcalc,nothing")
+(define_insn_reservation "sh4_return_from_exp" 5
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "rte"))
+  "d_lock*5")
  
  ;; OCBP, OCBWB
  ;; Group:      CO
-;; Latency:    5
+;; Latency:    1-5
  ;; Issue Rate:         1
  
-(define_insn_reservation "ocbwb"  5
-          (eq_attr "insn_class" "cwb") 
-          "issue,(int+memory),memory*5")
+;; cwb is used for the sequence ocbwb @%0; extu.w %0,%2; or %1,%2; mov.l %0,@%2
+;; ocbwb on its own would be "d_lock,nothing,memory*5"
+(define_insn_reservation "ocbwb"  6
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "cwb"))
+  "d_lock*2,(d_lock+memory)*3,issue+load_store+memory,memory*2")
                 
  ;; LDS to PR,JSR
  ;; Group:      CO
  ;; Latency:    3
  ;; Issue Rate:         2
  ;; The SX stage is blocked for last 2 cycles.
+;; OTOH, the only time that has an effect for insns generated by the compiler
+;; is when lds to PR is followed by sts from PR - and that is highly unlikely -
+;; or when we are doing a function call - and we don't do inter-function
+;; scheduling.  For the function call case, it's really best that we end with
+;; something that models an rts.
  
-(define_insn_reservation "lds_to_pr" 3 
-          (eq_attr "type" "prset,call,sfunc") 
-          "(issue+pcr_addrcalc),(issue+int+pcr_addrcalc),(int+pcr_addrcalc)*2")
+(define_insn_reservation "sh4_lds_to_pr" 3 
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "prset") )
+  "d_lock*2")
+
+;; calls introduce a longish delay that is likely to flush the pipelines
+;; of the caller's instructions.  Ordinary functions tend to end with a
+;; load to restore a register (in the delay slot of rts), while sfuncs
+;; tend to end with an EX or MT insn.  But that is not actually relevant,
+;; since there are no instructions that contend for memory access early.
+;; We could, of course, provide exact scheduling information for specific
+;; sfuncs, if that should prove useful.
+
+(define_insn_reservation "sh4_call" 16 
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "call,sfunc"))
+  "d_lock*16")
  
  ;; LDS.L to PR 
  ;; Group:      CO
@@ -10714,8 +10949,9 @@
  ;; The SX unit is blocked for last 2 cycles.
   
  (define_insn_reservation "ldsmem_to_pr"  3
-      (eq_attr "type" "pload") 
-     "(issue+pcr_addrcalc),(issue+int+pcr_addrcalc),(int+memory+pcr_addrcalc),(int+pcr_addrcalc)")
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "pload"))
+  "d_lock*2")
  
  ;; STS from PR
  ;; Group:      CO
@@ -10724,17 +10960,19 @@
  ;; The SX unit in second and third cycles.
  
  (define_insn_reservation "sts_from_pr" 2
-        (eq_attr "type" "prget")
-       "(issue+pcr_addrcalc),(pipe_01+int+pcr_addrcalc),(int+pcr_addrcalc),nothing")
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "prget"))
+  "d_lock*2")
  
  ;; STS.L from PR
  ;; Group:      CO
  ;; Latency:    2
  ;; Issue Rate:         2
  
-(define_insn_reservation "prload_mem" 2 
-          (eq_attr "type" "pstore")
-           "(issue+pcr_addrcalc),(pipe_01+int+pcr_addrcalc),(int+memory+pcr_addrcalc),memory")
+(define_insn_reservation "sh4_prstore_mem" 2 
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "pstore"))
+  "d_lock*2,nothing,memory")
  
  ;; LDS to FPSCR
  ;; Group:      CO
@@ -10742,9 +10980,10 @@
  ;; Issue Rate:         1
  ;; F1 is blocked for last three cycles. 
  
-(define_insn_reservation "fpscr_store" 4
-        (eq_attr "insn_class" "lds_to_fpscr")
-       "issue,int,F1*3")
+(define_insn_reservation "fpscr_load" 4
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "gp_fpscr"))
+  "d_lock,nothing,F1*3")
  
  ;; LDS.L to FPSCR
  ;; Group:      CO
@@ -10753,9 +10992,10 @@
  ;; Issue Rate:         1
  ;; F1 is blocked for last three cycles.
  
-(define_insn_reservation "fpscr_store_mem" 4
-        (eq_attr "insn_class"  "ldsmem_to_fpscr") 
-        "issue,(int+memory),(F1+memory),F1*2")
+(define_insn_reservation "fpscr_load_mem" 4
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type"  "mem_fpscr"))
+  "d_lock,nothing,(F1+memory),F1*2")
  
  \f
  ;; Fixed point multiplication (DMULS.L DMULU.L MUL.L MULS.W,MULU.W)
@@ -10764,28 +11004,49 @@
  ;; Issue Rate:         1
  
  (define_insn_reservation "multi" 4
-       (eq_attr "type" "smpy,dmpy")
-       "issue,(issue+int+f1_1),(int+f1_1),(f1_1|f1_2)*2,F2,FS")
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "smpy,dmpy"))
+  "d_lock,(d_lock+f1_1),(f1_1|f1_2)*3,F2")
+
+;; Fixed STS from MACL / MACH
+;; Group:      CO
+;; Latency:    3
+;; Issue Rate:         1
+
+(define_insn_reservation "sh4_mac_gp" 3
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "mac_gp"))
+  "d_lock")
  
  
  ;; Single precision floating point computation FCMP/EQ,
-;; FCP/GT, FADD, FLOAT, FMAC, FMUL, FSUB, FTRC, FRVHG, FSCHG
+;; FCMP/GT, FADD, FLOAT, FMAC, FMUL, FSUB, FTRC, FRCHG, FSCHG
  ;; Group:      FE
-;; Latency:    4
+;; Latency:    3/4
  ;; Issue Rate:         1
  
-(define_insn_reservation "fp_arith"  4
-              (eq_attr "type" "fp")
-             "issue,F1,F2,FS")
+(define_insn_reservation "fp_arith"  3
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "fp"))
+  "issue,F01,F2")
+
+(define_insn_reservation "fp_arith_ftrc"  3
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "ftrc_s"))
+  "issue,F01,F2")
+
+(define_bypass 1 "fp_arith_ftrc" "sh4_fpul_gp")
  
  ;; Single Precision FDIV/SQRT
  ;; Group:      FE
-;; Latency:    12/13
+;; Latency:    12/13 (FDIV); 11/12 (FSQRT)
  ;; Issue Rate:         1
+;; We describe fdiv here; fsqrt is actually one cycle faster.
  
-(define_insn_reservation "fp_div" 13
-               (eq_attr "type" "fdiv")
-               "issue,F1+F3,F1+F2+F3,F3*7,F1+F3,F2,FS")
+(define_insn_reservation "fp_div" 12
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "fdiv"))
+  "issue,F01+F3,F2+F3,F3*7,F1+F3,F2")
  
  ;; Double Precision floating point computation
  ;; (FCNVDS, FCNVSD, FLOAT, FTRC)
@@ -10793,34 +11054,51 @@
  ;; Latency:    (3,4)/5
  ;; Issue Rate:         1
  
-(define_insn_reservation "dp_float" 5
-         (eq_attr "type" "dfp_conv")
-        "issue,F1,F1+F2,F2+FS,FS")
+(define_insn_reservation "dp_float" 4
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "dfp_conv"))
+  "issue,F01,F1+F2,F2")
  
-;; Double-precision floating-point (FADD ,FMUL,FSUB) 
+;; Double-precision floating-point (FADD,FMUL,FSUB) 
  ;; Group:      FE
  ;; Latency:    (7,8)/9
  ;; Issue Rate:         1
  
-(define_insn_reservation "fp_double_arith" 9
-        (eq_attr "type" "dfp_arith")
-       "issue,F1,F1+F2,fpu*4,F2+FS,FS")
+(define_insn_reservation "fp_double_arith" 8
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "dfp_arith"))
+  "issue,F01,F1+F2,fpu*4,F2")
  
  ;; Double-precision FCMP (FCMP/EQ,FCMP/GT) 
-;; Group:      FE
+;; Group:      CO
  ;; Latency:    3/5
  ;; Issue Rate:         2
  
-(define_insn_reservation "fp_double_cmp" 5 
-        (eq_attr "type" "dfp_cmp")
-       "issue,(issue+F1),F1+F2,F2+FS,FS")
+(define_insn_reservation "fp_double_cmp" 3 
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "dfp_cmp"))
+  "d_lock,(d_lock+F01),F1+F2,F2")
  
  ;; Double precision FDIV/SQRT
  ;; Group:      FE
  ;; Latency:    (24,25)/26
  ;; Issue Rate:         1
  
-(define_insn_reservation "dp_div" 26
-        (eq_attr "type" "dfdiv")
-       "issue,F1+F3,F1+F2+F3,F2+F3+FS,F3*16,F1+F3,F1+F2+F3,fpu+F3,F2+FS,FS")
+(define_insn_reservation "dp_div" 25
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "dfdiv"))
+  "issue,F01+F3,F1+F2+F3,F2+F3,F3*16,F1+F3,(fpu+F3)*2,F2")
+
  
+;; Use the branch-not-taken case to model arith3 insns.  For the branch taken
+;; case, we'd get a d_lock instead of issue at the end.
+(define_insn_reservation "arith3" 3
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "arith3"))
+  "issue,d_lock+pcr_addrcalc,issue")
+
+;; Insns of type arith3b schedule the same whether or not the branch is taken.
+(define_insn_reservation "arith3b" 2
+  (and (eq_attr "pipe_model" "sh4")
+       (eq_attr "type" "arith3b"))
+  "issue,d_lock+pcr_addrcalc")