* target-def.h (TARGET_ASM_OUTPUT_MI_THUNK): Default to NULL.

[pf3gnuchains/gcc-fork.git] / gcc / config / pa / pa.md
diff --git a/gcc/config/pa/pa.md b/gcc/config/pa/pa.md

index aa0c305..bbd44fa 100644 (file)
--- a/gcc/config/pa/pa.md
+++ b/gcc/config/pa/pa.md
@@ -44,7 +44,7 @@
  ;;
  ;; FIXME: Add 800 scheduling for completeness?
  
-(define_attr "cpu" "700,7100,7100LC,7200,8000" (const (symbol_ref "pa_cpu_attr")))
+(define_attr "cpu" "700,7100,7100LC,7200,7300,8000" (const (symbol_ref "pa_cpu_attr")))
  
  ;; Length (in # of bytes).
  (define_attr "length" ""
@@ -139,35 +139,10 @@
                        (const_int 0)))
    [(eq_attr "in_branch_delay" "true") (nil) (nil)])
  
-;; Function units of the HPPA. The following data is for the 700 CPUs
-;; (Mustang CPU + Timex FPU aka PA-89) because that's what I have the docs for.
-;; Scheduling instructions for PA-83 machines according to the Snake
-;; constraints shouldn't hurt.
-
-;; (define_function_unit {name} {num-units} {n-users} {test}
-;;                       {ready-delay} {issue-delay} [{conflict-list}])
-
-;; The integer ALU.
-;; (Noted only for documentation; units that take one cycle do not need to
-;; be specified.)
-
-;; (define_function_unit "alu" 1 0
-;;  (and (eq_attr "type" "unary,shift,nullshift,binary,move,address")
-;;      (eq_attr "cpu" "700"))
-;;  1 0)
-
-
  ;; Memory. Disregarding Cache misses, the Mustang memory times are:
  ;; load: 2, fpload: 3
  ;; store, fpstore: 3, no D-cache operations should be scheduled.
  
-(define_function_unit "pa700memory" 1 0
-  (and (eq_attr "type" "load,fpload")
-       (eq_attr "cpu" "700")) 2 0)
-(define_function_unit "pa700memory" 1 0 
-  (and (eq_attr "type" "store,fpstore")
-       (eq_attr "cpu" "700")) 3 3)
-
  ;; The Timex (aka 700) has two floating-point units: ALU, and MUL/DIV/SQRT.
  ;; Timings:
  ;; Instruction Time    Unit    Minimum Distance (unit contention)
@@ -186,46 +161,100 @@
  ;; fdiv,dbl    12      MPY     12
  ;; fsqrt,sgl   14      MPY     14
  ;; fsqrt,dbl   18      MPY     18
+;;
+;; We don't model fmpyadd/fmpysub properly as those instructions
+;; keep both the FP ALU and MPY units busy.  Given that these
+;; processors are obsolete, I'm not going to spend the time to
+;; model those instructions correctly.
+
+(define_automaton "pa700")
+(define_cpu_unit "dummy_700,mem_700,fpalu_700,fpmpy_700" "pa700")
  
-(define_function_unit "pa700fp_alu" 1 0
+(define_insn_reservation "W0" 4
    (and (eq_attr "type" "fpcc")
-       (eq_attr "cpu" "700")) 4 2)
-(define_function_unit "pa700fp_alu" 1 0
+       (eq_attr "cpu" "700"))
+  "fpalu_700*2")
+
+(define_insn_reservation "W1" 3
    (and (eq_attr "type" "fpalu")
-       (eq_attr "cpu" "700")) 3 2)
-(define_function_unit "pa700fp_mpy" 1 0
+       (eq_attr "cpu" "700"))
+  "fpalu_700*2")
+
+(define_insn_reservation "W2" 3
    (and (eq_attr "type" "fpmulsgl,fpmuldbl")
-       (eq_attr "cpu" "700")) 3 2)
-(define_function_unit "pa700fp_mpy" 1 0
+       (eq_attr "cpu" "700"))
+  "fpmpy_700*2")
+
+(define_insn_reservation "W3" 10
    (and (eq_attr "type" "fpdivsgl")
-       (eq_attr "cpu" "700")) 10 10)
-(define_function_unit "pa700fp_mpy" 1 0
+       (eq_attr "cpu" "700"))
+  "fpmpy_700*10")
+
+(define_insn_reservation "W4" 12
    (and (eq_attr "type" "fpdivdbl")
-       (eq_attr "cpu" "700")) 12 12)
-(define_function_unit "pa700fp_mpy" 1 0
+       (eq_attr "cpu" "700"))
+  "fpmpy_700*12")
+
+(define_insn_reservation "W5" 14
    (and (eq_attr "type" "fpsqrtsgl")
-       (eq_attr "cpu" "700")) 14 14)
-(define_function_unit "pa700fp_mpy" 1 0
+       (eq_attr "cpu" "700"))
+  "fpmpy_700*14")
+
+(define_insn_reservation "W6" 18
    (and (eq_attr "type" "fpsqrtdbl")
-       (eq_attr "cpu" "700")) 18 18)
+       (eq_attr "cpu" "700"))
+  "fpmpy_700*18")
+
+(define_insn_reservation "W7" 2
+  (and (eq_attr "type" "load")
+       (eq_attr "cpu" "700"))
+  "mem_700")
+
+(define_insn_reservation "W8" 2
+  (and (eq_attr "type" "fpload")
+       (eq_attr "cpu" "700"))
+  "mem_700")
+
+(define_insn_reservation "W9" 3
+  (and (eq_attr "type" "store")
+       (eq_attr "cpu" "700"))
+  "mem_700*3")
+
+(define_insn_reservation "W10" 3
+  (and (eq_attr "type" "fpstore")
+       (eq_attr "cpu" "700"))
+  "mem_700*3")
+
+(define_insn_reservation "W11" 1
+  (and (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpdivdbl,fpsqrtsgl,fpsqrtdbl,load,fpload,store,fpstore")
+       (eq_attr "cpu" "700"))
+  "dummy_700")
+
+;; We have a bypass for all computations in the FP unit which feed an
+;; FP store as long as the sizes are the same.
+(define_bypass 2 "W1,W2" "W10" "hppa_fpstore_bypass_p")
+(define_bypass 9 "W3" "W10" "hppa_fpstore_bypass_p")
+(define_bypass 11 "W4" "W10" "hppa_fpstore_bypass_p")
+(define_bypass 13 "W5" "W10" "hppa_fpstore_bypass_p")
+(define_bypass 17 "W6" "W10" "hppa_fpstore_bypass_p")
+
+;; We have an "anti-bypass" for FP loads which feed an FP store.
+(define_bypass 4 "W8" "W10" "hppa_fpstore_bypass_p")
  
  ;; Function units for the 7100 and 7150.  The 7100/7150 can dual-issue
  ;; floating point computations with non-floating point computations (fp loads
  ;; and stores are not fp computations).
  ;;
-
  ;; Memory. Disregarding Cache misses, memory loads take two cycles; stores also
  ;; take two cycles, during which no Dcache operations should be scheduled.
  ;; Any special cases are handled in pa_adjust_cost.  The 7100, 7150 and 7100LC
  ;; all have the same memory characteristics if one disregards cache misses.
-(define_function_unit "pa7100memory" 1 0
-  (and (eq_attr "type" "load,fpload")
-       (eq_attr "cpu" "7100,7100LC")) 2 0)
-(define_function_unit "pa7100memory" 1 0 
-  (and (eq_attr "type" "store,fpstore")
-       (eq_attr "cpu" "7100,7100LC")) 2 2)
-
+;;
  ;; The 7100/7150 has three floating-point units: ALU, MUL, and DIV.
+;; There's no value in modeling the ALU and MUL separately though
+;; since there can never be a functional unit conflict given the
+;; latency and issue rates for those units.
+;;
  ;; Timings:
  ;; Instruction Time    Unit    Minimum Distance (unit contention)
  ;; fcpy                2       ALU     1
@@ -244,40 +273,64 @@
  ;; fsqrt,sgl   8       DIV     8
  ;; fsqrt,dbl   15      DIV     15
  
-(define_function_unit "pa7100fp_alu" 1 0
-  (and (eq_attr "type" "fpcc,fpalu")
-       (eq_attr "cpu" "7100")) 2 1)
-(define_function_unit "pa7100fp_mpy" 1 0
-  (and (eq_attr "type" "fpmulsgl,fpmuldbl")
-       (eq_attr "cpu" "7100")) 2 1)
-(define_function_unit "pa7100fp_div" 1 0
-  (and (eq_attr "type" "fpdivsgl,fpsqrtsgl")
-       (eq_attr "cpu" "7100")) 8 8)
-(define_function_unit "pa7100fp_div" 1 0
-  (and (eq_attr "type" "fpdivdbl,fpsqrtdbl")
-       (eq_attr "cpu" "7100")) 15 15)
+(define_automaton "pa7100")
+(define_cpu_unit "i_7100, f_7100,fpmac_7100,fpdivsqrt_7100,mem_7100" "pa7100")
  
-;; To encourage dual issue we define function units corresponding to
-;; the instructions which can be dual issued.    This is a rather crude
-;; approximation, the "pa7100nonflop" test in particular could be refined.
-(define_function_unit "pa7100flop" 1 1
-  (and
-    (eq_attr "type" "fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl")
-    (eq_attr "cpu" "7100")) 1 1)
-
-(define_function_unit "pa7100nonflop" 1 1
-  (and
-    (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl")
-    (eq_attr "cpu" "7100")) 1 1)
+(define_insn_reservation "X0" 2
+  (and (eq_attr "type" "fpcc,fpalu,fpmulsgl,fpmuldbl")
+       (eq_attr "cpu" "7100"))
+  "f_7100,fpmac_7100")
  
+(define_insn_reservation "X1" 8
+  (and (eq_attr "type" "fpdivsgl,fpsqrtsgl")
+       (eq_attr "cpu" "7100"))
+  "f_7100+fpdivsqrt_7100,fpdivsqrt_7100*7")
  
-;; Memory subsystem works just like 7100/7150 (except for cache miss times which
-;; we don't model here).  
+(define_insn_reservation "X2" 15
+  (and (eq_attr "type" "fpdivdbl,fpsqrtdbl")
+       (eq_attr "cpu" "7100"))
+  "f_7100+fpdivsqrt_7100,fpdivsqrt_7100*14")
+
+(define_insn_reservation "X3" 2
+  (and (eq_attr "type" "load")
+       (eq_attr "cpu" "7100"))
+  "i_7100+mem_7100")
+
+(define_insn_reservation "X4" 2
+  (and (eq_attr "type" "fpload")
+       (eq_attr "cpu" "7100"))
+  "i_7100+mem_7100")
+
+(define_insn_reservation "X5" 2
+  (and (eq_attr "type" "store")
+       (eq_attr "cpu" "7100"))
+  "i_7100+mem_7100,mem_7100")
+
+(define_insn_reservation "X6" 2
+  (and (eq_attr "type" "fpstore")
+       (eq_attr "cpu" "7100"))
+  "i_7100+mem_7100,mem_7100")
+
+(define_insn_reservation "X7" 1
+  (and (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl,load,fpload,store,fpstore")
+       (eq_attr "cpu" "7100"))
+  "i_7100")
+
+;; We have a bypass for all computations in the FP unit which feed an
+;; FP store as long as the sizes are the same.
+(define_bypass 1 "X0" "X6" "hppa_fpstore_bypass_p")
+(define_bypass 7 "X1" "X6" "hppa_fpstore_bypass_p")
+(define_bypass 14 "X2" "X6" "hppa_fpstore_bypass_p")
+
+;; We have an "anti-bypass" for FP loads which feed an FP store.
+(define_bypass 3 "X4" "X6" "hppa_fpstore_bypass_p")
  
  ;; The 7100LC has three floating-point units: ALU, MUL, and DIV.
-;; Note divides and sqrt flops lock the cpu until the flop is
-;; finished.  fmpy and xmpyu (fmpyi) lock the cpu for one cycle.
-;; There's no way to avoid the penalty.
+;; There's no value in modeling the ALU and MUL separately though
+;; since there can never be a functional unit conflict that
+;; can be avoided given the latency, issue rates and mandatory
+;; one cycle cpu-wide lock for a double precision fp multiply.
+;;
  ;; Timings:
  ;; Instruction Time    Unit    Minimum Distance (unit contention)
  ;; fcpy                2       ALU     1
@@ -299,106 +352,199 @@
  ;; fdiv,dbl    15      DIV     15
  ;; fsqrt,sgl   8       DIV     8
  ;; fsqrt,dbl   15      DIV     15
+;;
+;; The PA7200 is just like the PA7100LC except that there is
+;; no store-store penalty.
+;;
+;; The PA7300 is just like the PA7200 except that there is
+;; no store-load penalty.
+;;
+;; Note there are some aspects of the 7100LC we are not modeling
+;; at the moment.  I'll be reviewing the 7100LC scheduling info
+;; shortly and updating this description.
+;;
+;;   load-load pairs
+;;   store-store pairs
+;;   other issue modeling
+
+(define_automaton "pa7100lc")
+(define_cpu_unit "i0_7100lc, i1_7100lc, f_7100lc" "pa7100lc")
+(define_cpu_unit "fpmac_7100lc" "pa7100lc")
+(define_cpu_unit "mem_7100lc" "pa7100lc")
+
+;; Double precision multiplies lock the entire CPU for one
+;; cycle.  There is no way to avoid this lock and trying to
+;; schedule around the lock is pointless and thus there is no
+;; value in trying to model this lock.
+;;
+;; Not modeling the lock allows us to treat fp multiplies just
+;; like any other FP alu instruction.  It allows for a smaller
+;; DFA and may reduce register pressure.
+(define_insn_reservation "Y0" 2
+  (and (eq_attr "type" "fpcc,fpalu,fpmulsgl,fpmuldbl")
+       (eq_attr "cpu" "7100LC,7200,7300"))
+  "f_7100lc,fpmac_7100lc")
+
+;; fp division and sqrt instructions lock the entire CPU for
+;; 7 cycles (single precision) or 14 cycles (double precision).
+;; There is no way to avoid this lock and trying to schedule
+;; around the lock is pointless and thus there is no value in
+;; trying to model this lock.  Not modeling the lock allows
+;; for a smaller DFA and may reduce register pressure.
+(define_insn_reservation "Y1" 1
+  (and (eq_attr "type" "fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl")
+       (eq_attr "cpu" "7100LC,7200,7300"))
+  "f_7100lc")
+
+(define_insn_reservation "Y2" 2
+  (and (eq_attr "type" "load")
+       (eq_attr "cpu" "7100LC,7200,7300"))
+  "i1_7100lc+mem_7100lc")
+
+(define_insn_reservation "Y3" 2
+  (and (eq_attr "type" "fpload")
+       (eq_attr "cpu" "7100LC,7200,7300"))
+  "i1_7100lc+mem_7100lc")
+
+(define_insn_reservation "Y4" 2
+  (and (eq_attr "type" "store")
+       (eq_attr "cpu" "7100LC"))
+  "i1_7100lc+mem_7100lc,mem_7100lc")
+
+(define_insn_reservation "Y5" 2
+  (and (eq_attr "type" "fpstore")
+       (eq_attr "cpu" "7100LC"))
+  "i1_7100lc+mem_7100lc,mem_7100lc")
+
+(define_insn_reservation "Y6" 1
+  (and (eq_attr "type" "shift,nullshift")
+       (eq_attr "cpu" "7100LC,7200,7300"))
+  "i1_7100lc")
+
+(define_insn_reservation "Y7" 1
+  (and (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl,load,fpload,store,fpstore,shift,nullshift")
+       (eq_attr "cpu" "7100LC,7200,7300"))
+  "(i0_7100lc|i1_7100lc)")
+
+;; The 7200 has a store-load penalty
+(define_insn_reservation "Y8" 2
+  (and (eq_attr "type" "store")
+       (eq_attr "cpu" "7200"))
+  "i1_7100lc,mem_7100lc")
+
+(define_insn_reservation "Y9" 2
+  (and (eq_attr "type" "fpstore")
+       (eq_attr "cpu" "7200"))
+  "i1_7100lc,mem_7100lc")
+
+;; The 7300 has no penalty for store-store or store-load
+(define_insn_reservation "Y10" 2
+  (and (eq_attr "type" "store")
+       (eq_attr "cpu" "7300"))
+  "i1_7100lc")
+
+(define_insn_reservation "Y11" 2
+  (and (eq_attr "type" "fpstore")
+       (eq_attr "cpu" "7300"))
+  "i1_7100lc")
+
+;; We have an "anti-bypass" for FP loads which feed an FP store.
+(define_bypass 3 "Y3" "Y5,Y9,Y11" "hppa_fpstore_bypass_p")
  
-(define_function_unit "pa7100LCfp_alu" 1 0
-  (and (eq_attr "type" "fpcc,fpalu")
-       (eq_attr "cpu" "7100LC,7200")) 2 1)
-(define_function_unit "pa7100LCfp_mpy" 1 0
-  (and (eq_attr "type" "fpmulsgl")
-       (eq_attr "cpu" "7100LC,7200")) 2 1)
-(define_function_unit "pa7100LCfp_mpy" 1 0
-  (and (eq_attr "type" "fpmuldbl")
-       (eq_attr "cpu" "7100LC,7200")) 3 2)
-(define_function_unit "pa7100LCfp_div" 1 0
-  (and (eq_attr "type" "fpdivsgl,fpsqrtsgl")
-       (eq_attr "cpu" "7100LC,7200")) 8 8)
-(define_function_unit "pa7100LCfp_div" 1 0
-  (and (eq_attr "type" "fpdivdbl,fpsqrtdbl")
-       (eq_attr "cpu" "7100LC,7200")) 15 15)
-
-;; Define the various functional units for dual-issue.
-
-;; There's only one floating point unit.
-(define_function_unit "pa7100LCflop" 1 1
-  (and
-    (eq_attr "type" "fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl")
-    (eq_attr "cpu" "7100LC,7200")) 1 1)
-
-;; Shifts and memory ops execute in only one of the integer ALUs
-(define_function_unit "pa7100LCshiftmem" 1 1
-  (and
-    (eq_attr "type" "shift,nullshift,load,fpload,store,fpstore")
-    (eq_attr "cpu" "7100LC,7200")) 1 1)
-
-;; We have two basic ALUs.
-(define_function_unit "pa7100LCalu" 2 1
-  (and
-    (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl")
-   (eq_attr "cpu" "7100LC,7200")) 1 1)
-
-;; I don't have complete information on the PA7200; however, most of
-;; what I've heard makes it look like a 7100LC without the store-store
-;; penalty.  So that's how we'll model it.
-
-;; Memory. Disregarding Cache misses, memory loads and stores take
-;; two cycles.  Any special cases are handled in pa_adjust_cost.
-(define_function_unit "pa7200memory" 1 0
-  (and (eq_attr "type" "load,fpload,store,fpstore")
-       (eq_attr "cpu" "7200")) 2 0)
-
-;; I don't have detailed information on the PA7200 FP pipeline, so I
-;; treat it just like the 7100LC pipeline.
-;; Similarly for the multi-issue fake units.
-
-;; 
  ;; Scheduling for the PA8000 is somewhat different than scheduling for a
  ;; traditional architecture.
  ;;
  ;; The PA8000 has a large (56) entry reorder buffer that is split between
  ;; memory and non-memory operations.
  ;;
-;; The PA800 can issue two memory and two non-memory operations per cycle to
-;; the function units.  Similarly, the PA8000 can retire two memory and two
-;; non-memory operations per cycle.
+;; The PA8000 can issue two memory and two non-memory operations per cycle to
+;; the function units, with the exception of branches and multi-output
+;; instructions.  The PA8000 can retire two non-memory operations per cycle
+;; and two memory operations per cycle, only one of which may be a store.
  ;;
  ;; Given the large reorder buffer, the processor can hide most latencies.
  ;; According to HP, they've got the best results by scheduling for retirement
  ;; bandwidth with limited latency scheduling for floating point operations.
  ;; Latency for integer operations and memory references is ignored.
  ;;
-;; We claim floating point operations have a 2 cycle latency and are
-;; fully pipelined, except for div and sqrt which are not pipelined.
  ;;
-;; It is not necessary to define the shifter and integer alu units.
+;; We claim floating point operations have a 2 cycle latency and are
+;; fully pipelined, except for div and sqrt which are not pipelined and
+;; take from 17 to 31 cycles to complete.
  ;;
-;; These first two define_unit_unit descriptions model retirement from
-;; the reorder buffer.
-(define_function_unit "pa8000lsu" 2 1
+;; It's worth noting that there is no way to saturate all the functional
+;; units on the PA8000 as there is not enough issue bandwidth.
+
+(define_automaton "pa8000")
+(define_cpu_unit "inm0_8000, inm1_8000, im0_8000, im1_8000" "pa8000")
+(define_cpu_unit "rnm0_8000, rnm1_8000, rm0_8000, rm1_8000" "pa8000")
+(define_cpu_unit "store_8000" "pa8000")
+(define_cpu_unit "f0_8000, f1_8000" "pa8000")
+(define_cpu_unit "fdivsqrt0_8000, fdivsqrt1_8000" "pa8000")
+(define_reservation "inm_8000" "inm0_8000 | inm1_8000")
+(define_reservation "im_8000" "im0_8000 | im1_8000")
+(define_reservation "rnm_8000" "rnm0_8000 | rnm1_8000")
+(define_reservation "rm_8000" "rm0_8000 | rm1_8000")
+(define_reservation "f_8000" "f0_8000 | f1_8000")
+(define_reservation "fdivsqrt_8000" "fdivsqrt0_8000 | fdivsqrt1_8000")
+
+;; We can issue any two memops per cycle, but we can only retire
+;; one memory store per cycle.  We assume that the reorder buffer
+;; will hide any memory latencies per HP's recommendation.
+(define_insn_reservation "Z0" 0
    (and
-    (eq_attr "type" "load,fpload,store,fpstore")
-    (eq_attr "cpu" "8000")) 1 1)
+    (eq_attr "type" "load,fpload")
+    (eq_attr "cpu" "8000"))
+  "im_8000,rm_8000")
  
-(define_function_unit "pa8000alu" 2 1
+(define_insn_reservation "Z1" 0
    (and
-    (eq_attr "type" "!load,fpload,store,fpstore")
-    (eq_attr "cpu" "8000")) 1 1)
-
-;; Claim floating point ops have a 2 cycle latency, excluding div and
-;; sqrt, which are not pipelined and issue to different units.
-(define_function_unit "pa8000fmac" 2 0
+    (eq_attr "type" "store,fpstore")
+    (eq_attr "cpu" "8000"))
+  "im_8000,rm_8000+store_8000")
+
+;; We can issue and retire two non-memory operations per cycle with
+;; a few exceptions (branches).  This group catches those we want
+;; to assume have zero latency.
+(define_insn_reservation "Z2" 0
    (and
-    (eq_attr "type" "fpcc,fpalu,fpmulsgl,fpmuldbl")
-    (eq_attr "cpu" "8000")) 2 1)
+    (eq_attr "type" "!load,fpload,store,fpstore,uncond_branch,branch,cbranch,fbranch,call,dyncall,multi,milli,parallel_branch,fpcc,fpalu,fpmulsgl,fpmuldbl,fpsqrtsgl,fpsqrtdbl,fpdivsgl,fpdivdbl")
+    (eq_attr "cpu" "8000"))
+  "inm_8000,rnm_8000")
  
-(define_function_unit "pa8000fdiv" 2 1
+;; Branches use both slots in the non-memory issue and
+;; retirement unit.
+(define_insn_reservation "Z3" 0
    (and
-    (eq_attr "type" "fpdivsgl,fpsqrtsgl")
-    (eq_attr "cpu" "8000")) 17 17)
+    (eq_attr "type" "uncond_branch,branch,cbranch,fbranch,call,dyncall,multi,milli,parallel_branch")
+    (eq_attr "cpu" "8000"))
+  "inm0_8000+inm1_8000,rnm0_8000+rnm1_8000")
+
+;; We only partially latency-schedule the floating point units.
+;; They can issue/retire two at a time in the non-memory
+;; units.  We fix their latency at 2 cycles and they
+;; are fully pipelined.
+(define_insn_reservation "Z4" 1
+ (and
+   (eq_attr "type" "fpcc,fpalu,fpmulsgl,fpmuldbl")
+   (eq_attr "cpu" "8000"))
+ "inm_8000,f_8000,rnm_8000")
+
+;; The fdivsqrt units are not pipelined and have a very long latency.  
+;; To keep the DFA from exploding, we do not show all the
+;; reservations for the divsqrt unit.
+(define_insn_reservation "Z5" 17
+ (and
+   (eq_attr "type" "fpdivsgl,fpsqrtsgl")
+   (eq_attr "cpu" "8000"))
+ "inm_8000,fdivsqrt_8000*6,rnm_8000")
+
+(define_insn_reservation "Z6" 31
+ (and
+   (eq_attr "type" "fpdivdbl,fpsqrtdbl")
+   (eq_attr "cpu" "8000"))
+ "inm_8000,fdivsqrt_8000*6,rnm_8000")
  
-(define_function_unit "pa8000fdiv" 2 1
-  (and
-    (eq_attr "type" "fpdivdbl,fpsqrtdbl")
-    (eq_attr "cpu" "8000")) 31 31)
  
  \f
  ;; Compare instructions.
@@ -482,6 +628,26 @@
    [(set_attr "length" "4")
     (set_attr "type" "fpcc")])
  
+;; The following two patterns are optimization placeholders.  In almost
+;; all cases, the user of the condition code will be simplified and the
+;; original condition code setting insn should be eliminated.
+
+(define_insn "*setccfp0"
+  [(set (reg:CCFP 0)
+       (const_int 0))]
+  "! TARGET_SOFT_FLOAT"
+  "fcmp,dbl,!= %%fr0,%%fr0"
+  [(set_attr "length" "4")
+   (set_attr "type" "fpcc")])
+
+(define_insn "*setccfp1"
+  [(set (reg:CCFP 0)
+       (const_int 1))]
+  "! TARGET_SOFT_FLOAT"
+  "fcmp,dbl,= %%fr0,%%fr0"
+  [(set_attr "length" "4")
+   (set_attr "type" "fpcc")])
+
  ;; scc insns.
  
  (define_expand "seq"
@@ -2281,17 +2447,19 @@
    ""
    "*
  {
-  rtx label_rtx = gen_label_rtx ();
    rtx xoperands[3];
    extern FILE *asm_out_file;
  
    xoperands[0] = operands[0];
    xoperands[1] = operands[1];
-  xoperands[2] = label_rtx;
+  if (TARGET_SOM || ! TARGET_GAS)
+    xoperands[2] = gen_label_rtx ();
+
    output_asm_insn (\"{bl|b,l} .+8,%0\", xoperands);
    output_asm_insn (\"{depi|depwi} 0,31,2,%0\", xoperands);
-  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, \"L\",
-                            CODE_LABEL_NUMBER (label_rtx));
+  if (TARGET_SOM || ! TARGET_GAS)
+    ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, \"L\",
+                              CODE_LABEL_NUMBER (xoperands[2]));
  
    /* If we're trying to load the address of a label that happens to be
       close, then we can use a shorter sequence.  */
@@ -2302,12 +2470,24 @@
      {
        /* Prefixing with R% here is wrong, it extracts just 11 bits and is
          always non-negative.  */
-      output_asm_insn (\"ldo %1-%2(%0),%0\", xoperands);
+      if (TARGET_SOM || ! TARGET_GAS)
+       output_asm_insn (\"ldo %1-%2(%0),%0\", xoperands);
+      else
+       output_asm_insn (\"ldo %1-$PIC_pcrel$0+8(%0),%0\", xoperands);
      }
    else
      {
-      output_asm_insn (\"addil L%%%1-%2,%0\", xoperands);
-      output_asm_insn (\"ldo R%%%1-%2(%0),%0\", xoperands);
+      if (TARGET_SOM || ! TARGET_GAS)
+       {
+         output_asm_insn (\"addil L%%%1-%2,%0\", xoperands);
+         output_asm_insn (\"ldo R%%%1-%2(%0),%0\", xoperands);
+       }
+      else
+       {
+         output_asm_insn (\"addil L%%%1-$PIC_pcrel$0+8,%0\", xoperands);
+         output_asm_insn (\"ldo R%%%1-$PIC_pcrel$0+12(%0),%0\",
+                          xoperands);
+       }
      }
    return \"\";
  }"
@@ -3653,22 +3833,14 @@
  (define_expand "adddi3"
    [(set (match_operand:DI 0 "register_operand" "")
         (plus:DI (match_operand:DI 1 "register_operand" "")
-                (match_operand:DI 2 "arith_operand" "")))]
+                (match_operand:DI 2 "adddi3_operand" "")))]
    ""
    "")
  
-;; We allow arith_operand for operands2, even though strictly speaking it
-;; we would prefer to us arith11_operand since that's what the hardware
-;; can actually support.
-;;
-;; But the price of the extra reload in that case is worth the simplicity
-;; we get by allowing a trivial adddi3 expander to be used for both
-;; PA64 and PA32.
-
  (define_insn ""
    [(set (match_operand:DI 0 "register_operand" "=r")
         (plus:DI (match_operand:DI 1 "register_operand" "%r")
-                (match_operand:DI 2 "arith_operand" "rI")))]
+                (match_operand:DI 2 "arith11_operand" "rI")))]
    "!TARGET_64BIT"
    "*
  {
@@ -4636,7 +4808,8 @@
      emit_insn (gen_negdf2_fast (operands[0], operands[1]));
    else
      {
-      operands[2] = force_reg (DFmode, immed_real_const_1 (dconstm1, DFmode));
+      operands[2] = force_reg (DFmode,
+       CONST_DOUBLE_FROM_REAL_VALUE (dconstm1, DFmode));
        emit_insn (gen_muldf3 (operands[0], operands[1], operands[2]));
      }
    DONE;
@@ -4666,7 +4839,8 @@
      emit_insn (gen_negsf2_fast (operands[0], operands[1]));
    else
      {
-      operands[2] = force_reg (SFmode, immed_real_const_1 (dconstm1, SFmode));
+      operands[2] = force_reg (SFmode,
+       CONST_DOUBLE_FROM_REAL_VALUE (dconstm1, SFmode));
        emit_insn (gen_mulsf3 (operands[0], operands[1], operands[2]));
      }
    DONE;
@@ -5568,6 +5742,21 @@
    [(set_attr "type" "branch")
     (set_attr "length" "4")])
  
+;; Use the PIC register to ensure it's restored after a
+;; call in PIC mode.  This is used for eh returns which
+;; bypass the return stub.
+(define_insn "return_external_pic"
+  [(return)
+   (use (match_operand 0 "register_operand" "r"))
+   (use (reg:SI 2))
+   (clobber (reg:SI 1))]
+  "flag_pic
+   && current_function_calls_eh_return
+   && true_regnum (operands[0]) == PIC_OFFSET_TABLE_REGNUM"
+  "ldsid (%%sr0,%%r2),%%r1\;mtsp %%r1,%%sr0\;be%* 0(%%sr0,%%r2)"
+  [(set_attr "type" "branch")
+   (set_attr "length" "12")])
+
  (define_expand "prologue"
    [(const_int 0)]
    ""
@@ -5590,15 +5779,24 @@
    /* Try to use the trivial return first.  Else use the full
       epilogue.  */
    if (hppa_can_use_return_insn_p ())
-   emit_jump_insn (gen_return ());
+    emit_jump_insn (gen_return ());
    else
      {
        rtx x;
  
        hppa_expand_epilogue ();
        if (flag_pic)
-       x = gen_return_internal_pic (gen_rtx_REG (word_mode,
-                                                 PIC_OFFSET_TABLE_REGNUM));
+       {
+         rtx pic = gen_rtx_REG (word_mode, PIC_OFFSET_TABLE_REGNUM);
+
+         /* EH returns bypass the normal return stub.  Thus, we must do an
+            interspace branch to return from functions that call eh_return.
+            This is only a problem for returns from shared code.  */
+         if (current_function_calls_eh_return)
+           x = gen_return_external_pic (pic);
+         else
+           x = gen_return_internal_pic (pic);
+       }
        else
         x = gen_return_internal ();
        emit_jump_insn (x);
@@ -5687,14 +5885,23 @@
      {
        rtx xoperands[2];
        xoperands[0] = operands[0];
-      xoperands[1] = gen_label_rtx ();
+      if (TARGET_SOM || ! TARGET_GAS)
+       {
+         xoperands[1] = gen_label_rtx ();
  
-      output_asm_insn (\"{bl|b,l} .+8,%%r1\\n\\taddil L'%l0-%l1,%%r1\",
-                      xoperands);
-      ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, \"L\",
-                                 CODE_LABEL_NUMBER (xoperands[1]));
-      output_asm_insn (\"ldo R'%l0-%l1(%%r1),%%r1\\n\\tbv %%r0(%%r1)\",
-                      xoperands);
+         output_asm_insn (\"{bl|b,l} .+8,%%r1\\n\\taddil L'%l0-%l1,%%r1\",
+                          xoperands);
+         ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, \"L\",
+                                    CODE_LABEL_NUMBER (xoperands[1]));
+         output_asm_insn (\"ldo R'%l0-%l1(%%r1),%%r1\", xoperands);
+       }
+      else
+       {
+         output_asm_insn (\"{bl|b,l} .+8,%%r1\", xoperands);
+         output_asm_insn (\"addil L'%l0-$PIC_pcrel$0+4,%%r1\", xoperands);
+         output_asm_insn (\"ldo R'%l0-$PIC_pcrel$0+8(%%r1),%%r1\", xoperands);
+       }
+      output_asm_insn (\"bv %%r0(%%r1)\", xoperands);
      }
    else
      output_asm_insn (\"ldil L'%l0,%%r1\\n\\tbe R'%l0(%%sr4,%%r1)\", operands);;
@@ -5856,18 +6063,19 @@
    [(set_attr "type" "call")
     (set (attr "length")
  ;;       If we're sure that we can either reach the target or that the
-;;      linker can use a long-branch stub, then the length is 4 bytes.
+;;      linker can use a long-branch stub, then the length is at most
+;;      8 bytes.
  ;;
-;;      For long-calls the length will be either 52 bytes (non-pic)
-;;      or 68 bytes (pic).  */
+;;      For long-calls the length will be at most 68 bytes (non-pic)
+;;      or 84 bytes (pic).
  ;;      Else we have to use a long-call;
        (if_then_else (lt (plus (symbol_ref "total_code_bytes") (pc))
                         (const_int 240000))
-                   (const_int 4)
+                   (const_int 8)
                     (if_then_else (eq (symbol_ref "flag_pic")
                                       (const_int 0))
-                                 (const_int 52)
-                                 (const_int 68))))])
+                                 (const_int 68)
+                                 (const_int 84))))])
  
  (define_insn "call_internal_reg_64bit"
    [(call (mem:SI (match_operand:DI 0 "register_operand" "r"))
@@ -5917,12 +6125,22 @@
  
    /* If we're generating PIC code.  */
    xoperands[0] = operands[0];
-  xoperands[1] = gen_label_rtx ();
+  if (TARGET_SOM || ! TARGET_GAS)
+    xoperands[1] = gen_label_rtx ();
    output_asm_insn (\"{bl|b,l} .+8,%%r1\", xoperands);
-  output_asm_insn (\"addil L%%$$dyncall-%1,%%r1\", xoperands);
-  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, \"L\",
-                            CODE_LABEL_NUMBER (xoperands[1]));
-  output_asm_insn (\"ldo R%%$$dyncall-%1(%%r1),%%r1\", xoperands);
+  if (TARGET_SOM || ! TARGET_GAS)
+    {
+      output_asm_insn (\"addil L%%$$dyncall-%1,%%r1\", xoperands);
+      ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, \"L\",
+                                CODE_LABEL_NUMBER (xoperands[1]));
+      output_asm_insn (\"ldo R%%$$dyncall-%1(%%r1),%%r1\", xoperands);
+    }
+  else
+    {
+      output_asm_insn (\"addil L%%$$dyncall-$PIC_pcrel$0+4,%%r1\", xoperands);
+      output_asm_insn (\"ldo R%%$$dyncall-$PIC_pcrel$0+8(%%r1),%%r1\",
+                      xoperands);
+    }
    output_asm_insn (\"blr %%r0,%%r2\", xoperands);
    output_asm_insn (\"bv,n %%r0(%%r1)\\n\\tnop\", xoperands);
    return \"\";
@@ -6029,18 +6247,19 @@
    [(set_attr "type" "call")
     (set (attr "length")
  ;;       If we're sure that we can either reach the target or that the
-;;      linker can use a long-branch stub, then the length is 4 bytes.
+;;      linker can use a long-branch stub, then the length is at most
+;;      8 bytes.
  ;;
-;;      For long-calls the length will be either 52 bytes (non-pic)
-;;      or 68 bytes (pic).  */
+;;      For long-calls the length will be at most 68 bytes (non-pic)
+;;      or 84 bytes (pic).
  ;;      Else we have to use a long-call;
        (if_then_else (lt (plus (symbol_ref "total_code_bytes") (pc))
                         (const_int 240000))
-                   (const_int 4)
+                   (const_int 8)
                     (if_then_else (eq (symbol_ref "flag_pic")
                                       (const_int 0))
-                                 (const_int 52)
-                                 (const_int 68))))])
+                                 (const_int 68)
+                                 (const_int 84))))])
  
  (define_insn "call_value_internal_reg_64bit"
    [(set (match_operand 0 "" "=rf")
@@ -6092,12 +6311,22 @@
  
    /* If we're generating PIC code.  */
    xoperands[0] = operands[1];
-  xoperands[1] = gen_label_rtx ();
+  if (TARGET_SOM || ! TARGET_GAS)
+    xoperands[1] = gen_label_rtx ();
    output_asm_insn (\"{bl|b,l} .+8,%%r1\", xoperands);
-  output_asm_insn (\"addil L%%$$dyncall-%1,%%r1\", xoperands);
-  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, \"L\",
-                            CODE_LABEL_NUMBER (xoperands[1]));
-  output_asm_insn (\"ldo R%%$$dyncall-%1(%%r1),%%r1\", xoperands);
+  if (TARGET_SOM || ! TARGET_GAS)
+    {
+      output_asm_insn (\"addil L%%$$dyncall-%1,%%r1\", xoperands);
+      ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, \"L\",
+                                CODE_LABEL_NUMBER (xoperands[1]));
+      output_asm_insn (\"ldo R%%$$dyncall-%1(%%r1),%%r1\", xoperands);
+    }
+  else
+    {
+      output_asm_insn (\"addil L%%$$dyncall-$PIC_pcrel$0+4,%%r1\", xoperands);
+      output_asm_insn (\"ldo R%%$$dyncall-$PIC_pcrel$0+8(%%r1),%%r1\",
+                      xoperands);
+    }
    output_asm_insn (\"blr %%r0,%%r2\", xoperands);
    output_asm_insn (\"bv,n %%r0(%%r1)\\n\\tnop\", xoperands);
    return \"\";
@@ -6200,18 +6429,19 @@
    [(set_attr "type" "call")
     (set (attr "length")
  ;;       If we're sure that we can either reach the target or that the
-;;      linker can use a long-branch stub, then the length is 4 bytes.
+;;      linker can use a long-branch stub, then the length is at most
+;;      8 bytes.
  ;;
-;;      For long-calls the length will be either 52 bytes (non-pic)
-;;      or 68 bytes (pic).  */
+;;      For long-calls the length will be at most 68 bytes (non-pic)
+;;      or 84 bytes (pic).
  ;;      Else we have to use a long-call;
        (if_then_else (lt (plus (symbol_ref "total_code_bytes") (pc))
                         (const_int 240000))
-                   (const_int 4)
+                   (const_int 8)
                     (if_then_else (eq (symbol_ref "flag_pic")
                                       (const_int 0))
-                                 (const_int 52)
-                                 (const_int 68))))])
+                                 (const_int 68)
+                                 (const_int 84))))])
  
  (define_expand "sibcall_value"
    [(parallel [(set (match_operand 0 "" "")
@@ -6258,18 +6488,19 @@
    [(set_attr "type" "call")
     (set (attr "length")
  ;;       If we're sure that we can either reach the target or that the
-;;      linker can use a long-branch stub, then the length is 4 bytes.
+;;      linker can use a long-branch stub, then the length is at most
+;;      8 bytes.
  ;;
-;;      For long-calls the length will be either 52 bytes (non-pic)
-;;      or 68 bytes (pic).  */
+;;      For long-calls the length will be at most 68 bytes (non-pic)
+;;      or 84 bytes (pic).
  ;;      Else we have to use a long-call;
        (if_then_else (lt (plus (symbol_ref "total_code_bytes") (pc))
                         (const_int 240000))
-                   (const_int 4)
+                   (const_int 8)
                     (if_then_else (eq (symbol_ref "flag_pic")
                                       (const_int 0))
-                                 (const_int 52)
-                                 (const_int 68))))])
+                                 (const_int 68)
+                                 (const_int 84))))])
  
  (define_insn "nop"
    [(const_int 0)]
@@ -6390,8 +6621,13 @@
      emit_insn (gen_extzv_64 (operands[0], operands[1],
                              operands[2], operands[3]));
    else
-    emit_insn (gen_extzv_32 (operands[0], operands[1],
-                            operands[2], operands[3]));
+    {
+      if (! uint5_operand (operands[2], SImode)
+         || ! uint5_operand (operands[3], SImode))
+       FAIL;
+      emit_insn (gen_extzv_32 (operands[0], operands[1],
+                              operands[2], operands[3]));
+    }
    DONE;
  }")
  
@@ -6447,8 +6683,13 @@
      emit_insn (gen_extv_64 (operands[0], operands[1],
                             operands[2], operands[3]));
    else
-    emit_insn (gen_extv_32 (operands[0], operands[1],
-                           operands[2], operands[3]));
+    {
+      if (! uint5_operand (operands[2], SImode)
+         || ! uint5_operand (operands[3], SImode))
+       FAIL;
+      emit_insn (gen_extv_32 (operands[0], operands[1],
+                             operands[2], operands[3]));
+    }
    DONE;
  }")
  
@@ -6505,8 +6746,13 @@
      emit_insn (gen_insv_64 (operands[0], operands[1],
                             operands[2], operands[3]));
    else
-    emit_insn (gen_insv_32 (operands[0], operands[1],
-                           operands[2], operands[3]));
+    {
+      if (! uint5_operand (operands[2], SImode)
+         || ! uint5_operand (operands[3], SImode))
+       FAIL;
+      emit_insn (gen_insv_32 (operands[0], operands[1],
+                             operands[2], operands[3]));
+    }
    DONE;
  }")
  
@@ -7216,16 +7462,18 @@
  ;; restore the PIC register.
  (define_expand "exception_receiver"
    [(const_int 4)]
-  "!TARGET_PORTABLE_RUNTIME && flag_pic"
+  "flag_pic"
    "
  {
-  /* Load the PIC register from the stack slot (in our caller's
-     frame).  */
-  emit_move_insn (pic_offset_table_rtx,
-                 gen_rtx_MEM (SImode,
-                              plus_constant (stack_pointer_rtx, -32)));
-  emit_insn (gen_rtx (USE, VOIDmode, pic_offset_table_rtx));
-  emit_insn (gen_blockage ());
+  /* On the 64-bit port, we need a blockage because there is
+     confusion regarding the dependence of the restore on the
+     frame pointer.  As a result, the frame pointer and PIC
+     register restores are sometimes interchanged erroneously.  */
+  if (TARGET_64BIT)
+    emit_insn (gen_blockage ());
+  /* Restore the PIC register using hppa_pic_save_rtx ().  The
+     PIC register is not saved in the frame in the 64-bit ABI.  */
+  emit_move_insn (pic_offset_table_rtx, hppa_pic_save_rtx ());
    DONE;
  }")
  
@@ -7234,6 +7482,8 @@
    "flag_pic"
    "
  {
+  if (TARGET_64BIT)
+    emit_insn (gen_blockage ());
    /* Restore the PIC register.  Hopefully, this will always be from
       a stack slot.  The only registers that are valid after a
       builtin_longjmp are the stack and frame pointers.  */