;;
;; FIXME: Add 800 scheduling for completeness?
;; Processor type (for scheduling, not code generation -- see TARGET_*
;; macros for that).  The "7300" variant is new relative to the old list.
(define_attr "cpu" "700,7100,7100LC,7200,7300,8000" (const (symbol_ref "pa_cpu_attr")))
;; Length (in # of bytes).
(define_attr "length" ""
(const_int 0)))
[(eq_attr "in_branch_delay" "true") (nil) (nil)])
;; Memory. Disregarding Cache misses, the Mustang memory times are:
;; load: 2, fpload: 3
;; store, fpstore: 3, no D-cache operations should be scheduled.

;; The Timex (aka 700) has two floating-point units: ALU, and MUL/DIV/SQRT.
;; Timings:
;; Instruction	Time	Unit	Minimum Distance (unit contention)
;; fdiv,dbl	12	MPY	12
;; fsqrt,sgl	14	MPY	14
;; fsqrt,dbl	18	MPY	18
;;
;; We don't model fmpyadd/fmpysub properly as those instructions
;; keep both the FP ALU and MPY units busy.  Given that these
;; processors are obsolete, I'm not going to spend the time to
;; model those instructions correctly.

(define_automaton "pa700")
(define_cpu_unit "dummy_700,mem_700,fpalu_700,fpmpy_700" "pa700")

;; FP compare: 4 cycle latency, FP ALU busy for 2 cycles.
(define_insn_reservation "W0" 4
  (and (eq_attr "type" "fpcc")
       (eq_attr "cpu" "700"))
  "fpalu_700*2")

;; FP ALU op: 3 cycle latency, FP ALU busy for 2 cycles.
(define_insn_reservation "W1" 3
  (and (eq_attr "type" "fpalu")
       (eq_attr "cpu" "700"))
  "fpalu_700*2")

;; FP multiply: 3 cycle latency, MPY unit busy for 2 cycles.
(define_insn_reservation "W2" 3
  (and (eq_attr "type" "fpmulsgl,fpmuldbl")
       (eq_attr "cpu" "700"))
  "fpmpy_700*2")

;; Divides and square roots are unpipelined; they monopolize the MPY
;; unit for their full latency.
(define_insn_reservation "W3" 10
  (and (eq_attr "type" "fpdivsgl")
       (eq_attr "cpu" "700"))
  "fpmpy_700*10")

(define_insn_reservation "W4" 12
  (and (eq_attr "type" "fpdivdbl")
       (eq_attr "cpu" "700"))
  "fpmpy_700*12")

(define_insn_reservation "W5" 14
  (and (eq_attr "type" "fpsqrtsgl")
       (eq_attr "cpu" "700"))
  "fpmpy_700*14")

(define_insn_reservation "W6" 18
  (and (eq_attr "type" "fpsqrtdbl")
       (eq_attr "cpu" "700"))
  "fpmpy_700*18")

;; Loads have a 2 cycle latency; stores keep the memory unit busy
;; for 3 cycles so no other D-cache operation can be scheduled.
(define_insn_reservation "W7" 2
  (and (eq_attr "type" "load")
       (eq_attr "cpu" "700"))
  "mem_700")

(define_insn_reservation "W8" 2
  (and (eq_attr "type" "fpload")
       (eq_attr "cpu" "700"))
  "mem_700")

(define_insn_reservation "W9" 3
  (and (eq_attr "type" "store")
       (eq_attr "cpu" "700"))
  "mem_700*3")

(define_insn_reservation "W10" 3
  (and (eq_attr "type" "fpstore")
       (eq_attr "cpu" "700"))
  "mem_700*3")

;; Everything else issues in one cycle on the dummy unit.
(define_insn_reservation "W11" 1
  (and (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpdivdbl,fpsqrtsgl,fpsqrtdbl,load,fpload,store,fpstore")
       (eq_attr "cpu" "700"))
  "dummy_700")

;; We have a bypass for all computations in the FP unit which feed an
;; FP store as long as the sizes are the same.
(define_bypass 2 "W1,W2" "W10" "hppa_fpstore_bypass_p")
(define_bypass 9 "W3" "W10" "hppa_fpstore_bypass_p")
(define_bypass 11 "W4" "W10" "hppa_fpstore_bypass_p")
(define_bypass 13 "W5" "W10" "hppa_fpstore_bypass_p")
(define_bypass 17 "W6" "W10" "hppa_fpstore_bypass_p")

;; We have an "anti-bypass" for FP loads which feed an FP store.
(define_bypass 4 "W8" "W10" "hppa_fpstore_bypass_p")
;; Function units for the 7100 and 7150.  The 7100/7150 can dual-issue
;; floating point computations with non-floating point computations (fp loads
;; and stores are not fp computations).
;;
;; Memory. Disregarding Cache misses, memory loads take two cycles; stores also
;; take two cycles, during which no Dcache operations should be scheduled.
;; Any special cases are handled in pa_adjust_cost.  The 7100, 7150 and 7100LC
;; all have the same memory characteristics if one disregards cache misses.
;;
;; The 7100/7150 has three floating-point units: ALU, MUL, and DIV.
;; There's no value in modeling the ALU and MUL separately though
;; since there can never be a functional unit conflict given the
;; latency and issue rates for those units.
;;
;; Timings:
;; Instruction	Time	Unit	Minimum Distance (unit contention)
;; fcpy	2	ALU	1
;; fsqrt,sgl	8	DIV	8
;; fsqrt,dbl	15	DIV	15

(define_automaton "pa7100")
(define_cpu_unit "i_7100, f_7100,fpmac_7100,fpdivsqrt_7100,mem_7100" "pa7100")

;; FP ALU/compare/multiply: 2 cycle latency, one new op per cycle.
(define_insn_reservation "X0" 2
  (and (eq_attr "type" "fpcc,fpalu,fpmulsgl,fpmuldbl")
       (eq_attr "cpu" "7100"))
  "f_7100,fpmac_7100")

;; Single precision div/sqrt: unpipelined, 8 cycles on the DIV unit.
(define_insn_reservation "X1" 8
  (and (eq_attr "type" "fpdivsgl,fpsqrtsgl")
       (eq_attr "cpu" "7100"))
  "f_7100+fpdivsqrt_7100,fpdivsqrt_7100*7")

;; Double precision div/sqrt: unpipelined, 15 cycles on the DIV unit.
(define_insn_reservation "X2" 15
  (and (eq_attr "type" "fpdivdbl,fpsqrtdbl")
       (eq_attr "cpu" "7100"))
  "f_7100+fpdivsqrt_7100,fpdivsqrt_7100*14")

;; Loads: 2 cycle latency, memory unit busy one cycle.
(define_insn_reservation "X3" 2
  (and (eq_attr "type" "load")
       (eq_attr "cpu" "7100"))
  "i_7100+mem_7100")

(define_insn_reservation "X4" 2
  (and (eq_attr "type" "fpload")
       (eq_attr "cpu" "7100"))
  "i_7100+mem_7100")

;; Stores keep the memory unit busy a second cycle; no other
;; D-cache operation can be scheduled in that slot.
(define_insn_reservation "X5" 2
  (and (eq_attr "type" "store")
       (eq_attr "cpu" "7100"))
  "i_7100+mem_7100,mem_7100")

(define_insn_reservation "X6" 2
  (and (eq_attr "type" "fpstore")
       (eq_attr "cpu" "7100"))
  "i_7100+mem_7100,mem_7100")

;; Everything else is a single-cycle integer-side operation.
(define_insn_reservation "X7" 1
  (and (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl,load,fpload,store,fpstore")
       (eq_attr "cpu" "7100"))
  "i_7100")

;; We have a bypass for all computations in the FP unit which feed an
;; FP store as long as the sizes are the same.
(define_bypass 1 "X0" "X6" "hppa_fpstore_bypass_p")
(define_bypass 7 "X1" "X6" "hppa_fpstore_bypass_p")
(define_bypass 14 "X2" "X6" "hppa_fpstore_bypass_p")

;; We have an "anti-bypass" for FP loads which feed an FP store.
(define_bypass 3 "X4" "X6" "hppa_fpstore_bypass_p")
;; The 7100LC has three floating-point units: ALU, MUL, and DIV.
;; There's no value in modeling the ALU and MUL separately though
;; since there can never be a functional unit conflict that
;; can be avoided given the latency, issue rates and mandatory
;; one cycle cpu-wide lock for a double precision fp multiply.
;;
;; Timings:
;; Instruction	Time	Unit	Minimum Distance (unit contention)
;; fcpy	2	ALU	1
;; fdiv,dbl	15	DIV	15
;; fsqrt,sgl	8	DIV	8
;; fsqrt,dbl	15	DIV	15
;;
;; The PA7200 is just like the PA7100LC except that there is
;; no store-store penalty.
;;
;; The PA7300 is just like the PA7200 except that there is
;; no store-load penalty.
;;
;; Note there are some aspects of the 7100LC we are not modeling
;; at the moment.  I'll be reviewing the 7100LC scheduling info
;; shortly and updating this description.
;;
;;   load-load pairs
;;   store-store pairs
;;   other issue modeling

(define_automaton "pa7100lc")
(define_cpu_unit "i0_7100lc, i1_7100lc, f_7100lc" "pa7100lc")
(define_cpu_unit "fpmac_7100lc" "pa7100lc")
(define_cpu_unit "mem_7100lc" "pa7100lc")

;; Double precision multiplies lock the entire CPU for one
;; cycle.  There is no way to avoid this lock and trying to
;; schedule around the lock is pointless and thus there is no
;; value in trying to model this lock.
;;
;; Not modeling the lock allows us to treat fp multiplies just
;; like any other FP alu instruction.  It allows for a smaller
;; DFA and may reduce register pressure.
(define_insn_reservation "Y0" 2
  (and (eq_attr "type" "fpcc,fpalu,fpmulsgl,fpmuldbl")
       (eq_attr "cpu" "7100LC,7200,7300"))
  "f_7100lc,fpmac_7100lc")

;; fp division and sqrt instructions lock the entire CPU for
;; 7 cycles (single precision) or 14 cycles (double precision).
;; There is no way to avoid this lock and trying to schedule
;; around the lock is pointless and thus there is no value in
;; trying to model this lock.  Not modeling the lock allows
;; for a smaller DFA and may reduce register pressure.
(define_insn_reservation "Y1" 1
  (and (eq_attr "type" "fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl")
       (eq_attr "cpu" "7100LC,7200,7300"))
  "f_7100lc")

;; Loads: 2 cycle latency on the memory-capable integer pipe.
(define_insn_reservation "Y2" 2
  (and (eq_attr "type" "load")
       (eq_attr "cpu" "7100LC,7200,7300"))
  "i1_7100lc+mem_7100lc")

(define_insn_reservation "Y3" 2
  (and (eq_attr "type" "fpload")
       (eq_attr "cpu" "7100LC,7200,7300"))
  "i1_7100lc+mem_7100lc")

;; 7100LC stores occupy the memory unit a second cycle
;; (store-store penalty).
(define_insn_reservation "Y4" 2
  (and (eq_attr "type" "store")
       (eq_attr "cpu" "7100LC"))
  "i1_7100lc+mem_7100lc,mem_7100lc")

(define_insn_reservation "Y5" 2
  (and (eq_attr "type" "fpstore")
       (eq_attr "cpu" "7100LC"))
  "i1_7100lc+mem_7100lc,mem_7100lc")

;; Shifts and memory ops execute in only one of the integer ALUs.
(define_insn_reservation "Y6" 1
  (and (eq_attr "type" "shift,nullshift")
       (eq_attr "cpu" "7100LC,7200,7300"))
  "i1_7100lc")

;; Everything else can go down either integer pipe.
(define_insn_reservation "Y7" 1
  (and (eq_attr "type" "!fpcc,fpalu,fpmulsgl,fpmuldbl,fpdivsgl,fpsqrtsgl,fpdivdbl,fpsqrtdbl,load,fpload,store,fpstore,shift,nullshift")
       (eq_attr "cpu" "7100LC,7200,7300"))
  "(i0_7100lc|i1_7100lc)")

;; The 7200 has a store-load penalty
(define_insn_reservation "Y8" 2
  (and (eq_attr "type" "store")
       (eq_attr "cpu" "7200"))
  "i1_7100lc,mem_7100lc")

(define_insn_reservation "Y9" 2
  (and (eq_attr "type" "fpstore")
       (eq_attr "cpu" "7200"))
  "i1_7100lc,mem_7100lc")

;; The 7300 has no penalty for store-store or store-load
(define_insn_reservation "Y10" 2
  (and (eq_attr "type" "store")
       (eq_attr "cpu" "7300"))
  "i1_7100lc")

(define_insn_reservation "Y11" 2
  (and (eq_attr "type" "fpstore")
       (eq_attr "cpu" "7300"))
  "i1_7100lc")

;; We have an "anti-bypass" for FP loads which feed an FP store.
(define_bypass 3 "Y3" "Y5,Y9,Y11" "hppa_fpstore_bypass_p")
;; Scheduling for the PA8000 is somewhat different than scheduling for a
;; traditional architecture.
;;
;; The PA8000 has a large (56) entry reorder buffer that is split between
;; memory and non-memory operations.
;;
;; The PA8000 can issue two memory and two non-memory operations per cycle to
;; the function units, with the exception of branches and multi-output
;; instructions.  The PA8000 can retire two non-memory operations per cycle
;; and two memory operations per cycle, only one of which may be a store.
;;
;; Given the large reorder buffer, the processor can hide most latencies.
;; According to HP, they've got the best results by scheduling for retirement
;; bandwidth with limited latency scheduling for floating point operations.
;; Latency for integer operations and memory references is ignored.
;;
;;
;; We claim floating point operations have a 2 cycle latency and are
;; fully pipelined, except for div and sqrt which are not pipelined and
;; take from 17 to 31 cycles to complete.
;;
;; It's worth noting that there is no way to saturate all the functional
;; units on the PA8000 as there is not enough issue bandwidth.

(define_automaton "pa8000")
(define_cpu_unit "inm0_8000, inm1_8000, im0_8000, im1_8000" "pa8000")
(define_cpu_unit "rnm0_8000, rnm1_8000, rm0_8000, rm1_8000" "pa8000")
(define_cpu_unit "store_8000" "pa8000")
(define_cpu_unit "f0_8000, f1_8000" "pa8000")
(define_cpu_unit "fdivsqrt0_8000, fdivsqrt1_8000" "pa8000")
(define_reservation "inm_8000" "inm0_8000 | inm1_8000")
(define_reservation "im_8000" "im0_8000 | im1_8000")
(define_reservation "rnm_8000" "rnm0_8000 | rnm1_8000")
(define_reservation "rm_8000" "rm0_8000 | rm1_8000")
(define_reservation "f_8000" "f0_8000 | f1_8000")
(define_reservation "fdivsqrt_8000" "fdivsqrt0_8000 | fdivsqrt1_8000")

;; We can issue any two memops per cycle, but we can only retire
;; one memory store per cycle.  We assume that the reorder buffer
;; will hide any memory latencies per HP's recommendation.
(define_insn_reservation "Z0" 0
  (and
    (eq_attr "type" "load,fpload")
    (eq_attr "cpu" "8000"))
  "im_8000,rm_8000")

(define_insn_reservation "Z1" 0
  (and
    (eq_attr "type" "store,fpstore")
    (eq_attr "cpu" "8000"))
  "im_8000,rm_8000+store_8000")

;; We can issue and retire two non-memory operations per cycle with
;; a few exceptions (branches).  This group catches those we want
;; to assume have zero latency.
(define_insn_reservation "Z2" 0
  (and
    (eq_attr "type" "!load,fpload,store,fpstore,uncond_branch,branch,cbranch,fbranch,call,dyncall,multi,milli,parallel_branch,fpcc,fpalu,fpmulsgl,fpmuldbl,fpsqrtsgl,fpsqrtdbl,fpdivsgl,fpdivdbl")
    (eq_attr "cpu" "8000"))
  "inm_8000,rnm_8000")

;; Branches use both slots in the non-memory issue and
;; retirement unit.
(define_insn_reservation "Z3" 0
  (and
    (eq_attr "type" "uncond_branch,branch,cbranch,fbranch,call,dyncall,multi,milli,parallel_branch")
    (eq_attr "cpu" "8000"))
  "inm0_8000+inm1_8000,rnm0_8000+rnm1_8000")

;; We partial latency schedule the floating point units.
;; They can issue/retire two at a time in the non-memory
;; units.  We fix their latency at 2 cycles and they
;; are fully pipelined.
(define_insn_reservation "Z4" 1
  (and
    (eq_attr "type" "fpcc,fpalu,fpmulsgl,fpmuldbl")
    (eq_attr "cpu" "8000"))
  "inm_8000,f_8000,rnm_8000")

;; The fdivsqrt units are not pipelined and have a very long latency.
;; To keep the DFA from exploding, we do not show all the
;; reservations for the divsqrt unit.
(define_insn_reservation "Z5" 17
  (and
    (eq_attr "type" "fpdivsgl,fpsqrtsgl")
    (eq_attr "cpu" "8000"))
  "inm_8000,fdivsqrt_8000*6,rnm_8000")

(define_insn_reservation "Z6" 31
  (and
    (eq_attr "type" "fpdivdbl,fpsqrtdbl")
    (eq_attr "cpu" "8000"))
  "inm_8000,fdivsqrt_8000*6,rnm_8000")
\f
;; Compare instructions.
[(set_attr "length" "4")
(set_attr "type" "fpcc")])
;; The following two patterns are optimization placeholders.  In almost
;; all cases, the user of the condition code will be simplified and the
;; original condition code setting insn should be eliminated.

(define_insn "*setccfp0"
  [(set (reg:CCFP 0)
	(const_int 0))]
  "! TARGET_SOFT_FLOAT"
  "fcmp,dbl,!= %%fr0,%%fr0"
  [(set_attr "length" "4")
   (set_attr "type" "fpcc")])

(define_insn "*setccfp1"
  [(set (reg:CCFP 0)
	(const_int 1))]
  "! TARGET_SOFT_FLOAT"
  "fcmp,dbl,= %%fr0,%%fr0"
  [(set_attr "length" "4")
   (set_attr "type" "fpcc")])
;; scc insns.
(define_expand "seq"
""
"*
{
  rtx xoperands[3];
  extern FILE *asm_out_file;

  xoperands[0] = operands[0];
  xoperands[1] = operands[1];
  if (TARGET_SOM || ! TARGET_GAS)
    xoperands[2] = gen_label_rtx ();

  output_asm_insn (\"{bl|b,l} .+8,%0\", xoperands);
  output_asm_insn (\"{depi|depwi} 0,31,2,%0\", xoperands);
  if (TARGET_SOM || ! TARGET_GAS)
    ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, \"L\",
			       CODE_LABEL_NUMBER (xoperands[2]));
  /* If we're trying to load the address of a label that happens to be
     close, then we can use a shorter sequence.  */
    {
      /* Prefixing with R% here is wrong, it extracts just 11 bits and is
	 always non-negative.  */
      if (TARGET_SOM || ! TARGET_GAS)
	output_asm_insn (\"ldo %1-%2(%0),%0\", xoperands);
      else
	output_asm_insn (\"ldo %1-$PIC_pcrel$0+8(%0),%0\", xoperands);
    }
  else
    {
      if (TARGET_SOM || ! TARGET_GAS)
	{
	  output_asm_insn (\"addil L%%%1-%2,%0\", xoperands);
	  output_asm_insn (\"ldo R%%%1-%2(%0),%0\", xoperands);
	}
      else
	{
	  output_asm_insn (\"addil L%%%1-$PIC_pcrel$0+8,%0\", xoperands);
	  output_asm_insn (\"ldo R%%%1-$PIC_pcrel$0+12(%0),%0\",
			   xoperands);
	}
    }
  return \"\";
}"
;; 64-bit add expander.  Operand 2 accepts the wider adddi3_operand
;; predicate so one trivial expander serves both PA32 and PA64; the
;; matching insns constrain it further.
(define_expand "adddi3"
  [(set (match_operand:DI 0 "register_operand" "")
	(plus:DI (match_operand:DI 1 "register_operand" "")
		 (match_operand:DI 2 "adddi3_operand" "")))]
  ""
  "")
(define_insn ""
  [(set (match_operand:DI 0 "register_operand" "=r")
	(plus:DI (match_operand:DI 1 "register_operand" "%r")
		 (match_operand:DI 2 "arith11_operand" "rI")))]
"!TARGET_64BIT"
"*
{
emit_insn (gen_negdf2_fast (operands[0], operands[1]));
  else
    {
      operands[2] = force_reg (DFmode,
			       CONST_DOUBLE_FROM_REAL_VALUE (dconstm1, DFmode));
      emit_insn (gen_muldf3 (operands[0], operands[1], operands[2]));
    }
  DONE;
emit_insn (gen_negsf2_fast (operands[0], operands[1]));
  else
    {
      operands[2] = force_reg (SFmode,
			       CONST_DOUBLE_FROM_REAL_VALUE (dconstm1, SFmode));
      emit_insn (gen_mulsf3 (operands[0], operands[1], operands[2]));
    }
  DONE;
[(set_attr "type" "branch")
(set_attr "length" "4")])
;; Use the PIC register to ensure it's restored after a
;; call in PIC mode.  This is used for eh returns which
;; bypass the return stub.
(define_insn "return_external_pic"
  [(return)
   (use (match_operand 0 "register_operand" "r"))
   (use (reg:SI 2))
   (clobber (reg:SI 1))]
  "flag_pic
   && current_function_calls_eh_return
   && true_regnum (operands[0]) == PIC_OFFSET_TABLE_REGNUM"
  "ldsid (%%sr0,%%r2),%%r1\;mtsp %%r1,%%sr0\;be%* 0(%%sr0,%%r2)"
  [(set_attr "type" "branch")
   (set_attr "length" "12")])

(define_expand "prologue"
[(const_int 0)]
""
/* Try to use the trivial return first.  Else use the full
   epilogue.  */
  if (hppa_can_use_return_insn_p ())
    emit_jump_insn (gen_return ());
  else
    {
      rtx x;

      hppa_expand_epilogue ();
      if (flag_pic)
	{
	  rtx pic = gen_rtx_REG (word_mode, PIC_OFFSET_TABLE_REGNUM);

	  /* EH returns bypass the normal return stub.  Thus, we must do an
	     interspace branch to return from functions that call eh_return.
	     This is only a problem for returns from shared code.  */
	  if (current_function_calls_eh_return)
	    x = gen_return_external_pic (pic);
	  else
	    x = gen_return_internal_pic (pic);
	}
      else
	x = gen_return_internal ();
      emit_jump_insn (x);
{
  rtx xoperands[2];

  xoperands[0] = operands[0];
  if (TARGET_SOM || ! TARGET_GAS)
    {
      xoperands[1] = gen_label_rtx ();
      output_asm_insn (\"{bl|b,l} .+8,%%r1\\n\\taddil L'%l0-%l1,%%r1\",
		       xoperands);
      ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, \"L\",
				 CODE_LABEL_NUMBER (xoperands[1]));
      output_asm_insn (\"ldo R'%l0-%l1(%%r1),%%r1\", xoperands);
    }
  else
    {
      output_asm_insn (\"{bl|b,l} .+8,%%r1\", xoperands);
      output_asm_insn (\"addil L'%l0-$PIC_pcrel$0+4,%%r1\", xoperands);
      output_asm_insn (\"ldo R'%l0-$PIC_pcrel$0+8(%%r1),%%r1\", xoperands);
    }
  output_asm_insn (\"bv %%r0(%%r1)\", xoperands);
}
else
  output_asm_insn (\"ldil L'%l0,%%r1\\n\\tbe R'%l0(%%sr4,%%r1)\", operands);
[(set_attr "type" "call")
 (set (attr "length")
;; If we're sure that we can either reach the target or that the
;; linker can use a long-branch stub, then the length is at most
;; 8 bytes.
;;
;; For long-calls the length will be at most 68 bytes (non-pic)
;; or 84 bytes (pic). */
;; Else we have to use a long-call;
      (if_then_else (lt (plus (symbol_ref "total_code_bytes") (pc))
			(const_int 240000))
		    (const_int 8)
		    (if_then_else (eq (symbol_ref "flag_pic")
				      (const_int 0))
				  (const_int 68)
				  (const_int 84))))])
(define_insn "call_internal_reg_64bit"
[(call (mem:SI (match_operand:DI 0 "register_operand" "r"))
"TARGET_64BIT"
"*
{
  /* ??? Needs more work.  Length computation, split into multiple insns,
     do not use %r22 directly, expose delay slot.  */
  return \"ldd 16(%0),%%r2\;ldd 24(%0),%%r27\;bve,l (%%r2),%%r2\;nop\";
/* If we're generating PIC code.  */
  xoperands[0] = operands[0];
  if (TARGET_SOM || ! TARGET_GAS)
    xoperands[1] = gen_label_rtx ();
  output_asm_insn (\"{bl|b,l} .+8,%%r1\", xoperands);
  if (TARGET_SOM || ! TARGET_GAS)
    {
      output_asm_insn (\"addil L%%$$dyncall-%1,%%r1\", xoperands);
      ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, \"L\",
				 CODE_LABEL_NUMBER (xoperands[1]));
      output_asm_insn (\"ldo R%%$$dyncall-%1(%%r1),%%r1\", xoperands);
    }
  else
    {
      output_asm_insn (\"addil L%%$$dyncall-$PIC_pcrel$0+4,%%r1\", xoperands);
      output_asm_insn (\"ldo R%%$$dyncall-$PIC_pcrel$0+8(%%r1),%%r1\",
		       xoperands);
    }
  output_asm_insn (\"blr %%r0,%%r2\", xoperands);
  output_asm_insn (\"bv,n %%r0(%%r1)\\n\\tnop\", xoperands);
  return \"\";
[(set_attr "type" "call")
 (set (attr "length")
;; If we're sure that we can either reach the target or that the
;; linker can use a long-branch stub, then the length is at most
;; 8 bytes.
;;
;; For long-calls the length will be at most 68 bytes (non-pic)
;; or 84 bytes (pic). */
;; Else we have to use a long-call;
      (if_then_else (lt (plus (symbol_ref "total_code_bytes") (pc))
			(const_int 240000))
		    (const_int 8)
		    (if_then_else (eq (symbol_ref "flag_pic")
				      (const_int 0))
				  (const_int 68)
				  (const_int 84))))])
(define_insn "call_value_internal_reg_64bit"
[(set (match_operand 0 "" "=rf")
/* If we're generating PIC code.  */
  xoperands[0] = operands[1];
  if (TARGET_SOM || ! TARGET_GAS)
    xoperands[1] = gen_label_rtx ();
  output_asm_insn (\"{bl|b,l} .+8,%%r1\", xoperands);
  if (TARGET_SOM || ! TARGET_GAS)
    {
      output_asm_insn (\"addil L%%$$dyncall-%1,%%r1\", xoperands);
      ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, \"L\",
				 CODE_LABEL_NUMBER (xoperands[1]));
      output_asm_insn (\"ldo R%%$$dyncall-%1(%%r1),%%r1\", xoperands);
    }
  else
    {
      output_asm_insn (\"addil L%%$$dyncall-$PIC_pcrel$0+4,%%r1\", xoperands);
      output_asm_insn (\"ldo R%%$$dyncall-$PIC_pcrel$0+8(%%r1),%%r1\",
		       xoperands);
    }
  output_asm_insn (\"blr %%r0,%%r2\", xoperands);
  output_asm_insn (\"bv,n %%r0(%%r1)\\n\\tnop\", xoperands);
  return \"\";
[(set_attr "type" "call")
 (set (attr "length")
;; If we're sure that we can either reach the target or that the
;; linker can use a long-branch stub, then the length is at most
;; 8 bytes.
;;
;; For long-calls the length will be at most 68 bytes (non-pic)
;; or 84 bytes (pic). */
;; Else we have to use a long-call;
      (if_then_else (lt (plus (symbol_ref "total_code_bytes") (pc))
			(const_int 240000))
		    (const_int 8)
		    (if_then_else (eq (symbol_ref "flag_pic")
				      (const_int 0))
				  (const_int 68)
				  (const_int 84))))])
(define_expand "sibcall_value"
[(parallel [(set (match_operand 0 "" "")
[(set_attr "type" "call")
 (set (attr "length")
;; If we're sure that we can either reach the target or that the
;; linker can use a long-branch stub, then the length is at most
;; 8 bytes.
;;
;; For long-calls the length will be at most 68 bytes (non-pic)
;; or 84 bytes (pic). */
;; Else we have to use a long-call;
      (if_then_else (lt (plus (symbol_ref "total_code_bytes") (pc))
			(const_int 240000))
		    (const_int 8)
		    (if_then_else (eq (symbol_ref "flag_pic")
				      (const_int 0))
				  (const_int 68)
				  (const_int 84))))])
(define_insn "nop"
[(const_int 0)]
emit_insn (gen_extzv_64 (operands[0], operands[1],
			     operands[2], operands[3]));
  else
    {
      if (! uint5_operand (operands[2], SImode)
	  || ! uint5_operand (operands[3], SImode))
	FAIL;
      emit_insn (gen_extzv_32 (operands[0], operands[1],
			       operands[2], operands[3]));
    }
  DONE;
}")
emit_insn (gen_extv_64 (operands[0], operands[1],
			    operands[2], operands[3]));
  else
    {
      if (! uint5_operand (operands[2], SImode)
	  || ! uint5_operand (operands[3], SImode))
	FAIL;
      emit_insn (gen_extv_32 (operands[0], operands[1],
			      operands[2], operands[3]));
    }
  DONE;
}")
emit_insn (gen_insv_64 (operands[0], operands[1],
			    operands[2], operands[3]));
  else
    {
      if (! uint5_operand (operands[2], SImode)
	  || ! uint5_operand (operands[3], SImode))
	FAIL;
      emit_insn (gen_insv_32 (operands[0], operands[1],
			      operands[2], operands[3]));
    }
  DONE;
}")
(if_then_else
(match_operator 2 "comparison_operator"
[(plus:SI
(match_operand:SI 0 "reg_before_reload_operand" "+!r,!*f,*m")
(match_operand:SI 1 "int5_operand" "L,L,L"))
(const_int 0)])
(label_ref (match_operand 3 "" ""))
[(match_operand:SI 1 "register_operand" "r,r,r,r") (const_int 0)])
(label_ref (match_operand 3 "" ""))
(pc)))
(set (match_operand:SI 0 "reg_before_reload_operand" "=!r,!*f,*m,!*q")
(match_dup 1))]
""
"* return output_movb (operands, insn, which_alternative, 0); "
[(match_operand:SI 1 "register_operand" "r,r,r,r") (const_int 0)])
(pc)
(label_ref (match_operand 3 "" ""))))
(set (match_operand:SI 0 "reg_before_reload_operand" "=!r,!*f,*m,!*q")
(match_dup 1))]
""
"* return output_movb (operands, insn, which_alternative, 1); "
;; restore the PIC register.
(define_expand "exception_receiver"
  [(const_int 4)]
  "flag_pic"
  "
{
  /* On the 64-bit port, we need a blockage because there is
     confusion regarding the dependence of the restore on the
     frame pointer.  As a result, the frame pointer and pic
     register restores sometimes are interchanged erroneously.  */
  if (TARGET_64BIT)
    emit_insn (gen_blockage ());

  /* Restore the PIC register using hppa_pic_save_rtx ().  The
     PIC register is not saved in the frame in 64-bit ABI.  */
  emit_move_insn (pic_offset_table_rtx, hppa_pic_save_rtx ());
  DONE;
}")
"flag_pic"
  "
{
  if (TARGET_64BIT)
    emit_insn (gen_blockage ());

  /* Restore the PIC register.  Hopefully, this will always be from
     a stack slot.  The only registers that are valid after a
     builtin_longjmp are the stack and frame pointers.  */