1 ;; Scheduling for the Intel P6 family of processors
2 ;; Copyright (C) 2004, 2005 Free Software Foundation, Inc.
4 ;; This file is part of GCC.
6 ;; GCC is free software; you can redistribute it and/or modify
7 ;; it under the terms of the GNU General Public License as published by
8 ;; the Free Software Foundation; either version 2, or (at your option)
11 ;; GCC is distributed in the hope that it will be useful,
12 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ;; GNU General Public License for more details.
16 ;; You should have received a copy of the GNU General Public License
17 ;; along with GCC; see the file COPYING. If not, write to
18 ;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
;; Boston, MA 02110-1301, USA.
21 ;; The P6 family includes the Pentium Pro, Pentium II, Pentium III, Celeron
22 ;; and Xeon lines of CPUs. The DFA scheduler description in this file is
23 ;; based on information that can be found in the following three documents:
25 ;; "P6 Family of Processors Hardware Developer's Manual",
26 ;; Intel, September 1999.
28 ;; "Intel Architecture Optimization Manual",
29 ;; Intel, 1999 (Order Number: 245127-001).
31 ;; "How to optimize for the Pentium family of microprocessors",
34 ;; The P6 pipeline has three major components:
35 ;; 1) the FETCH/DECODE unit, an in-order issue front-end
36 ;; 2) the DISPATCH/EXECUTE unit, which is the out-of-order core
37 ;; 3) the RETIRE unit, an in-order retirement unit
39 ;; So, the P6 CPUs have out-of-order cores, but the instruction decoder and
40 ;; retirement unit are naturally in-order.
44 ;; L1 ICACHE L1 DCACHE
46 ;; DECODER0 DECODER1 DECODER2 DISP/EXEC RETIRE
48 ;; INSTRUCTION POOL __________|_______/
49 ;; (inc. reorder buffer)
51 ;; Since the P6 CPUs execute instructions out-of-order, the most important
52 ;; consideration in performance tuning is making sure enough micro-ops are
53 ;; ready for execution in the out-of-order core, while not stalling the
57 ;; - Find a less crude way to model complex instructions, in
58 ;; particular how many cycles they take to be decoded.
59 ;; - Include decoder latencies in the total reservation latencies.
60 ;; This isn't necessary right now because we assume for every
61 ;; instruction that it never blocks a decoder.
62 ;; - Figure out where the p0 and p1 reservations come from. These
63 ;; appear not to be in the manual
64 ;; - Lots more because I'm sure this is still far from optimal :-)
66 ;; The ppro_idiv and ppro_fdiv automata are used to model issue
67 ;; latencies of idiv and fdiv type insns.
;; Declare the automata used by this description: one for the decoders,
;; one for the core ports, one each for the integer and floating point
;; dividers, and one each for the load and store ports.
(define_automaton
  "ppro_decoder,ppro_core,ppro_idiv,ppro_fdiv,ppro_load,ppro_store")
70 ;; Simple instructions of the register-register form have only one uop.
71 ;; Load instructions are also only one uop. Store instructions decode to
72 ;; two uops, and simple read-modify instructions also take two uops.
73 ;; Simple instructions of the register-memory form have two to three uops.
74 ;; Simple read-modify-write instructions have four uops. The rules for
75 ;; the decoder are simple:
76 ;; - an instruction with 1 uop can be decoded by any of the three
77 ;; decoders in one cycle.
;; - an instruction with 2 to 4 uops can be decoded only by decoder 0
79 ;; but still in only one cycle.
80 ;; - a complex (microcode) instruction can also only be decoded by
81 ;; decoder 0, and this takes an unspecified number of cycles.
83 ;; The goal is to schedule such that we have a few-one-one uops sequence
84 ;; in each cycle, to decode as many instructions per cycle as possible.
;; The three instruction decoders.  All of them live in the
;; ppro_decoder automaton.
(define_cpu_unit "decoder0"
  "ppro_decoder")
(define_cpu_unit "decoder1"
  "ppro_decoder")
(define_cpu_unit "decoder2"
  "ppro_decoder")
89 ;; We first wish to find an instruction for decoder0, so exclude
90 ;; decoder1 and decoder2 from being reserved until decoder 0 is
;; decoder1 and decoder2 cannot be reserved unless decoder0 is
;; reserved, so decoder0 is always filled first.
(presence_set "decoder1"
  "decoder0")
(presence_set "decoder2"
  "decoder0")
95 ;; Most instructions can be decoded on any of the three decoders.
;; "decodern" is shorthand for "any one of the three decoders".
(define_reservation "decodern"
  "(decoder0|decoder1|decoder2)")
98 ;; The out-of-order core has five pipelines. During each cycle, the core
99 ;; may dispatch zero or one uop on the port of any of the five pipelines
;; so the maximum number of dispatched uops per cycle is 5. In practice,
101 ;; 3 uops per cycle is more realistic.
103 ;; Two of the five pipelines contain several execution units:
105 ;; Port 0 Port 1 Port 2 Port 3 Port 4
106 ;; ALU ALU LOAD SAC SDA
112 ;; (SAC=Store Address Calculation, SDA=Store Data Unit, P3FPU = SSE unit,
113 ;; JUE = Jump Execution Unit, AGU = Address Generation Unit)
;; Dispatch ports.  p0 and p1 belong to the core automaton; the load
;; port p2 and the store ports p3 and p4 are kept in automata of their
;; own.
(define_cpu_unit "p0,p1"
  "ppro_core")
(define_cpu_unit "p2"
  "ppro_load")
(define_cpu_unit "p3,p4"
  "ppro_store")

;; Divider units, one automaton each, used to model how long the
;; integer and floating point dividers stay busy.
(define_cpu_unit "idiv"
  "ppro_idiv")
(define_cpu_unit "fdiv"
  "ppro_fdiv")
121 ;; Only the irregular instructions have to be modeled here. A load
122 ;; increases the latency by 2 or 3, or by nothing if the manual gives
123 ;; a latency already. Store latencies are not accounted for.
125 ;; The simple instructions follow a very regular pattern of 1 uop per
;; reg-reg operation, 1 uop per load on port 2, and 2 uops per store
127 ;; on port 4 and port 3. These instructions are modelled at the bottom
130 ;; For microcoded instructions we don't know how many uops are produced.
131 ;; These instructions are the "complex" ones in the Intel manuals. All
132 ;; we _do_ know is that they typically produce four or more uops, so
133 ;; they can only be decoded on decoder0. Modelling their latencies
134 ;; doesn't make sense because we don't know how these instructions are
135 ;; executed in the core. So we just model that they can only be decoded
136 ;; on decoder 0, and say that it takes a little while before the result
138 (define_insn_reservation "ppro_complex_insn" 6
139 (and (eq_attr "cpu" "pentiumpro,generic32")
140 (eq_attr "type" "other,multi,call,callv,str"))
143 ;; imov with memory operands does not use the integer units.
144 (define_insn_reservation "ppro_imov" 1
145 (and (eq_attr "cpu" "pentiumpro,generic32")
146 (and (eq_attr "memory" "none")
147 (eq_attr "type" "imov")))
150 (define_insn_reservation "ppro_imov_load" 4
151 (and (eq_attr "cpu" "pentiumpro,generic32")
152 (and (eq_attr "memory" "load")
153 (eq_attr "type" "imov")))
156 (define_insn_reservation "ppro_imov_store" 1
157 (and (eq_attr "cpu" "pentiumpro,generic32")
158 (and (eq_attr "memory" "store")
159 (eq_attr "type" "imov")))
162 ;; imovx always decodes to one uop, and also doesn't use the integer
163 ;; units if it has memory operands.
164 (define_insn_reservation "ppro_imovx" 1
165 (and (eq_attr "cpu" "pentiumpro,generic32")
166 (and (eq_attr "memory" "none")
167 (eq_attr "type" "imovx")))
170 (define_insn_reservation "ppro_imovx_load" 4
171 (and (eq_attr "cpu" "pentiumpro,generic32")
172 (and (eq_attr "memory" "load")
173 (eq_attr "type" "imovx")))
176 ;; lea executes on port 0 with latency one and throughput 1.
177 (define_insn_reservation "ppro_lea" 1
178 (and (eq_attr "cpu" "pentiumpro,generic32")
179 (and (eq_attr "memory" "none")
180 (eq_attr "type" "lea")))
183 ;; Shift and rotate execute on port 0 with latency and throughput 1.
184 ;; The load and store units need to be reserved when memory operands
186 (define_insn_reservation "ppro_shift_rotate" 1
187 (and (eq_attr "cpu" "pentiumpro,generic32")
188 (and (eq_attr "memory" "none")
189 (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
;; Shifts and rotates with any kind of memory operand: latency 4,
;; decoded on decoder0, then p2 together with p0, then both store
;; ports.
(define_insn_reservation "ppro_shift_rotate_mem" 4
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "ishift,ishift1,rotate,rotate1")
            (eq_attr "memory" "!none")))
  "decoder0,p2+p0,p4+p3")
199 ;; The P6 has a sophisticated branch prediction mechanism to minimize
200 ;; latencies due to branching. In particular, it has a fast way to
201 ;; execute branches that are taken multiple times (such as in loops).
202 ;; Branches not taken suffer no penalty, and correctly predicted
203 ;; branches cost only one fetch cycle. Mispredicted branches are very
204 ;; costly: typically 15 cycles and possibly as many as 26 cycles.
206 ;; Unfortunately all this makes it quite difficult to properly model
207 ;; the latencies for the compiler. Here I've made the choice to be
208 ;; optimistic and assume branches are often predicted correctly, so
209 ;; they have latency 1, and the decoders are not blocked.
211 ;; In addition, the model assumes a branch always decodes to only 1 uop,
212 ;; which is not exactly true because there are a few instructions that
213 ;; decode to 2 uops or microcode. But this probably gives the best
214 ;; results because we can assume these instructions can decode on all
216 (define_insn_reservation "ppro_branch" 1
217 (and (eq_attr "cpu" "pentiumpro,generic32")
218 (and (eq_attr "memory" "none")
219 (eq_attr "type" "ibr")))
222 ;; ??? Indirect branches probably have worse latency than this.
223 (define_insn_reservation "ppro_indirect_branch" 6
224 (and (eq_attr "cpu" "pentiumpro,generic32")
225 (and (eq_attr "memory" "!none")
226 (eq_attr "type" "ibr")))
;; leave: latency 4, decoder0 only.  The first cycle uses the load port
;; together with p0 or p1, the second cycle uses p0 or p1 alone.
(define_insn_reservation "ppro_leave" 4
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (eq_attr "type" "leave"))
  "decoder0,p2+(p0|p1),(p0|p1)")
234 ;; imul has throughput one, but latency 4, and can only execute on port 0.
235 (define_insn_reservation "ppro_imul" 4
236 (and (eq_attr "cpu" "pentiumpro,generic32")
237 (and (eq_attr "memory" "none")
238 (eq_attr "type" "imul")))
241 (define_insn_reservation "ppro_imul_mem" 4
242 (and (eq_attr "cpu" "pentiumpro,generic32")
243 (and (eq_attr "memory" "!none")
244 (eq_attr "type" "imul")))
247 ;; div and idiv are very similar, so we model them the same.
248 ;; QI, HI, and SI have issue latency 12, 21, and 37, respectively.
;; These issue latencies are modelled via the ppro_idiv automaton.
;; QImode divide, register operands: latency 19.  The idiv unit is held
;; for 2 + 1 + 9 = 12 cycles, which is the QImode issue latency.
(define_insn_reservation "ppro_idiv_QI" 19
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "idiv")
            (and (eq_attr "mode" "QI")
                 (eq_attr "memory" "none"))))
  "decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*9")
;; QImode divide with a memory source: latency 19.  The first cycle
;; also takes the load port; idiv is held for 1 + 1 + 1 + 9 = 12 cycles.
(define_insn_reservation "ppro_idiv_QI_load" 19
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "idiv")
            (and (eq_attr "mode" "QI")
                 (eq_attr "memory" "load"))))
  "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*9")
;; HImode divide, register operands: latency 23.  The idiv unit is held
;; for 3 + 1 + 17 = 21 cycles, which is the HImode issue latency.
(define_insn_reservation "ppro_idiv_HI" 23
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "idiv")
            (and (eq_attr "mode" "HI")
                 (eq_attr "memory" "none"))))
  "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*17")
;; HImode divide with a memory source: latency 23.  The first cycle
;; also takes the load port; idiv is held for 1 + 1 + 1 + 18 = 21 cycles.
(define_insn_reservation "ppro_idiv_HI_load" 23
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "idiv")
            (and (eq_attr "mode" "HI")
                 (eq_attr "memory" "load"))))
  "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*18")
;; SImode divide, register operands: latency 39.  The idiv unit is held
;; for 3 + 1 + 33 = 37 cycles, which is the SImode issue latency.
(define_insn_reservation "ppro_idiv_SI" 39
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "idiv")
            (and (eq_attr "mode" "SI")
                 (eq_attr "memory" "none"))))
  "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*33")
;; SImode divide with a memory source: latency 39.  The first cycle
;; also takes the load port; idiv is held for 1 + 1 + 1 + 34 = 37 cycles.
(define_insn_reservation "ppro_idiv_SI_load" 39
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "idiv")
            (and (eq_attr "mode" "SI")
                 (eq_attr "memory" "load"))))
  "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*34")
292 ;; Floating point operations always execute on port 0.
293 ;; ??? where do these latencies come from? fadd has latency 3 and
294 ;; has throughput "1/cycle (align with FADD)". What do they
295 ;; mean and how can we model that?
296 (define_insn_reservation "ppro_fop" 3
297 (and (eq_attr "cpu" "pentiumpro,generic32")
298 (and (eq_attr "memory" "none,unknown")
299 (eq_attr "type" "fop")))
302 (define_insn_reservation "ppro_fop_load" 5
303 (and (eq_attr "cpu" "pentiumpro,generic32")
304 (and (eq_attr "memory" "load")
305 (eq_attr "type" "fop")))
;; fop writing its result to memory: latency 3.  p0 is busy for three
;; cycles and both store ports are taken in the last of them.
(define_insn_reservation "ppro_fop_store" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "fop")
            (eq_attr "memory" "store")))
  "decoder0,p0,p0,p0+p4+p3")
;; fop that both reads and writes memory: latency 5.  The load port is
;; paired with p0 in the first cycle, the store ports with p0 in the
;; second.
(define_insn_reservation "ppro_fop_both" 5
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "fop")
            (eq_attr "memory" "both")))
  "decoder0,p2+p0,p0+p4+p3")
320 (define_insn_reservation "ppro_fsgn" 1
321 (and (eq_attr "cpu" "pentiumpro,generic32")
322 (eq_attr "type" "fsgn"))
;; fistp: latency 5, decoder0 only.  Two cycles on p0 followed by one
;; cycle on both store ports.
(define_insn_reservation "ppro_fistp" 5
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (eq_attr "type" "fistp"))
  "decoder0,p0*2,p4+p3")
330 (define_insn_reservation "ppro_fcmov" 2
331 (and (eq_attr "cpu" "pentiumpro,generic32")
332 (eq_attr "type" "fcmov"))
335 (define_insn_reservation "ppro_fcmp" 1
336 (and (eq_attr "cpu" "pentiumpro,generic32")
337 (and (eq_attr "memory" "none")
338 (eq_attr "type" "fcmp")))
341 (define_insn_reservation "ppro_fcmp_load" 4
342 (and (eq_attr "cpu" "pentiumpro,generic32")
343 (and (eq_attr "memory" "load")
344 (eq_attr "type" "fcmp")))
347 (define_insn_reservation "ppro_fmov" 1
348 (and (eq_attr "cpu" "pentiumpro,generic32")
349 (and (eq_attr "memory" "none")
350 (eq_attr "type" "fmov")))
353 (define_insn_reservation "ppro_fmov_load" 1
354 (and (eq_attr "cpu" "pentiumpro,generic32")
355 (and (eq_attr "memory" "load")
356 (and (eq_attr "mode" "!XF")
357 (eq_attr "type" "fmov"))))
;; XFmode fmov from memory: latency 3.  The load port and p0 are taken
;; together for two consecutive cycles.
(define_insn_reservation "ppro_fmov_XF_load" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "fmov")
            (and (eq_attr "mode" "XF")
                 (eq_attr "memory" "load"))))
  "decoder0,(p2+p0)*2")
367 (define_insn_reservation "ppro_fmov_store" 1
368 (and (eq_attr "cpu" "pentiumpro,generic32")
369 (and (eq_attr "memory" "store")
370 (and (eq_attr "mode" "!XF")
371 (eq_attr "type" "fmov"))))
;; XFmode fmov to memory: latency 3.  p0 is paired with p4 in the first
;; cycle and with p3 in the second.
(define_insn_reservation "ppro_fmov_XF_store" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "fmov")
            (and (eq_attr "mode" "XF")
                 (eq_attr "memory" "store"))))
  "decoder0,(p0+p4),(p0+p3)")
381 ;; fmul executes on port 0 with latency 5. It has issue latency 2,
382 ;; but we don't model this.
383 (define_insn_reservation "ppro_fmul" 5
384 (and (eq_attr "cpu" "pentiumpro,generic32")
385 (and (eq_attr "memory" "none")
386 (eq_attr "type" "fmul")))
389 (define_insn_reservation "ppro_fmul_load" 6
390 (and (eq_attr "cpu" "pentiumpro,generic32")
391 (and (eq_attr "memory" "load")
392 (eq_attr "type" "fmul")))
395 ;; fdiv latencies depend on the mode of the operands. XFmode gives
396 ;; a latency of 38 cycles, DFmode gives 32, and SFmode gives latency 18.
397 ;; Division by a power of 2 takes only 9 cycles, but we cannot model
398 ;; that. Throughput is equal to latency - 1, which we model using the
;; ppro_fdiv automaton.
;; SFmode fdiv/fpspc, register operands: latency 18, any decoder.  The
;; fdiv unit is held for 1 + 16 = 17 cycles, i.e. latency - 1.
(define_insn_reservation "ppro_fdiv_SF" 18
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "fdiv,fpspc")
            (and (eq_attr "mode" "SF")
                 (eq_attr "memory" "none"))))
  "decodern,p0+fdiv,fdiv*16")
;; SFmode fdiv/fpspc with a memory source: latency 19, decoder0 only.
;; The load port is taken in the first cycle; fdiv is held for 17 cycles.
(define_insn_reservation "ppro_fdiv_SF_load" 19
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "fdiv,fpspc")
            (and (eq_attr "mode" "SF")
                 (eq_attr "memory" "load"))))
  "decoder0,p2+p0+fdiv,fdiv*16")
;; DFmode fdiv/fpspc, register operands: latency 32, any decoder.  The
;; fdiv unit is held for 1 + 30 = 31 cycles, i.e. latency - 1.
(define_insn_reservation "ppro_fdiv_DF" 32
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "fdiv,fpspc")
            (and (eq_attr "mode" "DF")
                 (eq_attr "memory" "none"))))
  "decodern,p0+fdiv,fdiv*30")
;; DFmode fdiv/fpspc with a memory source: latency 33, decoder0 only.
;; The load port is taken in the first cycle; fdiv is held for 31 cycles.
(define_insn_reservation "ppro_fdiv_DF_load" 33
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "fdiv,fpspc")
            (and (eq_attr "mode" "DF")
                 (eq_attr "memory" "load"))))
  "decoder0,p2+p0+fdiv,fdiv*30")
;; XFmode fdiv/fpspc, register operands: latency 38, any decoder.  The
;; fdiv unit is held for 1 + 36 = 37 cycles, i.e. latency - 1.
(define_insn_reservation "ppro_fdiv_XF" 38
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "fdiv,fpspc")
            (and (eq_attr "mode" "XF")
                 (eq_attr "memory" "none"))))
  "decodern,p0+fdiv,fdiv*36")
;; XFmode fdiv/fpspc with a memory source: latency 39, decoder0 only.
;; The load port is taken in the first cycle; fdiv is held for 37 cycles.
(define_insn_reservation "ppro_fdiv_XF_load" 39
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "fdiv,fpspc")
            (and (eq_attr "mode" "XF")
                 (eq_attr "memory" "load"))))
  "decoder0,p2+p0+fdiv,fdiv*36")
442 ;; MMX instructions can execute on either port 0 or port 1 with a
443 ;; throughput of 1/cycle.
444 ;; on port 0: - ALU (latency 1)
445 ;; - Multiplier Unit (latency 3)
446 ;; on port 1: - ALU (latency 1)
447 ;; - Shift Unit (latency 1)
449 ;; MMX instructions are either of the type reg-reg, or read-modify, and
450 ;; except for mmxshft and mmxmul they can execute on port 0 or port 1,
451 ;; so they behave as "simple" instructions that need no special modelling.
452 ;; We only have to model mmxshft and mmxmul.
453 (define_insn_reservation "ppro_mmx_shft" 1
454 (and (eq_attr "cpu" "pentiumpro,generic32")
455 (and (eq_attr "memory" "none")
456 (eq_attr "type" "mmxshft")))
;; mmxshft with a memory source operand, latency 2.  The condition must
;; test for "load": with "none" it is identical to the condition of
;; ppro_mmx_shft, so this reservation could never be selected.
(define_insn_reservation "ppro_mmx_shft_load" 2
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "mmxshft")))
465 (define_insn_reservation "ppro_mmx_mul" 3
466 (and (eq_attr "cpu" "pentiumpro,generic32")
467 (and (eq_attr "memory" "none")
468 (eq_attr "type" "mmxmul")))
;; mmxmul with a memory source operand, latency 3.  The condition must
;; test for "load": with "none" it is identical to the condition of
;; ppro_mmx_mul, so this reservation could never be selected.
(define_insn_reservation "ppro_mmx_mul_load" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "mmxmul")))
477 (define_insn_reservation "ppro_sse_mmxcvt" 4
478 (and (eq_attr "cpu" "pentiumpro,generic32")
479 (and (eq_attr "mode" "DI")
480 (eq_attr "type" "mmxcvt")))
483 ;; FIXME: These are Pentium III only, but we cannot tell here if
484 ;; we're generating code for PentiumPro/Pentium II or Pentium III
485 ;; (define_insn_reservation "ppro_sse_mmxshft" 2
486 ;; (and (eq_attr "cpu" "pentiumpro,generic32")
487 ;; (and (eq_attr "mode" "DI")
488 ;; (eq_attr "type" "mmxshft")))
491 ;; SSE is very complicated, and takes a bit more effort.
492 ;; ??? I assumed that all SSE instructions decode on decoder0,
493 ;; but is this correct?
495 ;; The sfence instruction.
496 (define_insn_reservation "ppro_sse_sfence" 3
497 (and (eq_attr "cpu" "pentiumpro,generic32")
498 (and (eq_attr "memory" "unknown")
499 (eq_attr "type" "sse")))
502 ;; FIXME: This reservation is all wrong when we're scheduling sqrtss.
503 (define_insn_reservation "ppro_sse_SF" 3
504 (and (eq_attr "cpu" "pentiumpro,generic32")
505 (and (eq_attr "mode" "SF")
506 (eq_attr "type" "sse")))
509 (define_insn_reservation "ppro_sse_add_SF" 3
510 (and (eq_attr "cpu" "pentiumpro,generic32")
511 (and (eq_attr "memory" "none")
512 (and (eq_attr "mode" "SF")
513 (eq_attr "type" "sseadd"))))
516 (define_insn_reservation "ppro_sse_add_SF_load" 3
517 (and (eq_attr "cpu" "pentiumpro,generic32")
518 (and (eq_attr "memory" "load")
519 (and (eq_attr "mode" "SF")
520 (eq_attr "type" "sseadd"))))
523 (define_insn_reservation "ppro_sse_cmp_SF" 3
524 (and (eq_attr "cpu" "pentiumpro,generic32")
525 (and (eq_attr "memory" "none")
526 (and (eq_attr "mode" "SF")
527 (eq_attr "type" "ssecmp"))))
530 (define_insn_reservation "ppro_sse_cmp_SF_load" 3
531 (and (eq_attr "cpu" "pentiumpro,generic32")
532 (and (eq_attr "memory" "load")
533 (and (eq_attr "mode" "SF")
534 (eq_attr "type" "ssecmp"))))
537 (define_insn_reservation "ppro_sse_comi_SF" 1
538 (and (eq_attr "cpu" "pentiumpro,generic32")
539 (and (eq_attr "memory" "none")
540 (and (eq_attr "mode" "SF")
541 (eq_attr "type" "ssecomi"))))
544 (define_insn_reservation "ppro_sse_comi_SF_load" 1
545 (and (eq_attr "cpu" "pentiumpro,generic32")
546 (and (eq_attr "memory" "load")
547 (and (eq_attr "mode" "SF")
548 (eq_attr "type" "ssecomi"))))
551 (define_insn_reservation "ppro_sse_mul_SF" 4
552 (and (eq_attr "cpu" "pentiumpro,generic32")
553 (and (eq_attr "memory" "none")
554 (and (eq_attr "mode" "SF")
555 (eq_attr "type" "ssemul"))))
558 (define_insn_reservation "ppro_sse_mul_SF_load" 4
559 (and (eq_attr "cpu" "pentiumpro,generic32")
560 (and (eq_attr "memory" "load")
561 (and (eq_attr "mode" "SF")
562 (eq_attr "type" "ssemul"))))
565 ;; FIXME: ssediv doesn't close p0 for 17 cycles, surely???
566 (define_insn_reservation "ppro_sse_div_SF" 18
567 (and (eq_attr "cpu" "pentiumpro,generic32")
568 (and (eq_attr "memory" "none")
569 (and (eq_attr "mode" "SF")
570 (eq_attr "type" "ssediv"))))
;; SFmode ssediv with a memory source operand: latency 18, decoder0
;; only.  The load port is paired with p0 in the first cycle and p0
;; stays busy for 16 more cycles.  The condition tests for "load"; with
;; "none" it would duplicate ppro_sse_div_SF and never be selected.
(define_insn_reservation "ppro_sse_div_SF_load" 18
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssediv"))))
  "decoder0,(p2+p0),p0*16")
;; SFmode sseicvt: latency 4, decoder0 only.  The load port and p1 are
;; taken together for two cycles; the memory attribute is not tested.
(define_insn_reservation "ppro_sse_icvt_SF" 4
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "sseicvt")
            (eq_attr "mode" "SF")))
  "decoder0,(p2+p1)*2")
586 (define_insn_reservation "ppro_sse_icvt_SI" 3
587 (and (eq_attr "cpu" "pentiumpro,generic32")
588 (and (eq_attr "mode" "SI")
589 (eq_attr "type" "sseicvt")))
592 (define_insn_reservation "ppro_sse_mov_SF" 3
593 (and (eq_attr "cpu" "pentiumpro,generic32")
594 (and (eq_attr "memory" "none")
595 (and (eq_attr "mode" "SF")
596 (eq_attr "type" "ssemov"))))
;; SFmode ssemov from memory: latency 3, decoder0 only.  One cycle on
;; the load port together with p0 or p1.
(define_insn_reservation "ppro_sse_mov_SF_load" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "ssemov")
            (and (eq_attr "mode" "SF")
                 (eq_attr "memory" "load"))))
  "decoder0,p2+(p0|p1)")
606 (define_insn_reservation "ppro_sse_mov_SF_store" 3
607 (and (eq_attr "cpu" "pentiumpro,generic32")
608 (and (eq_attr "memory" "store")
609 (and (eq_attr "mode" "SF")
610 (eq_attr "type" "ssemov"))))
613 (define_insn_reservation "ppro_sse_V4SF" 4
614 (and (eq_attr "cpu" "pentiumpro,generic32")
615 (and (eq_attr "mode" "V4SF")
616 (eq_attr "type" "sse")))
619 (define_insn_reservation "ppro_sse_add_V4SF" 3
620 (and (eq_attr "cpu" "pentiumpro,generic32")
621 (and (eq_attr "memory" "none")
622 (and (eq_attr "mode" "V4SF")
623 (eq_attr "type" "sseadd"))))
;; V4SFmode sseadd with a memory source: latency 3, decoder0 only.  The
;; load port and p1 are taken together for two cycles.
(define_insn_reservation "ppro_sse_add_V4SF_load" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "sseadd")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "memory" "load"))))
  "decoder0,(p2+p1)*2")
633 (define_insn_reservation "ppro_sse_cmp_V4SF" 3
634 (and (eq_attr "cpu" "pentiumpro,generic32")
635 (and (eq_attr "memory" "none")
636 (and (eq_attr "mode" "V4SF")
637 (eq_attr "type" "ssecmp"))))
;; V4SFmode ssecmp with a memory source: latency 3, decoder0 only.  The
;; load port and p1 are taken together for two cycles.
(define_insn_reservation "ppro_sse_cmp_V4SF_load" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "ssecmp")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "memory" "load"))))
  "decoder0,(p2+p1)*2")
647 (define_insn_reservation "ppro_sse_cvt_V4SF" 3
648 (and (eq_attr "cpu" "pentiumpro,generic32")
649 (and (eq_attr "memory" "none,unknown")
650 (and (eq_attr "mode" "V4SF")
651 (eq_attr "type" "ssecvt"))))
;; V4SFmode ssecvt with any memory attribute other than "none" or
;; "unknown", latency 4.  This complements ppro_sse_cvt_V4SF, so the
;; type tested must be "ssecvt"; testing "ssecmp" would overlap with
;; the ssecmp reservations and leave these ssecvt insns unmodelled.
(define_insn_reservation "ppro_sse_cvt_V4SF_other" 4
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "!none,unknown")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssecvt"))))
661 (define_insn_reservation "ppro_sse_mul_V4SF" 5
662 (and (eq_attr "cpu" "pentiumpro,generic32")
663 (and (eq_attr "memory" "none")
664 (and (eq_attr "mode" "V4SF")
665 (eq_attr "type" "ssemul"))))
;; V4SFmode ssemul with a memory source: latency 5, decoder0 only.  The
;; load port and p0 are taken together for two cycles.
(define_insn_reservation "ppro_sse_mul_V4SF_load" 5
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "ssemul")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "memory" "load"))))
  "decoder0,(p2+p0)*2")
675 ;; FIXME: p0 really closed this long???
676 (define_insn_reservation "ppro_sse_div_V4SF" 48
677 (and (eq_attr "cpu" "pentiumpro,generic32")
678 (and (eq_attr "memory" "none")
679 (and (eq_attr "mode" "V4SF")
680 (eq_attr "type" "ssediv"))))
;; V4SFmode ssediv with a memory source: latency 48, decoder0 only.
;; The load port and p0 are taken together for two cycles, then p0
;; alone stays reserved for another 32 cycles.
(define_insn_reservation "ppro_sse_div_V4SF_load" 48
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "ssediv")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "memory" "load"))))
  "decoder0,(p2+p0)*2,p0*32")
690 (define_insn_reservation "ppro_sse_log_V4SF" 2
691 (and (eq_attr "cpu" "pentiumpro,generic32")
692 (and (eq_attr "memory" "none")
693 (and (eq_attr "mode" "V4SF")
694 (eq_attr "type" "sselog,sselog1"))))
697 (define_insn_reservation "ppro_sse_log_V4SF_load" 2
698 (and (eq_attr "cpu" "pentiumpro,generic32")
699 (and (eq_attr "memory" "load")
700 (and (eq_attr "mode" "V4SF")
701 (eq_attr "type" "sselog,sselog1"))))
;; V4SFmode register-to-register ssemov: latency 1, decoder0 only.
;; Either p0 or p1 is reserved for two cycles.
(define_insn_reservation "ppro_sse_mov_V4SF" 1
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "ssemov")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "memory" "none"))))
  "decoder0,(p0|p1)*2")
711 (define_insn_reservation "ppro_sse_mov_V4SF_load" 2
712 (and (eq_attr "cpu" "pentiumpro,generic32")
713 (and (eq_attr "memory" "load")
714 (and (eq_attr "mode" "V4SF")
715 (eq_attr "type" "ssemov"))))
;; V4SFmode ssemov to memory: latency 3, decoder0 only.  Both store
;; ports are reserved for two cycles.
(define_insn_reservation "ppro_sse_mov_V4SF_store" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "ssemov")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "memory" "store"))))
  "decoder0,(p4+p3)*2")
725 ;; All other instructions are modelled as simple instructions.
726 ;; We have already modelled all i387 floating point instructions, so all
727 ;; other instructions execute on either port 0 or port 1. This includes
728 ;; the ALU units, and the MMX units.
730 ;; reg-reg instructions produce 1 uop so they can be decoded on any of
731 ;; the three decoders.
732 (define_insn_reservation "ppro_insn" 1
733 (and (eq_attr "cpu" "pentiumpro,generic32")
734 (and (eq_attr "memory" "none,unknown")
735 (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
;; read-modify and register-memory instructions have two or three uops,
739 ;; so they have to be decoded on decoder0.
;; Simple instructions with a memory source: latency 3, decoder0 only.
;; One cycle on the load port together with p0 or p1.
(define_insn_reservation "ppro_insn_load" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")
            (eq_attr "memory" "load")))
  "decoder0,p2+(p0|p1)")
;; Simple instructions with a memory destination: latency 1, decoder0
;; only.  One cycle on p0 or p1, then one cycle on both store ports.
(define_insn_reservation "ppro_insn_store" 1
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")
            (eq_attr "memory" "store")))
  "decoder0,(p0|p1),p4+p3")
752 ;; read-modify-store instructions produce 4 uops so they have to be
753 ;; decoded on decoder0 as well.
;; Simple instructions that read and write memory: latency 4, decoder0
;; only.  The load port is paired with p0 or p1 in the first cycle and
;; both store ports are taken in the second.
(define_insn_reservation "ppro_insn_both" 4
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")
            (eq_attr "memory" "both")))
  "decoder0,p2+(p0|p1),p4+p3")