gcc/config/rs6000/cell.md

   1 ;; Scheduling description for cell processor.
   2 ;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006
   3 ;; Free Software Foundation, Inc.
   4 ;; Contributed by Sony Computer Entertainment, Inc.,
   5
   6
   7 ;; This file is free software; you can redistribute it and/or modify it under
   8 ;; the terms of the GNU General Public License as published by the Free
   9 ;; Software Foundation; either version 2 of the License, or (at your option)
  10 ;; any later version.
  11
  12 ;; This file is distributed in the hope that it will be useful, but WITHOUT
  13 ;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 ;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 ;; for more details.
  16
  17 ;; You should have received a copy of the GNU General Public License
  18 ;; along with this file; see the file COPYING.  If not, write to the Free
  19 ;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
  20 ;; 02110-1301, USA.
  21
  22 ;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf)
  23
  24 ;; BE Architecture *DD3.0 and DD3.1*
  25 ;; This file simulate PPU processor unit backend of pipeline, maualP24.
  26 ;; manual P27, stall and flush points
  27 ;; IU, XU, VSU, dispatcher decodes and dispatch 2 insns per cycle in program
  28 ;;  order, the grouped adress are aligned by 8
  29 ;; This file only simulate one thread situation
  30 ;; XU executes all fixed point insns(3 units, a simple alu, a complex unit,
  31 ;;   and load/store unit)
  32 ;; VSU executes all scalar floating points insn(a float unit),
  33 ;;   VMX insns(VMX unit, 4 sub units, simple, permute, complex, floating point)
  34
  35 ;; Dual issue combination
  36
  37 ;;      FXU     LSU     BR              VMX                    VMX
  38 ;;                             (sx,cx,vsu_fp,fp_arith)    (perm,vsu_ls,fp_ls)
  39 ;;FXU   X
  40 ;;LSU           X                       X                       X
  41 ;;BR                    X
  42 ;;VMX(sx,cx,vsu_fp,fp_arth)             X
  43 ;;VMX(perm,vsu_ls, fp_ls)                                       X
  44 ;;    X are illegal combination.
  45
  46 ;; Dual issue exceptions:
  47 ;;(1) nop-pipelined FXU instr in slot 0
  48 ;;(2) non-pipelined FPU inst in slot 0
  49 ;; CSI instr(contex-synchronizing insn)
  50 ;; Microcode insn
  51
  52 ;; BRU unit: bru(none register stall), bru_cr(cr register stall)
  53 ;; VSU unit: vus(vmx simple), vup(vmx permute), vuc(vmx complex),
  54 ;;  vuf(vmx float), fpu(floats). fpu_div is hypothetical, it is for
  55 ;;  nonpipelined simulation
  56 ;; micr insns will stall at least 7 cycles to get the first instr from ROM,
  57 ;;  micro instructions are not dual issued.
  58
  59 ;; slot0 is older than slot1
  60 ;; non-pipelined insn need to be in slot1 to avoid 1cycle stall
  61
  62 ;; There different stall point
  63 ;; IB2, only stall one thread if stall here, so try to stall here as much as
  64 ;; we can
  65 ;; condition(1) insert nop, OR and ORI instruction form
  66 ;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or
  67 ;;   CR0-access while stdcx, or stwcx
  68 ;; IS2 stall ;; Page91 for details
  69 ;; VQ8 stall
  70 ;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to
  71 ;;  the vsu issue queue
  72
  73 ;;(define_automaton "cellxu")   ; disabled single-automaton variant
  74
  75 ;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu")
  76
  77 ;; ndfa.  One automaton each for the XU units, VSU units, BRU and issue slots.
  78 (define_automaton "cellxu,cellvsu,cellbru,cell_mis")
  79
  80 (define_cpu_unit "fxu_cell,lsu_cell" "cellxu")      ; fixed point, load/store
  81 (define_cpu_unit "bru_cell" "cellbru")              ; branch unit
  82 (define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu")   ; vsu1: FP/VMX arith, vsu2: perm/load-store
  83
  84 (define_cpu_unit "slot0,slot1" "cell_mis")          ; the two issue slots
  85
  86 (absence_set "slot0" "slot1")   ; slot0 may be reserved only if slot1 is not
  87 ;; nonpipeline claims every XU and VSU unit in one cycle; slot01 is either slot.
  88 (define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell")
  89 (define_reservation "slot01" "slot0|slot1")
  90
  91
  92 ;; Load/store
  93 ;; lmw, lswi, lswx are only generated when optimizing for space (MC);
  94 ;;   these instrs are not simulated
  95 (define_insn_reservation "cell-load" 2             ; latency 2
  96   (and (eq_attr "type" "load")
  97        (eq_attr "cpu" "cell"))
  98   "slot01,lsu_cell")                                ; cycle 1: a slot; cycle 2: LSU
  99
 100 ;; ldux, ldu, lbzux, lbzu: hardware breaks these down into two instrs,
 101 ;;  if with 32bytes alignment, CMC
 102 (define_insn_reservation "cell-load-ux" 2          ; latency 2
 103   (and (eq_attr "type" "load_ux,load_u")
 104        (eq_attr "cpu" "cell"))
 105   "slot01,fxu_cell+lsu_cell")                       ; cycle 2 takes FXU and LSU together
 106
 107 ;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux: MC, latency unknown
 108 ;;   11/7, 11/8, 11/12   (modelled below with latency 2)
 109 (define_insn_reservation "cell-load-ext" 2
 110   (and (eq_attr "type" "load_ext,load_ext_u,load_ext_ux")
 111        (eq_attr "cpu" "cell"))
 112   "slot01,fxu_cell+lsu_cell")                       ; same unit usage as cell-load-ux
 113
 114 ;; FP loads (lfs, lfsx, lfd, lfdx): latency 1 cycle
 115 (define_insn_reservation "cell-fpload" 1
 116   (and (eq_attr "type" "fpload")
 117        (eq_attr "cpu" "cell"))
 118   "vsu2_cell+lsu_cell+slot01")   ; VSU2, LSU and an issue slot, all in one cycle
 119
 120 ;; Update-form FP loads (lfsu, lfsux, lfdu, lfdux): 1 cycle (fpr), 2 cycles (gpr)
 121 (define_insn_reservation "cell-fpload-update" 1
 122   (and (eq_attr "type" "fpload_u,fpload_ux")        ; plain "fpload" belongs to cell-fpload
 123        (eq_attr "cpu" "cell"))
 124   "fxu_cell+vsu2_cell+lsu_cell+slot01")             ; FXU, VSU2, LSU and a slot in one cycle
 125
 126 (define_insn_reservation "cell-vecload" 2          ; vector load, latency 2
 127   (and (eq_attr "type" "vecload")
 128        (eq_attr "cpu" "cell"))
 129   "slot01,vsu2_cell+lsu_cell")                      ; cycle 1: a slot; cycle 2: VSU2 and LSU
 130
 131 ;; Stores.  st?, stw (MC)
 132 (define_insn_reservation "cell-store" 1            ; latency 1
 133   (and (eq_attr "type" "store")
 134        (eq_attr "cpu" "cell"))
 135   "lsu_cell+slot01")                                ; LSU and a slot in the same cycle
 136
 137 ;; stdux, stdu (hardware breaks these into a store and an add); 2 for update reg
 138 (define_insn_reservation "cell-store-update" 1
 139   (and (eq_attr "type" "store_ux,store_u")
 140        (eq_attr "cpu" "cell"))
 141   "fxu_cell+lsu_cell+slot01")                       ; also takes the FXU
 142 ;; FP store: takes VSU2 as well as the LSU.
 143 (define_insn_reservation "cell-fpstore" 1
 144   (and (eq_attr "type" "fpstore")
 145        (eq_attr "cpu" "cell"))
 146   "vsu2_cell+lsu_cell+slot01")
 147 ;; Update-form FP store: VSU2, FXU and LSU together.
 148 (define_insn_reservation "cell-fpstore-update" 1
 149   (and (eq_attr "type" "fpstore_ux,fpstore_u")
 150        (eq_attr "cpu" "cell"))
 151   "vsu2_cell+fxu_cell+lsu_cell+slot01")
 152 ;; Vector store: same unit usage as cell-fpstore.
 153 (define_insn_reservation "cell-vecstore" 1
 154   (and (eq_attr "type" "vecstore")
 155        (eq_attr "cpu" "cell"))
 156   "vsu2_cell+lsu_cell+slot01")
 157
 158 ;; Integer latency is 2 cycles
 159 (define_insn_reservation "cell-integer" 2
 160   (and (eq_attr "type" "integer,insert_dword,shift,trap,\
 161                         var_shift_rotate,cntlz,exts")
 162        (eq_attr "cpu" "cell"))
 163   "slot01,fxu_cell")                                ; cycle 1: a slot; cycle 2: FXU
 164
 165 ;; Two-integer (type "two") latency is 4 cycles
 166 (define_insn_reservation "cell-two" 4
 167   (and (eq_attr "type" "two")
 168        (eq_attr "cpu" "cell"))
 169   "slot01,fxu_cell,fxu_cell*2")                     ; FXU held for 3 cycles after issue
 170
 171 ;; Three-integer (type "three") latency is 6 cycles
 172 (define_insn_reservation "cell-three" 6
 173   (and (eq_attr "type" "three")
 174        (eq_attr "cpu" "cell"))
 175   "slot01,fxu_cell,fxu_cell*4")                     ; FXU held for 5 cycles after issue
 176
 177 ;; rlwimi, alters cr0
 178 (define_insn_reservation "cell-insert" 2
 179   (and (eq_attr "type" "insert_word")
 180        (eq_attr "cpu" "cell"))
 181  "slot01,fxu_cell")                                 ; same shape as cell-integer
 182
 183 ;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0
 184 (define_insn_reservation "cell-cmp" 1              ; latency 1
 185   (and (eq_attr "type" "cmp")
 186        (eq_attr "cpu" "cell"))
 187   "fxu_cell+slot01")                                ; FXU and a slot in the same cycle
 188
 189 ;; add, addo, sub, subo, alter cr0, rldcli, rlwinm  (non-microcoded forms)
 190 (define_insn_reservation "cell-fast-cmp" 2
 191   (and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\
 192                             var_delayed_compare")
 193             (eq_attr "cpu" "cell"))
 194         (eq_attr "cell_micro" "not"))   ; NOTE(review): a cell_micro value other than "not"/"always" matches neither compare reservation - confirm intended
 195   "slot01,fxu_cell")
 196 ;; Microcoded form of the same types: both slots, then FXU for 8 cycles; latency 9.
 197 (define_insn_reservation "cell-cmp-microcoded" 9
 198   (and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\
 199                             var_delayed_compare")
 200             (eq_attr "cpu" "cell"))
 201         (eq_attr "cell_micro" "always"))
 202   "slot0+slot1,fxu_cell,fxu_cell*7")
 203
 204 ;; mulld: issued in slot1 only, then all XU/VSU units are held for 14 cycles
 205 (define_insn_reservation "cell-lmul" 15
 206   (and (eq_attr "type" "lmul")
 207        (eq_attr "cpu" "cell"))
 208   "slot1,nonpipeline,nonpipeline*13")
 209
 210 ;; mulld. is microcoded: takes both slots, then all units for 21 cycles
 211 (define_insn_reservation "cell-lmul-cmp" 22
 212   (and (eq_attr "type" "lmul_compare")
 213        (eq_attr "cpu" "cell"))
 214   "slot0+slot1,nonpipeline,nonpipeline*20")
 215
 216 ;; mulli, 6 cycles
 217 (define_insn_reservation "cell-imul23" 6
 218   (and (eq_attr "type" "imul2,imul3")
 219        (eq_attr "cpu" "cell"))
 220   "slot1,nonpipeline,nonpipeline*4")
 221
 222 ;; mullw, 9 cycles
 223 (define_insn_reservation "cell-imul" 9
 224   (and (eq_attr "type" "imul")
 225        (eq_attr "cpu" "cell"))
 226   "slot1,nonpipeline,nonpipeline*7")
 227
 228 ;; divide: idiv latency 32, ldiv latency 64; all units held for the duration
 229 (define_insn_reservation "cell-idiv" 32
 230   (and (eq_attr "type" "idiv")
 231        (eq_attr "cpu" "cell"))
 232   "slot1,nonpipeline,nonpipeline*30")
 233
 234 (define_insn_reservation "cell-ldiv" 64
 235   (and (eq_attr "type" "ldiv")
 236        (eq_attr "cpu" "cell"))
 237   "slot1,nonpipeline,nonpipeline*62")
 238
 239 ;; mflr and mfctr are pipelined
 240 (define_insn_reservation "cell-mfjmpr" 1           ; latency 1
 241   (and (eq_attr "type" "mfjmpr")
 242        (eq_attr "cpu" "cell"))
 243   "slot01+bru_cell")                                ; a slot and the BRU in one cycle
 244
 245 ;; mtlr and mtctr,
 246 ;; mtspr fully pipelined
 247 (define_insn_reservation "cell-mtjmpr" 1           ; latency 1
 248  (and (eq_attr "type" "mtjmpr")
 249        (eq_attr "cpu" "cell"))
 250   "bru_cell+slot01")                                ; same units as cell-mfjmpr
 251
 252 ;; Branches
 253 ;; b, ba, bl, bla: unconditional branch always predicts correctly, n/a latency
 254 ;; bcctr, bcctrl: latency 2, actually adjusted by the back end to 4
 255 (define_insn_reservation "cell-branch" 1
 256   (and (eq_attr "type" "branch")
 257        (eq_attr "cpu" "cell"))
 258   "bru_cell+slot1")                                 ; BRU, and slot1 specifically
 259 ;; Register-indirect jumps (type "jmpreg"): same reservation as cell-branch.
 260 (define_insn_reservation "cell-branchreg" 1
 261   (and (eq_attr "type" "jmpreg")
 262        (eq_attr "cpu" "cell"))
 263   "bru_cell+slot1")
 264
 265 ;; CR hazard
 266 ;; page 90, special cases for CR hazard: only one instr can access cr per cycle
 267 ;; if an insn reads CR following a stwcx, the pipeline stalls till stwcx finishes
 268 (define_insn_reservation "cell-crlogical" 1
 269   (and (eq_attr "type" "cr_logical,delayed_cr")
 270        (eq_attr "cpu" "cell"))
 271   "bru_cell+slot01")                                ; CR logical ops reserve the BRU
 272
 273 ;; mfcrf and mfcr take about 34 cycles and are nonpipelined
 274 (define_insn_reservation "cell-mfcr" 34
 275   (and (eq_attr "type" "mfcrf,mfcr")
 276        (eq_attr "cpu" "cell"))
 277    "slot1,nonpipeline,nonpipeline*32")              ; slot1, then all XU/VSU units for 33 cycles
 278
 279 ;; mtcrf (1 field)
 280 (define_insn_reservation "cell-mtcrf" 1
 281   (and (eq_attr "type" "mtcr")
 282        (eq_attr "cpu" "cell"))
 283   "fxu_cell+slot01")                                ; FXU and a slot in one cycle
 284
 285 ; Basic FP latency is 10 cycles, throughput is 1/cycle
 286 (define_insn_reservation "cell-fp" 10
 287   (and (eq_attr "type" "fp,dmul")
 288        (eq_attr "cpu" "cell"))
 289   "slot01,vsu1_cell,vsu1_cell*8")   ; NOTE(review): holds vsu1_cell for 9 cycles, unlike the stated 1/cycle throughput - confirm
 290 ;; FP compare: latency 1; VSU1 and a slot in the same cycle.
 291 (define_insn_reservation "cell-fpcompare" 1
 292   (and (eq_attr "type" "fpcompare")
 293        (eq_attr "cpu" "cell"))
 294   "vsu1_cell+slot01")
 295
 296 ;; sdiv throughput 1/74, not pipelined but only in the FPU
 297 (define_insn_reservation "cell-sdiv" 74
 298   (and (eq_attr "type" "sdiv,ddiv")
 299        (eq_attr "cpu" "cell"))
 300   "slot1,nonpipeline,nonpipeline*72")   ; NOTE(review): nonpipeline also claims fxu_cell and lsu_cell, not only FPU units - confirm
 301
 302 ;; fsqrt throughput 1/84, not pipelined but only in the FPU
 303 (define_insn_reservation "cell-sqrt" 84
 304   (and (eq_attr "type" "ssqrt,dsqrt")
 305        (eq_attr "cpu" "cell"))
 306   "slot1,nonpipeline,nonpipeline*82")   ; same unit usage pattern as cell-sdiv
 307
 308 ; VMX.  All but vecperm reserve vsu1_cell; vecperm reserves vsu2_cell.
 309 (define_insn_reservation "cell-vecsimple" 4        ; latency 4
 310   (and (eq_attr "type" "vecsimple")
 311        (eq_attr "cpu" "cell"))
 312   "slot01,vsu1_cell,vsu1_cell*2")
 313
 314 ;; mult, div, madd
 315 (define_insn_reservation "cell-veccomplex" 10      ; latency 10
 316   (and (eq_attr "type" "veccomplex")
 317        (eq_attr "cpu" "cell"))
 318   "slot01,vsu1_cell,vsu1_cell*8")
 319
 320 ;; TODO: add support for recording instructions
 321 (define_insn_reservation "cell-veccmp" 4           ; latency 4
 322   (and (eq_attr "type" "veccmp")
 323        (eq_attr "cpu" "cell"))
 324   "slot01,vsu1_cell,vsu1_cell*2")
 325
 326 (define_insn_reservation "cell-vecfloat" 12        ; latency 12
 327   (and (eq_attr "type" "vecfloat")
 328        (eq_attr "cpu" "cell"))
 329   "slot01,vsu1_cell,vsu1_cell*10")
 330
 331 (define_insn_reservation "cell-vecperm" 4          ; latency 4, on vsu2_cell
 332   (and (eq_attr "type" "vecperm")
 333        (eq_attr "cpu" "cell"))
 334   "slot01,vsu2_cell,vsu2_cell*2")
 335
 336 ;; New for 4.2, syncs
 337 ;; All four types share one model: latency 11, a slot, then the LSU for 10 cycles.
 338 (define_insn_reservation "cell-sync" 11
 339   (and (eq_attr "type" "sync")
 340        (eq_attr "cpu" "cell"))
 341   "slot01,lsu_cell,lsu_cell*9")
 342
 343 (define_insn_reservation "cell-isync" 11
 344   (and (eq_attr "type" "isync")
 345        (eq_attr "cpu" "cell"))
 346   "slot01,lsu_cell,lsu_cell*9")
 347
 348 (define_insn_reservation "cell-load_l" 11
 349   (and (eq_attr "type" "load_l")
 350        (eq_attr "cpu" "cell"))
 351   "slot01,lsu_cell,lsu_cell*9")
 352
 353 (define_insn_reservation "cell-store_c" 11
 354   (and (eq_attr "type" "store_c")
 355        (eq_attr "cpu" "cell"))
 356   "slot01,lsu_cell,lsu_cell*9")
 357
 358 ;; RAW register dependency
 359
 360 ;; addi r3, r3, 1
 361 ;; lw r4,offset(r3)
 362 ;; there is a 5 cycle delay for r3 bypassing
 363 ;; there is a 5 cycle delay for a dependent load after a load
 364 (define_bypass 5 "cell-integer" "cell-load")
 365 (define_bypass 5 "cell-integer" "cell-load-ext")
 366 (define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext")
 367
 368 ;; there is a 6 cycle delay after a fp compare until you can use the cr.
 369 (define_bypass 6 "cell-fpcompare" "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical")
 370
 371 ;; VXU float RAW
 372 (define_bypass 11 "cell-vecfloat" "cell-vecfloat")
 373
 374 ;; VXU and FPU
 375 (define_bypass 6 "cell-veccomplex" "cell-vecsimple")
 376 ;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg")  ; disabled; the reservation defined above is named cell-veccmp
 377 (define_bypass 3 "cell-vecfloat" "cell-veccomplex")
 378 ; this is not correct,
 379 ;;  this is a stall in general and not dependent on the result
 380 (define_bypass 13 "cell-vecstore" "cell-fpstore")
 381 ; this is not correct, this can never be true, not dependent on the result
 382 (define_bypass 7 "cell-fp" "cell-fpload")
 383 ;; vsu1 should avoid writing to the same target register as a vsu2 insn
 384 ;;   within 12 cycles.
 385
 386 ;; WAW hazard
 387
 388 ;; the target of VSU estimate should not be reused within 10 dispatch groups
 389 ;; the target of VSU float should not be reused within 8 dispatch groups
 390 ;; the target of VSU complex should not be reused within 5 dispatch groups
 391 ;; FP LOAD should not reuse an FPU Arithmetic target within 6 dispatch groups
 392
 393 ;; mtctr-bcctr/bcctrl: branch target ctr register shadow is updated at
 394 ;;  ex4 stage (10 cycles)
 395 (define_bypass 10 "cell-mtjmpr" "cell-branchreg")
 396
 397 ;;Things are not simulated:
 398 ;; update instruction, update address gpr are not simulated
 399 ;; vrefp, vrsqrtefp have latency(14), currently simulated as 12 cycle float
 400 ;;  insns
 401