gcc/config/rs6000/cell.md

   1 ;; Scheduling description for cell processor.
   2 ;; Copyright (C) 2001-2014 Free Software Foundation, Inc.
   3 ;; Contributed by Sony Computer Entertainment, Inc.,
   4
   5
   6 ;; This file is free software; you can redistribute it and/or modify it under
   7 ;; the terms of the GNU General Public License as published by the Free
   8 ;; Software Foundation; either version 3 of the License, or (at your option)
   9 ;; any later version.
  10
  11 ;; This file is distributed in the hope that it will be useful, but WITHOUT
  12 ;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 ;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 ;; for more details.
  15
  16 ;; You should have received a copy of the GNU General Public License
  17 ;; along with GCC; see the file COPYING3.  If not see
  18 ;; <http://www.gnu.org/licenses/>.
  19
  20 ;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf)
  21
  22 ;; BE Architecture *DD3.0 and DD3.1*
  23 ;; This file simulates the PPU processor unit backend of the pipeline,
  24 ;; manual P24; see manual P27 for stall and flush points.
  25 ;; IU, XU, VSU, dispatcher decodes and dispatch 2 insns per cycle in program
  26 ;;  order, the grouped address are aligned by 8
  27 ;; This file only simulates the single-thread situation
  28 ;; XU executes all fixed point insns(3 units, a simple alu, a complex unit,
  29 ;;   and load/store unit)
  30 ;; VSU executes all scalar floating points insn(a float unit),
  31 ;;   VMX insns(VMX unit, 4 sub units, simple, permute, complex, floating point)
  32
  33 ;; Dual issue combination
  34
  35 ;;      FXU     LSU     BR              VMX                    VMX
  36 ;;                             (sx,cx,vsu_fp,fp_arith)    (perm,vsu_ls,fp_ls)
  37 ;;FXU   X
  38 ;;LSU           X                       X                       X
  39 ;;BR                    X
  40 ;;VMX(sx,cx,vsu_fp,fp_arth)             X
  41 ;;VMX(perm,vsu_ls, fp_ls)                                       X
  42 ;;    X marks an illegal combination.
  43
  44 ;; Dual issue exceptions:
  45 ;;(1) non-pipelined FXU instr in slot 0
  46 ;;(2) non-pipelined FPU inst in slot 0
  47 ;; CSI instr(context-synchronizing insn)
  48 ;; Microcode insn
  49
  50 ;; BRU unit: bru(none register stall), bru_cr(cr register stall)
  51 ;; VSU unit: vus(vmx simple), vup(vmx permute), vuc(vmx complex),
  52 ;;  vuf(vmx float), fpu(floats). fpu_div is hypothetical, it is for
  53 ;;  nonpipelined simulation
  54 ;; microcoded insns stall at least 7 cycles to get the first instr from ROM;
  55 ;;  microcoded instructions are not dual issued.
  56
  57 ;; slot0 is older than slot1
  58 ;; non-pipelined insn need to be in slot1 to avoid 1cycle stall
  59
  60 ;; There are different stall points:
  61 ;; IB2, only stall one thread if stall here, so try to stall here as much as
  62 ;; we can
  63 ;; condition(1) insert nop, OR and ORI instruction form
  64 ;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or
  65 ;;   CR0-access while stdcx, or stwcx
  66 ;; IS2 stall ;; Page91 for details
  67 ;; VQ8 stall
  68 ;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to
  69 ;;  the vsu issue queue
  70
  71 ;;(define_automaton "cellxu")
  72
  73 ;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu")
  74
  75 ;; ndfa.  Declare the four automata that the units below are assigned to.
  76 (define_automaton "cellxu,cellvsu,cellbru,cell_mis")
  77
  78 (define_cpu_unit "fxu_cell,lsu_cell" "cellxu")     ; fixed-point and load/store units
  79 (define_cpu_unit "bru_cell" "cellbru")             ; branch unit
  80 (define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu")  ; vsu1: FP/VMX arithmetic; vsu2: permute and FP/vector load/store (per the reservations below)
  81
  82 (define_cpu_unit "slot0,slot1" "cell_mis")         ; the two issue slots of a dispatch group
  83
  84 (absence_set "slot0" "slot1")  ; slot0 may be reserved only while slot1 is not reserved
  85
  86 (define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell")  ; all four XU/VSU units at once (bru_cell is not included)
  87 (define_reservation "slot01" "slot0|slot1")                                ; either issue slot
  88
  89
  90 ;; Load/store
  91 ;; lmw, lswi and lswx are only generated when optimizing for space (MC);
  92 ;;   these instructions are not simulated.
  93 (define_insn_reservation "cell-load" 2   ; default latency: 2 cycles
  94   (and (eq_attr "type" "load")
  95        (eq_attr "sign_extend" "no")      ; sign-extending loads use "cell-load-ext"
  96        (eq_attr "update" "no")           ; update forms use "cell-load-ux"
  97        (eq_attr "cpu" "cell"))
  98   "slot01,lsu_cell")                     ; either issue slot, then the LSU on the next cycle
  99
 100 ;; ldux, ldu, lbzux, lbzu: hardware breaks these down into two instrs,
 101 ;;  if with 32-byte alignment, CMC
 102 (define_insn_reservation "cell-load-ux" 2   ; default latency: 2 cycles
 103   (and (eq_attr "type" "load")
 104        (eq_attr "sign_extend" "no")
 105        (eq_attr "update" "yes")             ; load-with-update forms only
 106        (eq_attr "cpu" "cell"))
 107   "slot01,fxu_cell+lsu_cell")               ; issue slot, then FXU and LSU together
 108
 109 ;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux, MC, latency unknown
 110 ;;   11/7, 11/8, 11/12
 111 (define_insn_reservation "cell-load-ext" 2  ; NOTE(review): modeled as 2 cycles although the latency is stated as unknown
 112   (and (eq_attr "type" "load")
 113        (eq_attr "sign_extend" "yes")        ; all sign-extending loads, with or without update
 114        (eq_attr "cpu" "cell"))
 115   "slot01,fxu_cell+lsu_cell")               ; issue slot, then FXU and LSU together
 116
 117 ;; lfs, lfsx, lfd, lfdx: 1 cycle
 118 (define_insn_reservation "cell-fpload" 1
 119   (and (eq_attr "type" "fpload")
 120        (eq_attr "update" "no")
 121        (eq_attr "cpu" "cell"))
 122   "vsu2_cell+lsu_cell+slot01")  ; vsu2, LSU and an issue slot, all in the same cycle
 123
 124 ;; lfsu, lfsux, lfdu, lfdux: 1 cycle (fpr), 2 cycles (gpr)
 125 (define_insn_reservation "cell-fpload-update" 1  ; only the 1-cycle figure is modeled
 126   (and (eq_attr "type" "fpload")
 127        (eq_attr "update" "yes")
 128        (eq_attr "cpu" "cell"))
 129   "fxu_cell+vsu2_cell+lsu_cell+slot01")  ; as "cell-fpload", plus the FXU in the same cycle
 130
 131 (define_insn_reservation "cell-vecload" 2  ; vector load, default latency 2 cycles
 132   (and (eq_attr "type" "vecload")
 133        (eq_attr "cpu" "cell"))
 134   "slot01,vsu2_cell+lsu_cell")             ; issue slot, then vsu2 and LSU together
 135
 136 ;; st?, stw (MC)
 137 (define_insn_reservation "cell-store" 1
 138   (and (eq_attr "type" "store")
 139        (eq_attr "update" "no")   ; update forms use "cell-store-update"
 140        (eq_attr "cpu" "cell"))
 141   "lsu_cell+slot01")             ; LSU and an issue slot in the same cycle
 142
 143 ;; stdux, stdu (hardware breaks these into a store and an add); 2 for update reg
 144 (define_insn_reservation "cell-store-update" 1
 145   (and (eq_attr "type" "store")
 146        (eq_attr "update" "yes")
 147        (eq_attr "cpu" "cell"))
 148   "fxu_cell+lsu_cell+slot01")    ; FXU, LSU and an issue slot in the same cycle
 149
 150 (define_insn_reservation "cell-fpstore" 1  ; FP store without update, latency 1
 151   (and (eq_attr "type" "fpstore")
 152        (eq_attr "update" "no")
 153        (eq_attr "cpu" "cell"))
 154   "vsu2_cell+lsu_cell+slot01")             ; vsu2, LSU and an issue slot in the same cycle
 155
 156 (define_insn_reservation "cell-fpstore-update" 1  ; FP store with update, latency 1
 157   (and (eq_attr "type" "fpstore")
 158        (eq_attr "update" "yes")
 159        (eq_attr "cpu" "cell"))
 160   "vsu2_cell+fxu_cell+lsu_cell+slot01")           ; as "cell-fpstore", plus the FXU
 161
 162 (define_insn_reservation "cell-vecstore" 1  ; vector store, latency 1
 163   (and (eq_attr "type" "vecstore")
 164        (eq_attr "cpu" "cell"))
 165   "vsu2_cell+lsu_cell+slot01")              ; vsu2, LSU and an issue slot in the same cycle
 166
 167 ;; Integer latency is 2 cycles
 168 (define_insn_reservation "cell-integer" 2
 169   (and (ior (eq_attr "type" "integer,trap,cntlz,exts,isel")
 170             (and (eq_attr "type" "add,logical,shift")
 171                  (eq_attr "dot" "no"))          ; dot forms are handled by the compare reservations
 172             (and (eq_attr "type" "insert")
 173                  (eq_attr "size" "64")))        ; 32-bit inserts use "cell-insert"
 174        (eq_attr "cpu" "cell"))
 175   "slot01,fxu_cell")                            ; issue slot, then one FXU cycle
 176
 177 ;; Two-integer sequences ("two"): latency is 4 cycles
 178 (define_insn_reservation "cell-two" 4
 179   (and (eq_attr "type" "two")
 180        (eq_attr "cpu" "cell"))
 181   "slot01,fxu_cell,fxu_cell*2")  ; issue slot, then the FXU for 3 consecutive cycles
 182
 183 ;; Three-integer sequences ("three"): latency is 6 cycles
 184 (define_insn_reservation "cell-three" 6
 185   (and (eq_attr "type" "three")
 186        (eq_attr "cpu" "cell"))
 187   "slot01,fxu_cell,fxu_cell*4")  ; issue slot, then the FXU for 5 consecutive cycles
 188
 189 ;; rlwimi, alter cr0
 190 (define_insn_reservation "cell-insert" 2
 191   (and (eq_attr "type" "insert")
 192        (eq_attr "size" "32")      ; 64-bit inserts are covered by "cell-integer"
 193        (eq_attr "cpu" "cell"))
 194  "slot01,fxu_cell")               ; issue slot, then one FXU cycle
 195
 196 ;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0
 197 (define_insn_reservation "cell-cmp" 1  ; matches insns of type "cmp" only
 198   (and (eq_attr "type" "cmp")
 199        (eq_attr "cpu" "cell"))
 200   "fxu_cell+slot01")                   ; FXU and an issue slot in the same cycle
 201
 202 ;; add, addo, sub, subo, alter cr0, rldcli, rlwinm
 203 (define_insn_reservation "cell-fast-cmp" 2
 204   (and (ior (eq_attr "type" "compare")
 205             (and (eq_attr "type" "add,logical,shift")
 206                  (eq_attr "dot" "yes")))   ; record (dot) forms
 207        (eq_attr "cpu" "cell")
 208        (eq_attr "cell_micro" "not"))       ; only insns that are not microcoded
 209   "slot01,fxu_cell")                       ; issue slot, then one FXU cycle
 210
 211 (define_insn_reservation "cell-cmp-microcoded" 9  ; same insn types as "cell-fast-cmp", but microcoded
 212   (and (ior (eq_attr "type" "compare")
 213             (and (eq_attr "type" "add,logical,shift")
 214                  (eq_attr "dot" "yes")))
 215        (eq_attr "cpu" "cell")
 216        (eq_attr "cell_micro" "always"))  ; NOTE(review): any cell_micro value other than "not"/"always" matches neither compare reservation -- confirm intended
 217   "slot0+slot1,fxu_cell,fxu_cell*7")     ; takes both issue slots, then the FXU for 8 consecutive cycles
 218
 219 ;; mulld
 220 (define_insn_reservation "cell-lmul" 15
 221   (and (eq_attr "type" "mul")
 222        (eq_attr "dot" "no")
 223        (eq_attr "size" "64")
 224        (eq_attr "cpu" "cell"))
 225   "slot1,nonpipeline,nonpipeline*13")  ; slot1 only, then all XU/VSU units for 14 consecutive cycles
 226
 227 ;; mulld. is microcoded
 228 (define_insn_reservation "cell-lmul-cmp" 22
 229   (and (eq_attr "type" "mul")
 230        (eq_attr "dot" "yes")
 231        (eq_attr "size" "64")
 232        (eq_attr "cpu" "cell"))
 233   "slot0+slot1,nonpipeline,nonpipeline*20")  ; both issue slots, then all XU/VSU units for 21 consecutive cycles
 234
 235 ;; mulli, 6 cycles
 236 (define_insn_reservation "cell-imul23" 6
 237   (and (eq_attr "type" "mul")
 238        (eq_attr "size" "8,16")          ; no "dot" test: applies to either dot value
 239        (eq_attr "cpu" "cell"))
 240   "slot1,nonpipeline,nonpipeline*4")    ; slot1 only, then all XU/VSU units for 5 consecutive cycles
 241
 242 ;; mullw, 9 cycles
 243 (define_insn_reservation "cell-imul" 9
 244   (and (eq_attr "type" "mul")
 245        (eq_attr "dot" "no")             ; NOTE(review): type "mul", size "32", dot "yes" matches no reservation in this file -- confirm intended
 246        (eq_attr "size" "32")
 247        (eq_attr "cpu" "cell"))
 248   "slot1,nonpipeline,nonpipeline*7")    ; slot1 only, then all XU/VSU units for 8 consecutive cycles
 249
 250 ;; divide (32-bit)
 251 (define_insn_reservation "cell-idiv" 32
 252   (and (eq_attr "type" "div")
 253        (eq_attr "size" "32")
 254        (eq_attr "cpu" "cell"))
 255   "slot1,nonpipeline,nonpipeline*30")  ; slot1 only, then all XU/VSU units for 31 consecutive cycles
 256
 257 (define_insn_reservation "cell-ldiv" 64  ; 64-bit divide
 258   (and (eq_attr "type" "div")
 259        (eq_attr "size" "64")
 260        (eq_attr "cpu" "cell"))
 261   "slot1,nonpipeline,nonpipeline*62")    ; slot1 only, then all XU/VSU units for 63 consecutive cycles
 262
 263 ;; mflr and mfctr are pipelined
 264 (define_insn_reservation "cell-mfjmpr" 1
 265   (and (eq_attr "type" "mfjmpr")
 266        (eq_attr "cpu" "cell"))
 267   "slot01+bru_cell")  ; an issue slot and the branch unit in the same cycle
 268
 269 ;; mtlr and mtctr;
 270 ;; mtspr is fully pipelined
 271 (define_insn_reservation "cell-mtjmpr" 1  ; a bypass below sets the latency to "cell-branchreg" to 10
 272  (and (eq_attr "type" "mtjmpr")
 273        (eq_attr "cpu" "cell"))
 274   "bru_cell+slot01")                      ; the branch unit and an issue slot in the same cycle
 275
 276 ;; Branches
 277 ;; b, ba, bl, bla: an unconditional branch always predicts correctly, n/a latency
 278 ;; bcctr, bcctrl: latency 2, actually adjusted by be to 4
 279 (define_insn_reservation "cell-branch" 1
 280   (and (eq_attr "type" "branch")
 281        (eq_attr "cpu" "cell"))
 282   "bru_cell+slot1")  ; branches are only modeled in slot1, never slot0
 283
 284 (define_insn_reservation "cell-branchreg" 1  ; branches through a register (type "jmpreg")
 285   (and (eq_attr "type" "jmpreg")
 286        (eq_attr "cpu" "cell"))
 287   "bru_cell+slot1")                          ; same resources as "cell-branch": branch unit plus slot1
 288
 289 ;; CR hazard
 290 ;; page 90, special cases for CR hazard: only one instr can access cr per cycle;
 291 ;; if an insn reads CR following a stwcx, the pipeline stalls till stwcx finishes
 292 (define_insn_reservation "cell-crlogical" 1
 293   (and (eq_attr "type" "cr_logical,delayed_cr")
 294        (eq_attr "cpu" "cell"))
 295   "bru_cell+slot01")  ; the branch unit and an issue slot in the same cycle
 296
 297 ;; mfcrf and mfcr take about 34 cycles and are nonpipelined
 298 (define_insn_reservation "cell-mfcr" 34
 299   (and (eq_attr "type" "mfcrf,mfcr")
 300        (eq_attr "cpu" "cell"))
 301    "slot1,nonpipeline,nonpipeline*32")  ; slot1 only, then all XU/VSU units for 33 consecutive cycles
 302
 303 ;; mtcrf (1 field)
 304 (define_insn_reservation "cell-mtcrf" 1
 305   (and (eq_attr "type" "mtcr")
 306        (eq_attr "cpu" "cell"))
 307   "fxu_cell+slot01")  ; FXU and an issue slot in the same cycle
 308
 309 ; Basic FP latency is 10 cycles, throughput is 1/cycle
 310 (define_insn_reservation "cell-fp" 10
 311   (and (eq_attr "type" "fp,dmul")
 312        (eq_attr "cpu" "cell"))
 313   "slot01,vsu1_cell,vsu1_cell*8")  ; NOTE(review): holds vsu1 for 9 consecutive cycles, which does not model the stated 1/cycle throughput -- confirm intended
 314
 315 (define_insn_reservation "cell-fpcompare" 1  ; a bypass below raises the latency to CR consumers to 6
 316   (and (eq_attr "type" "fpcompare")
 317        (eq_attr "cpu" "cell"))
 318   "vsu1_cell+slot01")                        ; vsu1 and an issue slot in the same cycle
 319
 320 ;; sdiv throughput 1/74, not pipelined but only in the FPU
 321 (define_insn_reservation "cell-sdiv" 74
 322   (and (eq_attr "type" "sdiv,ddiv")
 323        (eq_attr "cpu" "cell"))
 324   "slot1,nonpipeline,nonpipeline*72")  ; slot1 only; NOTE(review): blocks all four XU/VSU units for 73 cycles, not only the FPU
 325
 326 ;; fsqrt throughput 1/84, not pipelined but only in the FPU
 327 (define_insn_reservation "cell-sqrt" 84
 328   (and (eq_attr "type" "ssqrt,dsqrt")
 329        (eq_attr "cpu" "cell"))
 330   "slot1,nonpipeline,nonpipeline*82")  ; slot1 only; NOTE(review): blocks all four XU/VSU units for 83 cycles, not only the FPU
 331
 332 ; VMX: simple vector insns, latency 4
 333 (define_insn_reservation "cell-vecsimple" 4
 334   (and (eq_attr "type" "vecsimple")
 335        (eq_attr "cpu" "cell"))
 336   "slot01,vsu1_cell,vsu1_cell*2")  ; issue slot, then vsu1 for 3 consecutive cycles
 337
 338 ;; mult, div, madd (complex vector insns), latency 10
 339 (define_insn_reservation "cell-veccomplex" 10
 340   (and (eq_attr "type" "veccomplex")
 341        (eq_attr "cpu" "cell"))
 342   "slot01,vsu1_cell,vsu1_cell*8")  ; issue slot, then vsu1 for 9 consecutive cycles
 343
 344 ;; TODO: add support for recording instructions
 345 (define_insn_reservation "cell-veccmp" 4  ; vector compare; same latency and units as "cell-vecsimple"
 346   (and (eq_attr "type" "veccmp")
 347        (eq_attr "cpu" "cell"))
 348   "slot01,vsu1_cell,vsu1_cell*2")         ; issue slot, then vsu1 for 3 consecutive cycles
 349
 350 (define_insn_reservation "cell-vecfloat" 12  ; vector float, default latency 12 (bypasses below override some pairs)
 351   (and (eq_attr "type" "vecfloat")
 352        (eq_attr "cpu" "cell"))
 353   "slot01,vsu1_cell,vsu1_cell*10")           ; issue slot, then vsu1 for 11 consecutive cycles
 354
 355 (define_insn_reservation "cell-vecperm" 4  ; vector permute: uses vsu2, not vsu1
 356   (and (eq_attr "type" "vecperm")
 357        (eq_attr "cpu" "cell"))
 358   "slot01,vsu2_cell,vsu2_cell*2")          ; issue slot, then vsu2 for 3 consecutive cycles
 359
 360 ;; New for 4.2: syncs.  sync, isync, load_l and store_c share one model.
 361
 362 (define_insn_reservation "cell-sync" 11
 363   (and (eq_attr "type" "sync")
 364        (eq_attr "cpu" "cell"))
 365   "slot01,lsu_cell,lsu_cell*9")  ; issue slot, then the LSU for 10 consecutive cycles
 366
 367 (define_insn_reservation "cell-isync" 11  ; same latency and units as "cell-sync"
 368   (and (eq_attr "type" "isync")
 369        (eq_attr "cpu" "cell"))
 370   "slot01,lsu_cell,lsu_cell*9")           ; issue slot, then the LSU for 10 consecutive cycles
 371
 372 (define_insn_reservation "cell-load_l" 11  ; type "load_l"; same latency and units as "cell-sync"
 373   (and (eq_attr "type" "load_l")
 374        (eq_attr "cpu" "cell"))
 375   "slot01,lsu_cell,lsu_cell*9")            ; issue slot, then the LSU for 10 consecutive cycles
 376
 377 (define_insn_reservation "cell-store_c" 11  ; type "store_c"; same latency and units as "cell-sync"
 378   (and (eq_attr "type" "store_c")
 379        (eq_attr "cpu" "cell"))
 380   "slot01,lsu_cell,lsu_cell*9")             ; issue slot, then the LSU for 10 consecutive cycles
 381
 382 ;; RAW register dependency
 383
 384 ;; addi r3, r3, 1
 385 ;; lw r4,offset(r3)
 386 ;; there is a 5 cycle delay for r3 bypassing
 387 ;; there is a 5 cycle delay for a dependent load after a load
 388 (define_bypass 5 "cell-integer" "cell-load")      ; integer result feeding a plain load
 389 (define_bypass 5 "cell-integer" "cell-load-ext")  ; integer result feeding a sign-extending load
 390 (define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext")  ; load feeding a dependent load; "cell-load-ux" is not listed on either side
 391
 392 ;; There is a 6 cycle delay after a fp compare until the cr can be used.
 393 (define_bypass 6 "cell-fpcompare" "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical")  ; overrides the 1-cycle default of "cell-fpcompare" for these consumers
 394
 395 ;; VXU float RAW: vecfloat feeding vecfloat is 11 cycles instead of the default 12
 396 (define_bypass 11 "cell-vecfloat" "cell-vecfloat")
 397
 398 ;; VXU and FPU
 399 (define_bypass 6 "cell-veccomplex" "cell-vecsimple")  ; 6 cycles instead of the default 10
 400 ;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg")
 401 (define_bypass 3 "cell-vecfloat" "cell-veccomplex")   ; 3 cycles instead of the default 12
 402 ; this is not correct,
 403 ;;  this is a stall in general and not dependent on result
 404 (define_bypass 13 "cell-vecstore" "cell-fpstore")     ; models the stall described above as a 13-cycle latency
 405 ; this is not correct, this can never be true, not dependent on result
 406 (define_bypass 7 "cell-fp" "cell-fpload")             ; 7 cycles instead of the default 10
 407 ;; vsu1 should avoid writing to the same target register as vsu2 insn
 408 ;;   within 12 cycles.
 409
 410 ;; WAW hazard
 411
 412 ;; the target of VSU estimate should not be reused within 10 dispatch groups
 413 ;; the target of VSU float should not be reused within 8 dispatch groups
 414 ;; the target of VSU complex should not be reused within 5 dispatch groups
 415 ;; FP LOAD should not reuse an FPU Arithmetic target within 6 dispatch groups
 416
 417 ;; mtctr-bcctr/bcctrl: the branch target ctr register shadow is updated at
 418 ;;  the ex4 stage (10 cycles)
 419 (define_bypass 10 "cell-mtjmpr" "cell-branchreg")  ; overrides the 1-cycle default latency of "cell-mtjmpr"
 420
 421 ;; Things that are not simulated:
 422 ;; update instruction, update address gpr are not simulated
 423 ;; vrefp, vrsqrtefp have latency(14), currently simulated as 12 cycle float
 424 ;;  insns
 425