gcc/config/rs6000/cell.md

   1 ;; Scheduling description for cell processor.
   2 ;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2009
   3 ;; Free Software Foundation, Inc.
   4 ;; Contributed by Sony Computer Entertainment, Inc.,
   5
   6
   7 ;; This file is free software; you can redistribute it and/or modify it under
   8 ;; the terms of the GNU General Public License as published by the Free
   9 ;; Software Foundation; either version 3 of the License, or (at your option)
  10 ;; any later version.
  11
  12 ;; This file is distributed in the hope that it will be useful, but WITHOUT
  13 ;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 ;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 ;; for more details.
  16
  17 ;; You should have received a copy of the GNU General Public License
  18 ;; along with GCC; see the file COPYING3.  If not see
  19 ;; <http://www.gnu.org/licenses/>.
  20
  21 ;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf)
  22
  23 ;; BE Architecture *DD3.0 and DD3.1*
  24 ;; This file simulates the back end of the PPU pipeline (manual p. 24).
  25 ;; See manual p. 27 for the stall and flush points.
  26 ;; IU, XU, VSU: the dispatcher decodes and dispatches 2 insns per cycle in
  27 ;;  program order; the grouped addresses are aligned by 8.
  28 ;; This file only simulates the single-thread situation.
  29 ;; XU executes all fixed point insns(3 units, a simple alu, a complex unit,
  30 ;;   and load/store unit)
  31 ;; VSU executes all scalar floating points insn(a float unit),
  32 ;;   VMX insns(VMX unit, 4 sub units, simple, permute, complex, floating point)
  33
  34 ;; Dual issue combination
  35
  36 ;;      FXU     LSU     BR              VMX                    VMX
  37 ;;                             (sx,cx,vsu_fp,fp_arith)    (perm,vsu_ls,fp_ls)
  38 ;;FXU   X
  39 ;;LSU           X                       X                       X
  40 ;;BR                    X
  41 ;;VMX(sx,cx,vsu_fp,fp_arth)             X
  42 ;;VMX(perm,vsu_ls, fp_ls)                                       X
  43 ;;    X are illegal combination.
  44
  45 ;; Dual issue exceptions:
  46 ;;(1) non-pipelined FXU instr in slot 0
  47 ;;(2) non-pipelined FPU inst in slot 0
  48 ;; CSI instr (context-synchronizing insn)
  49 ;; Microcode insn
  50
  51 ;; BRU unit: bru (no register stall), bru_cr (cr register stall)
  52 ;; VSU unit: vus (vmx simple), vup (vmx permute), vuc (vmx complex),
  53 ;;  vuf (vmx float), fpu (floats). fpu_div is hypothetical; it is for
  54 ;;  non-pipelined simulation.
  55 ;; Microcoded insns stall at least 7 cycles to get the first instr from ROM;
  56 ;;  microcoded instructions are not dual issued.
  57
  58 ;; slot0 is older than slot1.
  59 ;; Non-pipelined insns need to be in slot1 to avoid a 1-cycle stall.
  60
  61 ;; There are different stall points:
  62 ;; IB2: only one thread stalls if the stall happens here, so try to stall
  63 ;; here as much as we can
  64 ;; condition(1) insert nop, OR and ORI instruction form
  65 ;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or
  66 ;;   CR0-access while stdcx, or stwcx
  67 ;; IS2 stall ;; Page91 for details
  68 ;; VQ8 stall
  69 ;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to
  70 ;;  the vsu issue queue
  71
  72 ;;(define_automaton "cellxu")
  73
  74 ;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu")
  75
  76 ;; ndfa
     ;; Four automata: "cellxu", "cellvsu" and "cellbru" hold the execution
     ;; units; "cell_mis" holds the two issue slots.
  77 (define_automaton "cellxu,cellvsu,cellbru,cell_mis")
  78
     ;; Execution units; the second string names the automaton each unit
     ;; belongs to.
  79 (define_cpu_unit "fxu_cell,lsu_cell" "cellxu")
  80 (define_cpu_unit "bru_cell" "cellbru")
  81 (define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu")
  82
     ;; Issue slots.  Every insn reservation in this file claims slot0, slot1
     ;; or both.
  83 (define_cpu_unit "slot0,slot1" "cell_mis")
  84
     ;; slot0 can be reserved only in a cycle where slot1 is not reserved.
  85 (absence_set "slot0" "slot1")
  86
     ;; "nonpipeline" claims the FXU, LSU and both VSU units in the same cycle.
     ;; "slot01" claims either issue slot (slot0 is tried first).
  87 (define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell")
  88 (define_reservation "slot01" "slot0|slot1")
  89
  90
  91 ;; Load/store
  92 ;; lmw, lswi, lswx are only generated when optimizing for space (MC);
  93 ;;   these instructions are not simulated.
     ;; Plain integer load: latency 2.  Takes an issue slot in the first cycle
     ;; and the LSU in the next one.
  94 (define_insn_reservation "cell-load" 2
  95   (and (eq_attr "type" "load")
  96        (eq_attr "cpu" "cell"))
  97   "slot01,lsu_cell")
  98
  99 ;; ldux, ldu, lbzux, lbzu: hardware breaks these down into two instrs,
 100 ;;  if with 32bytes alignment, CMC
     ;; Modelled as claiming the FXU and the LSU together in the cycle after
     ;; issue.
 101 (define_insn_reservation "cell-load-ux" 2
 102   (and (eq_attr "type" "load_ux,load_u")
 103        (eq_attr "cpu" "cell"))
 104   "slot01,fxu_cell+lsu_cell")
 105
 106 ;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux, MC, latency unknown
 107 ;;   11/7, 11/8, 11/12
     ;; Given the same latency and reservation string as "cell-load-ux".
 108 (define_insn_reservation "cell-load-ext" 2
 109   (and (eq_attr "type" "load_ext,load_ext_u,load_ext_ux")
 110        (eq_attr "cpu" "cell"))
 111   "slot01,fxu_cell+lsu_cell")
 112
 113 ;; lfs, lfsx, lfd, lfdx: 1 cycle
     ;; Issue slot, VSU2 and LSU are all claimed in the same single cycle.
 114 (define_insn_reservation "cell-fpload" 1
 115   (and (eq_attr "type" "fpload")
 116        (eq_attr "cpu" "cell"))
 117   "vsu2_cell+lsu_cell+slot01")
 118
 119 ;; lfsu, lfsux, lfdu, lfdux: 1 cycle (fpr), 2 cycles (gpr)
     ;; Update-form FP loads only.  Plain "fpload" is matched by the
     ;; "cell-fpload" reservation, so it is not repeated in this type list.
     ;; The FXU is claimed as well as VSU2 and the LSU, all in the issue cycle.
 120 (define_insn_reservation "cell-fpload-update" 1
 121   (and (eq_attr "type" "fpload_u,fpload_ux")
 122        (eq_attr "cpu" "cell"))
 123   "fxu_cell+vsu2_cell+lsu_cell+slot01")
 124
     ;; Vector load: latency 2.  Issue slot first, then VSU2 and the LSU
     ;; together in the following cycle.
 125 (define_insn_reservation "cell-vecload" 2
 126   (and (eq_attr "type" "vecload")
 127        (eq_attr "cpu" "cell"))
 128   "slot01,vsu2_cell+lsu_cell")
 129
 130 ;;st? stw(MC)
     ;; All store reservations below have latency 1 and claim the issue slot
     ;; and their units in the same cycle.
 131 (define_insn_reservation "cell-store" 1
 132   (and (eq_attr "type" "store")
 133        (eq_attr "cpu" "cell"))
 134   "lsu_cell+slot01")
 135
 136 ;; stdux, stdu (hardware breaks these into a store and an add); 2 for the
     ;; update reg
 137 (define_insn_reservation "cell-store-update" 1
 138   (and (eq_attr "type" "store_ux,store_u")
 139        (eq_attr "cpu" "cell"))
 140   "fxu_cell+lsu_cell+slot01")
 141
     ;; FP store: VSU2 and the LSU.
 142 (define_insn_reservation "cell-fpstore" 1
 143   (and (eq_attr "type" "fpstore")
 144        (eq_attr "cpu" "cell"))
 145   "vsu2_cell+lsu_cell+slot01")
 146
     ;; FP store with update: additionally claims the FXU.
 147 (define_insn_reservation "cell-fpstore-update" 1
 148   (and (eq_attr "type" "fpstore_ux,fpstore_u")
 149        (eq_attr "cpu" "cell"))
 150   "vsu2_cell+fxu_cell+lsu_cell+slot01")
 151
     ;; Vector store: same reservation string as "cell-fpstore".
 152 (define_insn_reservation "cell-vecstore" 1
 153   (and (eq_attr "type" "vecstore")
 154        (eq_attr "cpu" "cell"))
 155   "vsu2_cell+lsu_cell+slot01")
 156
 157 ;; Integer latency is 2 cycles
     ;; Issue slot in the first cycle, FXU in the second.
 158 (define_insn_reservation "cell-integer" 2
 159   (and (eq_attr "type" "integer,insert_dword,shift,trap,\
 160                         var_shift_rotate,cntlz,exts,isel")
 161        (eq_attr "cpu" "cell"))
 162   "slot01,fxu_cell")
 163
 164 ;; Two integer latency is 4 cycles
     ;; The FXU is held for three consecutive cycles after issue.
 165 (define_insn_reservation "cell-two" 4
 166   (and (eq_attr "type" "two")
 167        (eq_attr "cpu" "cell"))
 168   "slot01,fxu_cell,fxu_cell*2")
 169
 170 ;; Three integer latency is 6 cycles
     ;; The FXU is held for five consecutive cycles after issue.
 171 (define_insn_reservation "cell-three" 6
 172   (and (eq_attr "type" "three")
 173        (eq_attr "cpu" "cell"))
 174   "slot01,fxu_cell,fxu_cell*4")
 175
 176 ;; rlwimi, alter cr0
     ;; Same latency and reservation string as "cell-integer".
 177 (define_insn_reservation "cell-insert" 2
 178   (and (eq_attr "type" "insert_word")
 179        (eq_attr "cpu" "cell"))
 180  "slot01,fxu_cell")
 181
 182 ;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0
     ;; Latency 1; the issue slot and the FXU are claimed in the same cycle.
 183 (define_insn_reservation "cell-cmp" 1
 184   (and (eq_attr "type" "cmp")
 185        (eq_attr "cpu" "cell"))
 186   "fxu_cell+slot01")
 187
 188 ;; add, addo, sub, subo, alter cr0, rldcli, rlwinm
     ;; Compare-type insns whose "cell_micro" attribute is "not".
     ;; NOTE(review): insns of these types with a "cell_micro" value other
     ;; than "not" or "always" match neither this reservation nor
     ;; "cell-cmp-microcoded" -- confirm whether that is intended.
 189 (define_insn_reservation "cell-fast-cmp" 2
 190   (and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\
 191                             var_delayed_compare")
 192             (eq_attr "cpu" "cell"))
 193         (eq_attr "cell_micro" "not"))
 194   "slot01,fxu_cell")
 195
     ;; Same insn types when "cell_micro" is "always": latency 9.  Both issue
     ;; slots are claimed in the issue cycle (no dual issue), then the FXU is
     ;; held for eight cycles.
 196 (define_insn_reservation "cell-cmp-microcoded" 9
 197   (and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\
 198                             var_delayed_compare")
 199             (eq_attr "cpu" "cell"))
 200         (eq_attr "cell_micro" "always"))
 201   "slot0+slot1,fxu_cell,fxu_cell*7")
 202
 203 ;; mulld
     ;; The multiply/divide reservations in this section use "nonpipeline",
     ;; i.e. they hold the FXU, LSU and both VSU units for every cycle after
     ;; issue.  All but "cell-lmul-cmp" are restricted to slot1.
 204 (define_insn_reservation "cell-lmul" 15
 205   (and (eq_attr "type" "lmul")
 206        (eq_attr "cpu" "cell"))
 207   "slot1,nonpipeline,nonpipeline*13")
 208
 209 ;; mulld. is microcoded
     ;; Claims both issue slots in the issue cycle, so nothing is issued
     ;; alongside it.
 210 (define_insn_reservation "cell-lmul-cmp" 22
 211   (and (eq_attr "type" "lmul_compare")
 212        (eq_attr "cpu" "cell"))
 213   "slot0+slot1,nonpipeline,nonpipeline*20")
 214
 215 ;; mulli, 6 cycles
 216 (define_insn_reservation "cell-imul23" 6
 217   (and (eq_attr "type" "imul2,imul3")
 218        (eq_attr "cpu" "cell"))
 219   "slot1,nonpipeline,nonpipeline*4")
 220
 221 ;; mullw, 9 cycles
 222 (define_insn_reservation "cell-imul" 9
 223   (and (eq_attr "type" "imul")
 224        (eq_attr "cpu" "cell"))
 225   "slot1,nonpipeline,nonpipeline*7")
 226
 227 ;; divide
     ;; Word divide: latency 32.
 228 (define_insn_reservation "cell-idiv" 32
 229   (and (eq_attr "type" "idiv")
 230        (eq_attr "cpu" "cell"))
 231   "slot1,nonpipeline,nonpipeline*30")
 232
     ;; "ldiv" type: latency 64.
 233 (define_insn_reservation "cell-ldiv" 64
 234   (and (eq_attr "type" "ldiv")
 235        (eq_attr "cpu" "cell"))
 236   "slot1,nonpipeline,nonpipeline*62")
 237
 238 ;; mflr and mfctr are pipelined
     ;; Latency 1; the issue slot and the BRU are claimed in the same cycle.
 239 (define_insn_reservation "cell-mfjmpr" 1
 240   (and (eq_attr "type" "mfjmpr")
 241        (eq_attr "cpu" "cell"))
 242   "slot01+bru_cell")
 243
 244 ;; mtlr and mtctr,
 245 ;; mtspr fully pipelined
 246 (define_insn_reservation "cell-mtjmpr" 1
 247  (and (eq_attr "type" "mtjmpr")
 248        (eq_attr "cpu" "cell"))
 249   "bru_cell+slot01")
 250
 251 ;; Branches
 252 ;; b, ba, bl, bla, unconditional branch always predicts correctly n/a latency
 253 ;; bcctr, bcctrl, latency 2, actually adjust by be to 4
     ;; Both branch reservations below are restricted to slot1.
 254 (define_insn_reservation "cell-branch" 1
 255   (and (eq_attr "type" "branch")
 256        (eq_attr "cpu" "cell"))
 257   "bru_cell+slot1")
 258
     ;; Branch through a register ("jmpreg" type).
 259 (define_insn_reservation "cell-branchreg" 1
 260   (and (eq_attr "type" "jmpreg")
 261        (eq_attr "cpu" "cell"))
 262   "bru_cell+slot1")
 263
 264 ;; cr hazard
 265 ;; page 90, special cases for CR hazard, only one instr can access cr per cycle
 266 ;; if an insn reads CR following a stwcx, the pipeline stalls until the
     ;; stwcx finishes
 267 (define_insn_reservation "cell-crlogical" 1
 268   (and (eq_attr "type" "cr_logical,delayed_cr")
 269        (eq_attr "cpu" "cell"))
 270   "bru_cell+slot01")
 271
 272 ;; mfcrf and mfcr take about 34 cycles and are non-pipelined
     ;; Issued in slot1, then holds the "nonpipeline" units for 33 cycles.
 273 (define_insn_reservation "cell-mfcr" 34
 274   (and (eq_attr "type" "mfcrf,mfcr")
 275        (eq_attr "cpu" "cell"))
 276    "slot1,nonpipeline,nonpipeline*32")
 277
 278 ;; mtcrf (1 field)
     ;; Latency 1; uses the FXU rather than the BRU.
 279 (define_insn_reservation "cell-mtcrf" 1
 280   (and (eq_attr "type" "mtcr")
 281        (eq_attr "cpu" "cell"))
 282   "fxu_cell+slot01")
 283
 284 ; Basic FP latency is 10 cycles, throughput is 1/cycle
     ; NOTE(review): the reservation string below holds vsu1_cell for nine
     ; consecutive cycles after issue, which does not look like a 1/cycle
     ; throughput -- confirm which one is intended.
 285 (define_insn_reservation "cell-fp" 10
 286   (and (eq_attr "type" "fp,dmul")
 287        (eq_attr "cpu" "cell"))
 288   "slot01,vsu1_cell,vsu1_cell*8")
 289
     ;; FP compare: latency 1; issue slot and VSU1 claimed in the same cycle.
     ;; A define_bypass later in this file raises the latency to CR consumers.
 290 (define_insn_reservation "cell-fpcompare" 1
 291   (and (eq_attr "type" "fpcompare")
 292        (eq_attr "cpu" "cell"))
 293   "vsu1_cell+slot01")
 294
 295 ;; sdiv throughput 1/74, not pipelined but only in the FPU
     ;; NOTE(review): "nonpipeline" also claims fxu_cell and lsu_cell, so the
     ;; model blocks more than the FPU -- confirm.
 296 (define_insn_reservation "cell-sdiv" 74
 297   (and (eq_attr "type" "sdiv,ddiv")
 298        (eq_attr "cpu" "cell"))
 299   "slot1,nonpipeline,nonpipeline*72")
 300
 301 ;; fsqrt throughput 1/84, not pipelined but only in the FPU
     ;; Same shape as "cell-sdiv": slot1, then "nonpipeline" for 83 cycles.
 302 (define_insn_reservation "cell-sqrt" 84
 303   (and (eq_attr "type" "ssqrt,dsqrt")
 304        (eq_attr "cpu" "cell"))
 305   "slot1,nonpipeline,nonpipeline*82")
 306
 307 ; VMX
     ; Every VMX reservation takes an issue slot first and then holds one VSU
     ; unit for (latency - 1) cycles.  Only "cell-vecperm" uses vsu2_cell; the
     ; others use vsu1_cell.
 308 (define_insn_reservation "cell-vecsimple" 4
 309   (and (eq_attr "type" "vecsimple")
 310        (eq_attr "cpu" "cell"))
 311   "slot01,vsu1_cell,vsu1_cell*2")
 312
 313 ;; mult, div, madd
 314 (define_insn_reservation "cell-veccomplex" 10
 315   (and (eq_attr "type" "veccomplex")
 316        (eq_attr "cpu" "cell"))
 317   "slot01,vsu1_cell,vsu1_cell*8")
 318
 319 ;; TODO: add support for recording instructions
     ;; Vector compare: same latency and reservation string as
     ;; "cell-vecsimple".
 320 (define_insn_reservation "cell-veccmp" 4
 321   (and (eq_attr "type" "veccmp")
 322        (eq_attr "cpu" "cell"))
 323   "slot01,vsu1_cell,vsu1_cell*2")
 324
     ;; Vector float: latency 12.
 325 (define_insn_reservation "cell-vecfloat" 12
 326   (and (eq_attr "type" "vecfloat")
 327        (eq_attr "cpu" "cell"))
 328   "slot01,vsu1_cell,vsu1_cell*10")
 329
     ;; Vector permute: latency 4, on vsu2_cell.
 330 (define_insn_reservation "cell-vecperm" 4
 331   (and (eq_attr "type" "vecperm")
 332        (eq_attr "cpu" "cell"))
 333   "slot01,vsu2_cell,vsu2_cell*2")
 334
 335 ;; New for 4.2, syncs
     ;; The four reservations below are identical apart from the insn type
     ;; they match: latency 11, an issue slot in the first cycle, then the
     ;; LSU held for ten cycles.
 336
 337 (define_insn_reservation "cell-sync" 11
 338   (and (eq_attr "type" "sync")
 339        (eq_attr "cpu" "cell"))
 340   "slot01,lsu_cell,lsu_cell*9")
 341
 342 (define_insn_reservation "cell-isync" 11
 343   (and (eq_attr "type" "isync")
 344        (eq_attr "cpu" "cell"))
 345   "slot01,lsu_cell,lsu_cell*9")
 346
 347 (define_insn_reservation "cell-load_l" 11
 348   (and (eq_attr "type" "load_l")
 349        (eq_attr "cpu" "cell"))
 350   "slot01,lsu_cell,lsu_cell*9")
 351
 352 (define_insn_reservation "cell-store_c" 11
 353   (and (eq_attr "type" "store_c")
 354        (eq_attr "cpu" "cell"))
 355   "slot01,lsu_cell,lsu_cell*9")
 356
 357 ;; RAW register dependency
     ;; Each define_bypass below replaces the producer's default latency with
     ;; the given number when the consumer is one of the listed reservations.
 358
 359 ;; addi r3, r3, 1
 360 ;; lw r4,offset(r3)
 361 ;; there is a 5 cycle delay for r3 bypassing
 362 ;; there is a 5 cycle delay for a dependent load after a load
 363 (define_bypass 5 "cell-integer" "cell-load")
 364 (define_bypass 5 "cell-integer" "cell-load-ext")
 365 (define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext")
 366
 367 ;; there is a 6 cycle delay after a fp compare until you can use the cr.
 368 (define_bypass 6 "cell-fpcompare" "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical")
 369
 370 ;; VXU float RAW
 371 (define_bypass 11 "cell-vecfloat" "cell-vecfloat")
 372
 373 ;; VXU and FPU
 374 (define_bypass 6 "cell-veccomplex" "cell-vecsimple")
     ;; The disabled bypass below names "cell-veccompare", which is not defined
     ;; in this file; the vector compare reservation here is "cell-veccmp".
 375 ;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg")
 376 (define_bypass 3 "cell-vecfloat" "cell-veccomplex")
 377 ; this is not correct,
 378 ;;  this is a stall in general and not dependent on result
 379 (define_bypass 13 "cell-vecstore" "cell-fpstore")
 380 ; this is not correct, this can never be true, not dependent on result
 381 (define_bypass 7 "cell-fp" "cell-fpload")
 382 ;; vsu1 should avoid writing to the same target register as vsu2 insn
 383 ;;   within 12 cycles.
 384
 385 ;; WAW hazard
 386
 387 ;; the target of VSU estimate should not be reused within 10 dispatch groups
 388 ;; the target of VSU float should not be reused within 8 dispatch groups
 389 ;; the target of VSU complex should not be reused within 5 dispatch groups
 390 ;; FP LOAD should not reuse an FPU Arithmetic target within 6 dispatch groups
 391
 392 ;; mtctr-bcctr/bcctrl, branch target ctr register shadow update at
 393 ;;  ex4 stage(10 cycles)
 394 (define_bypass 10 "cell-mtjmpr" "cell-branchreg")
 395
 396 ;; Things that are not simulated:
 397 ;; - for update-form instructions, the update of the address GPR
 398 ;; - vrefp and vrsqrtefp have a latency of 14 cycles but are currently
 399 ;;   simulated as 12-cycle float insns
 400