1 ;; Scheduling description for cell processor.
2 ;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006
3 ;; Free Software Foundation, Inc.
4 ;; Contributed by Sony Computer Entertainment, Inc.,
7 ;; This file is free software; you can redistribute it and/or modify it under
8 ;; the terms of the GNU General Public License as published by the Free
9 ;; Software Foundation; either version 2 of the License, or (at your option)
12 ;; This file is distributed in the hope that it will be useful, but WITHOUT
13 ;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 ;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 ;; You should have received a copy of the GNU General Public License
18 ;; along with this file; see the file COPYING. If not, write to the Free
19 ;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
22 ;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf)
24 ;; BE Architecture *DD3.0 and DD3.1*
25 ;; This file simulates the PPU processor unit backend of the pipeline, manual P24.
26 ;; manual P27, stall and flush points
27 ;; IU, XU, VSU, dispatcher decodes and dispatches 2 insns per cycle in program
28 ;; order, the grouped addresses are aligned by 8
29 ;; This file only simulates the single-thread situation
30 ;; XU executes all fixed point insns(3 units, a simple alu, a complex unit,
31 ;; and load/store unit)
32 ;; VSU executes all scalar floating points insn(a float unit),
33 ;; VMX insns(VMX unit, 4 sub units, simple, permute, complex, floating point)
35 ;; Dual issue combination
38 ;; (sx,cx,vsu_fp,fp_arith) (perm,vsu_ls,fp_ls)
42 ;;VMX(sx,cx,vsu_fp,fp_arith) X
43 ;;VMX(perm,vsu_ls, fp_ls) X
44 ;; X are illegal combination.
46 ;; Dual issue exceptions:
47 ;;(1) non-pipelined FXU instr in slot 0
48 ;;(2) non-pipelined FPU inst in slot 0
49 ;; CSI instr (context-synchronizing insn)
52 ;; BRU unit: bru(none register stall), bru_cr(cr register stall)
53 ;; VSU unit: vus(vmx simple), vup(vmx permute), vuc(vmx complex),
54 ;; vuf(vmx float), fpu(floats). fpu_div is hypothetical, it is for
55 ;; nonpipelined simulation
56 ;; microcoded insns will stall at least 7 cycles to get the first instr from ROM,
57 ;; micro instructions are not dual issued.
59 ;; slot0 is older than slot1
60 ;; non-pipelined insn need to be in slot1 to avoid 1cycle stall
62 ;; There are different stall points
63 ;; IB2, only stall one thread if stall here, so try to stall here as much as
65 ;; condition(1) insert nop, OR and ORI instruction form
66 ;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or
67 ;; CR0-access while stdcx, or stwcx
68 ;; IS2 stall ;; Page91 for details
70 ;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to
71 ;; the vsu issue queue
73 ;;(define_automaton "cellxu")
75 ;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu")
;; Four automata: one each for the fixed-point/load-store units, the VSU
;; units, the branch unit, and the issue slots.
78 (define_automaton "cellxu,cellvsu,cellbru,cell_mis")
;; Execution units; the second operand names the automaton each unit lives in.
80 (define_cpu_unit "fxu_cell,lsu_cell" "cellxu")
81 (define_cpu_unit "bru_cell" "cellbru")
82 (define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu")
;; The two issue slots (slot0 holds the older insn, see the notes above).
84 (define_cpu_unit "slot0,slot1" "cell_mis")
;; slot0 cannot be reserved once slot1 has been reserved.
86 (absence_set "slot0" "slot1")
;; "nonpipeline" claims fxu, lsu and both vsu units in the same cycle
;; (bru_cell is not part of it).
88 (define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell")
;; "slot01" means either issue slot.
89 (define_reservation "slot01" "slot0|slot1")
;; Insns of type "load" when scheduling for cpu "cell"; default latency 2.
93 ;; lmw, lswi, lswx are only generated when optimizing for space (MC);
94 ;; these instructions are not simulated.
95 (define_insn_reservation "cell-load" 2
96 (and (eq_attr "type" "load")
97 (eq_attr "cpu" "cell"))
;; Update-form loads (types "load_ux", "load_u"); default latency 2.
;; Reservation: an issue slot in the first cycle, then fxu_cell and lsu_cell
;; together in the following cycle.
100 ;; ldux, ldu, lbzux, lbzu: hardware breaks them down into two instrs,
101 ;; if with 32bytes alignment, CMC
102 (define_insn_reservation "cell-load-ux" 2
103 (and (eq_attr "type" "load_ux,load_u")
104 (eq_attr "cpu" "cell"))
105 "slot01,fxu_cell+lsu_cell")
;; Types "load_ext", "load_ext_u", "load_ext_ux".  Modelled with default
;; latency 2 even though the real latency is noted as unknown below.
;; Reservation: an issue slot, then fxu_cell and lsu_cell together.
107 ;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux, MC, latency unknown
109 (define_insn_reservation "cell-load-ext" 2
110 (and (eq_attr "type" "load_ext,load_ext_u,load_ext_ux")
111 (eq_attr "cpu" "cell"))
112 "slot01,fxu_cell+lsu_cell")
;; Floating-point loads (type "fpload"); default latency 1.
;; vsu2_cell, lsu_cell and an issue slot are all taken in the same cycle.
114 ;; lfs, lfsx, lfd, lfdx: 1 cycle
115 (define_insn_reservation "cell-fpload" 1
116 (and (eq_attr "type" "fpload")
117 (eq_attr "cpu" "cell"))
118 "vsu2_cell+lsu_cell+slot01")
;; Update-form floating-point loads (types "fpload_u", "fpload_ux");
;; default latency 1.  fxu_cell, vsu2_cell, lsu_cell and an issue slot are
;; all taken in the same cycle (fxu_cell is the extra unit compared with
;; "cell-fpload").
;; Plain "fpload" is deliberately not listed here: it is already claimed by
;; "cell-fpload", and which reservation is used when two conditions match
;; the same insn is undefined.
120 ;; lfsu, lfsux, lfdu, lfdux: 1 cycle (fpr), 2 cycles (gpr)
121 (define_insn_reservation "cell-fpload-update" 1
122 (and (eq_attr "type" "fpload_u,fpload_ux")
123 (eq_attr "cpu" "cell"))
124 "fxu_cell+vsu2_cell+lsu_cell+slot01")
;; Vector loads (type "vecload"); default latency 2.
;; Reservation: an issue slot, then vsu2_cell and lsu_cell together.
126 (define_insn_reservation "cell-vecload" 2
127 (and (eq_attr "type" "vecload")
128 (eq_attr "cpu" "cell"))
129 "slot01,vsu2_cell+lsu_cell")
;; Insns of type "store" when scheduling for cpu "cell"; default latency 1.
132 (define_insn_reservation "cell-store" 1
133 (and (eq_attr "type" "store")
134 (eq_attr "cpu" "cell"))
;; Update-form stores (types "store_ux", "store_u"); default latency 1.
;; fxu_cell, lsu_cell and an issue slot are all taken in the same cycle.
137 ;; stdux, stdu (hardware breaks into store and add): 2 for update reg
138 (define_insn_reservation "cell-store-update" 1
139 (and (eq_attr "type" "store_ux,store_u")
140 (eq_attr "cpu" "cell"))
141 "fxu_cell+lsu_cell+slot01")
;; Floating-point stores (type "fpstore"); default latency 1.
;; vsu2_cell, lsu_cell and an issue slot are all taken in the same cycle.
143 (define_insn_reservation "cell-fpstore" 1
144 (and (eq_attr "type" "fpstore")
145 (eq_attr "cpu" "cell"))
146 "vsu2_cell+lsu_cell+slot01")
;; Update-form floating-point stores (types "fpstore_ux", "fpstore_u");
;; default latency 1.  Same units as "cell-fpstore" plus fxu_cell, all in
;; one cycle.
148 (define_insn_reservation "cell-fpstore-update" 1
149 (and (eq_attr "type" "fpstore_ux,fpstore_u")
150 (eq_attr "cpu" "cell"))
151 "vsu2_cell+fxu_cell+lsu_cell+slot01")
;; Vector stores (type "vecstore"); default latency 1.
;; vsu2_cell, lsu_cell and an issue slot are all taken in the same cycle.
153 (define_insn_reservation "cell-vecstore" 1
154 (and (eq_attr "type" "vecstore")
155 (eq_attr "cpu" "cell"))
156 "vsu2_cell+lsu_cell+slot01")
;; Covers types integer, insert_dword, shift, trap, var_shift_rotate, cntlz
;; and exts; default latency 2.
158 ;; Integer latency is 2 cycles
159 (define_insn_reservation "cell-integer" 2
160 (and (eq_attr "type" "integer,insert_dword,shift,trap,\
161 var_shift_rotate,cntlz,exts")
162 (eq_attr "cpu" "cell"))
;; Type "two"; default latency 4.
;; Reservation: an issue slot, then fxu_cell for 3 consecutive cycles.
165 ;; Two integer latency is 4 cycles
166 (define_insn_reservation "cell-two" 4
167 (and (eq_attr "type" "two")
168 (eq_attr "cpu" "cell"))
169 "slot01,fxu_cell,fxu_cell*2")
;; Type "three"; default latency 6.
;; Reservation: an issue slot, then fxu_cell for 5 consecutive cycles.
171 ;; Three integer latency is 6 cycles
172 (define_insn_reservation "cell-three" 6
173 (and (eq_attr "type" "three")
174 (eq_attr "cpu" "cell"))
175 "slot01,fxu_cell,fxu_cell*4")
;; Insns of type "insert_word"; default latency 2.
178 (define_insn_reservation "cell-insert" 2
179 (and (eq_attr "type" "insert_word")
180 (eq_attr "cpu" "cell"))
;; Insns of type "cmp"; default latency 1.
183 ;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0
184 (define_insn_reservation "cell-cmp" 1
185 (and (eq_attr "type" "cmp")
186 (eq_attr "cpu" "cell"))
;; Types fast_compare, delayed_compare, compare and var_delayed_compare,
;; restricted to insns whose "cell_micro" attribute is "not"; default
;; latency 2.  The "always" case is handled by "cell-cmp-microcoded".
189 ;; add, addo, sub, subo, alter cr0, rldcli, rlwinm
190 (define_insn_reservation "cell-fast-cmp" 2
191 (and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\
192 var_delayed_compare")
193 (eq_attr "cpu" "cell"))
194 (eq_attr "cell_micro" "not"))
;; Same compare types as "cell-fast-cmp", but for insns whose "cell_micro"
;; attribute is "always"; default latency 9.
;; Reservation: both issue slots in the first cycle (so nothing is issued
;; alongside it), then fxu_cell for 8 consecutive cycles.
197 (define_insn_reservation "cell-cmp-microcoded" 9
198 (and (and (eq_attr "type" "fast_compare,delayed_compare,compare,\
199 var_delayed_compare")
200 (eq_attr "cpu" "cell"))
201 (eq_attr "cell_micro" "always"))
202 "slot0+slot1,fxu_cell,fxu_cell*7")
;; Type "lmul"; default latency 15.
;; Issued in slot1 only, then the "nonpipeline" unit set (fxu, lsu, vsu1,
;; vsu2) is held for 14 consecutive cycles.
205 (define_insn_reservation "cell-lmul" 15
206 (and (eq_attr "type" "lmul")
207 (eq_attr "cpu" "cell"))
208 "slot1,nonpipeline,nonpipeline*13")
;; Type "lmul_compare"; default latency 22.
;; Takes both issue slots in the first cycle, then holds the "nonpipeline"
;; unit set for 21 consecutive cycles.
210 ;; mulld. is microcoded
211 (define_insn_reservation "cell-lmul-cmp" 22
212 (and (eq_attr "type" "lmul_compare")
213 (eq_attr "cpu" "cell"))
214 "slot0+slot1,nonpipeline,nonpipeline*20")
;; Types "imul2" and "imul3"; default latency 6.
;; Issued in slot1 only, then the "nonpipeline" unit set is held for 5
;; consecutive cycles.
217 (define_insn_reservation "cell-imul23" 6
218 (and (eq_attr "type" "imul2,imul3")
219 (eq_attr "cpu" "cell"))
220 "slot1,nonpipeline,nonpipeline*4")
;; Type "imul"; default latency 9.
;; Issued in slot1 only, then the "nonpipeline" unit set is held for 8
;; consecutive cycles.
223 (define_insn_reservation "cell-imul" 9
224 (and (eq_attr "type" "imul")
225 (eq_attr "cpu" "cell"))
226 "slot1,nonpipeline,nonpipeline*7")
;; Type "idiv"; default latency 32.
;; Issued in slot1 only, then the "nonpipeline" unit set is held for 31
;; consecutive cycles.
229 (define_insn_reservation "cell-idiv" 32
230 (and (eq_attr "type" "idiv")
231 (eq_attr "cpu" "cell"))
232 "slot1,nonpipeline,nonpipeline*30")
;; Type "ldiv"; default latency 64.
;; Issued in slot1 only, then the "nonpipeline" unit set is held for 63
;; consecutive cycles.
234 (define_insn_reservation "cell-ldiv" 64
235 (and (eq_attr "type" "ldiv")
236 (eq_attr "cpu" "cell"))
237 "slot1,nonpipeline,nonpipeline*62")
;; Insns of type "mfjmpr"; default latency 1.
239 ;; mflr and mfctr are pipelined
240 (define_insn_reservation "cell-mfjmpr" 1
241 (and (eq_attr "type" "mfjmpr")
242 (eq_attr "cpu" "cell"))
;; Insns of type "mtjmpr"; default latency 1.  A longer latency towards
;; "cell-branchreg" consumers is set by a define_bypass later in the file.
246 ;; mtspr is fully pipelined
247 (define_insn_reservation "cell-mtjmpr" 1
248 (and (eq_attr "type" "mtjmpr")
249 (eq_attr "cpu" "cell"))
;; Insns of type "branch"; default latency 1.
253 ;; b, ba, bl, bla: unconditional branch always predicts correctly, n/a latency
254 ;; bcctr, bcctrl, latency 2, actually adjust by be to 4
255 (define_insn_reservation "cell-branch" 1
256 (and (eq_attr "type" "branch")
257 (eq_attr "cpu" "cell"))
;; Insns of type "jmpreg"; default latency 1.
260 (define_insn_reservation "cell-branchreg" 1
261 (and (eq_attr "type" "jmpreg")
262 (eq_attr "cpu" "cell"))
;; Types "cr_logical" and "delayed_cr"; default latency 1.
266 ;; page 90, special cases for CR hazard: only one instr can access cr per cycle
267 ;; if an insn reads CR following a stwcx, the pipeline stalls till stwcx finishes
268 (define_insn_reservation "cell-crlogical" 1
269 (and (eq_attr "type" "cr_logical,delayed_cr")
270 (eq_attr "cpu" "cell"))
;; Types "mfcrf" and "mfcr"; default latency 34.
;; Issued in slot1 only, then the "nonpipeline" unit set is held for 33
;; consecutive cycles.
273 ;; mfcrf and mfcr take about 34 cycles and are nonpipelined
274 (define_insn_reservation "cell-mfcr" 34
275 (and (eq_attr "type" "mfcrf,mfcr")
276 (eq_attr "cpu" "cell"))
277 "slot1,nonpipeline,nonpipeline*32")
;; Insns of type "mtcr"; default latency 1.
280 (define_insn_reservation "cell-mtcrf" 1
281 (and (eq_attr "type" "mtcr")
282 (eq_attr "cpu" "cell"))
;; Types "fp" and "dmul"; default latency 10.
;; Reservation: an issue slot, then vsu1_cell for 9 consecutive cycles.
;; NOTE(review): holding vsu1_cell for 9 cycles does not look like the
;; 1/cycle throughput stated below -- confirm which one is intended.
285 ; Basic FP latency is 10 cycles, throughput is 1/cycle
286 (define_insn_reservation "cell-fp" 10
287 (and (eq_attr "type" "fp,dmul")
288 (eq_attr "cpu" "cell"))
289 "slot01,vsu1_cell,vsu1_cell*8")
;; Insns of type "fpcompare"; default latency 1.  A longer latency towards
;; CR consumers is set by a define_bypass later in the file.
291 (define_insn_reservation "cell-fpcompare" 1
292 (and (eq_attr "type" "fpcompare")
293 (eq_attr "cpu" "cell"))
;; Types "sdiv" and "ddiv"; default latency 74.
;; Issued in slot1 only, then the "nonpipeline" unit set (which also
;; includes fxu_cell and lsu_cell) is held for 73 consecutive cycles.
296 ;; sdiv throughput 1/74, not pipelined but only in the FPU
297 (define_insn_reservation "cell-sdiv" 74
298 (and (eq_attr "type" "sdiv,ddiv")
299 (eq_attr "cpu" "cell"))
300 "slot1,nonpipeline,nonpipeline*72")
;; Types "ssqrt" and "dsqrt"; default latency 84.
;; Issued in slot1 only, then the "nonpipeline" unit set (which also
;; includes fxu_cell and lsu_cell) is held for 83 consecutive cycles.
302 ;; fsqrt throughput 1/84, not pipelined but only in the FPU
303 (define_insn_reservation "cell-sqrt" 84
304 (and (eq_attr "type" "ssqrt,dsqrt")
305 (eq_attr "cpu" "cell"))
306 "slot1,nonpipeline,nonpipeline*82")
;; Type "vecsimple"; default latency 4.
;; Reservation: an issue slot, then vsu1_cell for 3 consecutive cycles.
309 (define_insn_reservation "cell-vecsimple" 4
310 (and (eq_attr "type" "vecsimple")
311 (eq_attr "cpu" "cell"))
312 "slot01,vsu1_cell,vsu1_cell*2")
;; Type "veccomplex"; default latency 10.
;; Reservation: an issue slot, then vsu1_cell for 9 consecutive cycles.
315 (define_insn_reservation "cell-veccomplex" 10
316 (and (eq_attr "type" "veccomplex")
317 (eq_attr "cpu" "cell"))
318 "slot01,vsu1_cell,vsu1_cell*8")
;; Type "veccmp"; default latency 4.
;; Reservation: an issue slot, then vsu1_cell for 3 consecutive cycles.
320 ;; TODO: add support for recording instructions
321 (define_insn_reservation "cell-veccmp" 4
322 (and (eq_attr "type" "veccmp")
323 (eq_attr "cpu" "cell"))
324 "slot01,vsu1_cell,vsu1_cell*2")
;; Type "vecfloat"; default latency 12.
;; Reservation: an issue slot, then vsu1_cell for 11 consecutive cycles.
326 (define_insn_reservation "cell-vecfloat" 12
327 (and (eq_attr "type" "vecfloat")
328 (eq_attr "cpu" "cell"))
329 "slot01,vsu1_cell,vsu1_cell*10")
;; Type "vecperm"; default latency 4.
;; Reservation: an issue slot, then vsu2_cell (not vsu1_cell) for 3
;; consecutive cycles.
331 (define_insn_reservation "cell-vecperm" 4
332 (and (eq_attr "type" "vecperm")
333 (eq_attr "cpu" "cell"))
334 "slot01,vsu2_cell,vsu2_cell*2")
336 ;; New for 4.2, syncs
;; Type "sync"; default latency 11.
;; Reservation: an issue slot, then lsu_cell for 10 consecutive cycles.
338 (define_insn_reservation "cell-sync" 11
339 (and (eq_attr "type" "sync")
340 (eq_attr "cpu" "cell"))
341 "slot01,lsu_cell,lsu_cell*9")
;; Type "isync"; default latency 11.  Same reservation as "cell-sync":
;; an issue slot, then lsu_cell for 10 consecutive cycles.
343 (define_insn_reservation "cell-isync" 11
344 (and (eq_attr "type" "isync")
345 (eq_attr "cpu" "cell"))
346 "slot01,lsu_cell,lsu_cell*9")
;; Type "load_l"; default latency 11.
;; Reservation: an issue slot, then lsu_cell for 10 consecutive cycles.
348 (define_insn_reservation "cell-load_l" 11
349 (and (eq_attr "type" "load_l")
350 (eq_attr "cpu" "cell"))
351 "slot01,lsu_cell,lsu_cell*9")
;; Type "store_c"; default latency 11.
;; Reservation: an issue slot, then lsu_cell for 10 consecutive cycles.
353 (define_insn_reservation "cell-store_c" 11
354 (and (eq_attr "type" "store_c")
355 (eq_attr "cpu" "cell"))
356 "slot01,lsu_cell,lsu_cell*9")
358 ;; RAW register dependency
;; Each define_bypass overrides the producer's default latency with the
;; given value when the result feeds one of the listed consumers.
362 ;; there is a 5 cycle delay for r3 bypassing
363 ;; there is a 5 cycle delay for a dependent load after a load
;; "cell-integer" result feeding a "cell-load" or "cell-load-ext": 5 cycles.
364 (define_bypass 5 "cell-integer" "cell-load")
365 (define_bypass 5 "cell-integer" "cell-load-ext")
;; Load feeding a dependent load (either kind to either kind): 5 cycles.
366 (define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext")
;; Raises the latency from "cell-fpcompare" to branch, branch-to-register,
;; mfcr and cr-logical consumers from the default 1 to 6 cycles.
368 ;; there is a 6 cycle delay after a fp compare until you can use the cr.
369 (define_bypass 6 "cell-fpcompare" "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical")
;; vecfloat feeding vecfloat: 11 cycles instead of the default 12.
372 (define_bypass 11 "cell-vecfloat" "cell-vecfloat")
;; veccomplex feeding vecsimple: 6 cycles instead of the default 10.
375 (define_bypass 6 "cell-veccomplex" "cell-vecsimple")
;; NOTE(review): the disabled bypass below names "cell-veccompare", but the
;; reservation defined in this file is "cell-veccmp" -- fix the name before
;; re-enabling it.
376 ;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg")
;; vecfloat feeding veccomplex: 3 cycles instead of the default 12.
377 (define_bypass 3 "cell-vecfloat" "cell-veccomplex")
378 ; this is not correct,
379 ;; this is a stall in general and not dependent on result
380 (define_bypass 13 "cell-vecstore" "cell-fpstore")
381 ; this is not correct, this can never be true, not dependent on result
382 (define_bypass 7 "cell-fp" "cell-fpload")
383 ;; vsu1 should avoid writing to the same target register as vsu2 insn
388 ;; the target of VSU estimate should not be reused within 10 dispatch groups
389 ;; the target of VSU float should not be reused within 8 dispatch groups
390 ;; the target of VSU complex should not be reused within 5 dispatch groups
391 ;; FP LOAD should not reuse an FPU Arithmetic target within 6 dispatch groups
;; Raises the latency from "cell-mtjmpr" to a dependent "cell-branchreg"
;; from the default 1 to 10 cycles.
393 ;; mtctr-bcctr/bcctrl, branch target ctr register shadow update at
394 ;; ex4 stage(10 cycles)
395 (define_bypass 10 "cell-mtjmpr" "cell-branchreg")
397 ;; Things that are not simulated:
398 ;; update instructions: the updated address gpr is not simulated
399 ;; vrefp, vrsqrtefp have latency(14), currently simulated as 12 cycle float