1 ;; Scheduling description for cell processor.
2 ;; Copyright (C) 2001-2014 Free Software Foundation, Inc.
3 ;; Contributed by Sony Computer Entertainment, Inc.,
6 ;; This file is free software; you can redistribute it and/or modify it under
7 ;; the terms of the GNU General Public License as published by the Free
8 ;; Software Foundation; either version 3 of the License, or (at your option)
11 ;; This file is distributed in the hope that it will be useful, but WITHOUT
12 ;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 ;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 ;; You should have received a copy of the GNU General Public License
17 ;; along with GCC; see the file COPYING3. If not see
18 ;; <http://www.gnu.org/licenses/>.
20 ;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf)
22 ;; BE Architecture *DD3.0 and DD3.1*
23 ;; This file simulates the PPU processor unit backend of the pipeline, manual P24.
24 ;; manual P27, stall and flush points
25 ;; IU, XU, VSU, dispatcher decodes and dispatch 2 insns per cycle in program
26 ;; order, the grouped address are aligned by 8
27 ;; This file only simulates the single-thread situation
28 ;; XU executes all fixed point insns(3 units, a simple alu, a complex unit,
29 ;; and load/store unit)
30 ;; VSU executes all scalar floating points insn(a float unit),
31 ;; VMX insns(VMX unit, 4 sub units, simple, permute, complex, floating point)
33 ;; Dual issue combination
36 ;; (sx,cx,vsu_fp,fp_arith) (perm,vsu_ls,fp_ls)
40 ;;VMX(sx,cx,vsu_fp,fp_arith) X
41 ;;VMX(perm,vsu_ls, fp_ls) X
42 ;; X are illegal combination.
44 ;; Dual issue exceptions:
45 ;;(1) non-pipelined FXU instr in slot 0
46 ;;(2) non-pipelined FPU inst in slot 0
47 ;; CSI instr(context-synchronizing insn)
50 ;; BRU unit: bru(none register stall), bru_cr(cr register stall)
51 ;; VSU unit: vus(vmx simple), vup(vmx permute), vuc(vmx complex),
52 ;; vuf(vmx float), fpu(floats). fpu_div is hypothetical, it is for
53 ;; nonpipelined simulation
54 ;; micro insns will stall at least 7 cycles to get the first instr from ROM,
55 ;; micro instructions are not dual issued.
57 ;; slot0 is older than slot1
58 ;; non-pipelined insn need to be in slot1 to avoid 1cycle stall
60 ;; There are different stall points
61 ;; IB2, only stall one thread if stall here, so try to stall here as much as
63 ;; condition(1) insert nop, OR and ORI instruction form
64 ;; condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or
65 ;; CR0-access while stdcx, or stwcx
66 ;; IS2 stall ;; Page91 for details
68 ;; IS2 stall can be activated by VQ8 stall and trying to issue a vsu instr to
69 ;; the vsu issue queue
71 ;;(define_automaton "cellxu")
73 ;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu")
;; Pipeline model declarations.
;; Four automata are declared; each functional unit below is assigned to one
;; of them.
76 (define_automaton "cellxu,cellvsu,cellbru,cell_mis")
;; Units of automaton "cellxu": fxu_cell and lsu_cell.
78 (define_cpu_unit "fxu_cell,lsu_cell" "cellxu")
;; Unit of automaton "cellbru": bru_cell.
79 (define_cpu_unit "bru_cell" "cellbru")
;; Units of automaton "cellvsu": vsu1_cell and vsu2_cell.
80 (define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu")
;; The two issue slots, modelled as units of automaton "cell_mis".
82 (define_cpu_unit "slot0,slot1" "cell_mis")
;; slot0 may only be reserved in a cycle in which slot1 is not reserved.
84 (absence_set "slot0" "slot1")
;; "nonpipeline" holds fxu_cell, lsu_cell, vsu1_cell and vsu2_cell at the
;; same time (bru_cell is not part of it).
86 (define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell")
;; "slot01" stands for either issue slot.
87 (define_reservation "slot01" "slot0|slot1")
91 ;; lmw, lswi, lswx are only generated when optimizing for space, MC,
92 ;; these instructions are not simulated
;; Plain integer load: type "load" without sign extension and without
;; update, on cpu "cell".  Default latency is 2 cycles.
;; NOTE(review): this copy of the form ends after the condition -- the
;; reservation string and the closing parenthesis are missing.  Restore them
;; from the upstream file before use.
93 (define_insn_reservation "cell-load" 2
94 (and (eq_attr "type" "load")
95 (eq_attr "sign_extend" "no")
96 (eq_attr "update" "no")
97 (eq_attr "cpu" "cell"))
100 ;; ldux, ldu, lbzux, lbzu, hardware breaks it down to two instrs,
101 ;; if with 32bytes alignment, CMC
;; Integer load with update and no sign extension, on cpu "cell".
;; Default latency is 2 cycles.  Reservation: either issue slot in the first
;; cycle, then fxu_cell and lsu_cell together in the next cycle.
102 (define_insn_reservation "cell-load-ux" 2
103 (and (eq_attr "type" "load")
104 (eq_attr "sign_extend" "no")
105 (eq_attr "update" "yes")
106 (eq_attr "cpu" "cell"))
107 "slot01,fxu_cell+lsu_cell")
109 ;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux, MC, latency unknown
;; Sign-extending integer load on cpu "cell"; the "update" attribute is not
;; tested, so both update and non-update forms match.  Modelled with a
;; default latency of 2 cycles.  Reservation: either issue slot in the first
;; cycle, then fxu_cell and lsu_cell together in the next cycle.
111 (define_insn_reservation "cell-load-ext" 2
112 (and (eq_attr "type" "load")
113 (eq_attr "sign_extend" "yes")
114 (eq_attr "cpu" "cell"))
115 "slot01,fxu_cell+lsu_cell")
117 ;; lfs, lfsx, lfd, lfdx, 1 cycle
;; Floating-point load without update on cpu "cell".  Default latency is
;; 1 cycle.  Reservation: vsu2_cell, lsu_cell and one issue slot, all in the
;; same cycle.
118 (define_insn_reservation "cell-fpload" 1
119 (and (eq_attr "type" "fpload")
120 (eq_attr "update" "no")
121 (eq_attr "cpu" "cell"))
122 "vsu2_cell+lsu_cell+slot01")
124 ;; lfsu, lfsux, lfdu, lfdux: 1 cycle (fpr), 2 cycles (gpr)
;; Floating-point load with update on cpu "cell".  The default latency is
;; 1 cycle; the 2-cycle figure quoted above for the gpr is not modelled by
;; this form.  Reservation: fxu_cell, vsu2_cell, lsu_cell and one issue slot,
;; all in the same cycle.
125 (define_insn_reservation "cell-fpload-update" 1
126 (and (eq_attr "type" "fpload")
127 (eq_attr "update" "yes")
128 (eq_attr "cpu" "cell"))
129 "fxu_cell+vsu2_cell+lsu_cell+slot01")
;; Vector load (type "vecload") on cpu "cell".  Default latency is 2 cycles.
;; Reservation: either issue slot in the first cycle, then vsu2_cell and
;; lsu_cell together in the next cycle.
131 (define_insn_reservation "cell-vecload" 2
132 (and (eq_attr "type" "vecload")
133 (eq_attr "cpu" "cell"))
134 "slot01,vsu2_cell+lsu_cell")
;; Integer store without update (type "store", update "no") on cpu "cell".
;; Default latency is 1 cycle.
;; NOTE(review): this copy of the form ends after the condition -- the
;; reservation string and the closing parenthesis are missing.  Restore them
;; from the upstream file before use.
137 (define_insn_reservation "cell-store" 1
138 (and (eq_attr "type" "store")
139 (eq_attr "update" "no")
140 (eq_attr "cpu" "cell"))
143 ;; stdux, stdu, (hardware breaks into store and add) 2 for update reg
;; Integer store with update on cpu "cell".  Default latency is 1 cycle.
;; Reservation: fxu_cell, lsu_cell and one issue slot, all in the same cycle.
144 (define_insn_reservation "cell-store-update" 1
145 (and (eq_attr "type" "store")
146 (eq_attr "update" "yes")
147 (eq_attr "cpu" "cell"))
148 "fxu_cell+lsu_cell+slot01")
;; Floating-point store without update on cpu "cell".  Default latency is
;; 1 cycle.  Reservation: vsu2_cell, lsu_cell and one issue slot, all in the
;; same cycle.
150 (define_insn_reservation "cell-fpstore" 1
151 (and (eq_attr "type" "fpstore")
152 (eq_attr "update" "no")
153 (eq_attr "cpu" "cell"))
154 "vsu2_cell+lsu_cell+slot01")
;; Floating-point store with update on cpu "cell".  Default latency is
;; 1 cycle.  Reservation: vsu2_cell, fxu_cell, lsu_cell and one issue slot,
;; all in the same cycle.
156 (define_insn_reservation "cell-fpstore-update" 1
157 (and (eq_attr "type" "fpstore")
158 (eq_attr "update" "yes")
159 (eq_attr "cpu" "cell"))
160 "vsu2_cell+fxu_cell+lsu_cell+slot01")
;; Vector store (type "vecstore") on cpu "cell".  Default latency is
;; 1 cycle.  Reservation: vsu2_cell, lsu_cell and one issue slot, all in the
;; same cycle.
162 (define_insn_reservation "cell-vecstore" 1
163 (and (eq_attr "type" "vecstore")
164 (eq_attr "cpu" "cell"))
165 "vsu2_cell+lsu_cell+slot01")
167 ;; Integer latency is 2 cycles
;; Matches, on cpu "cell", any of:
;;   - type integer, trap, cntlz, exts or isel;
;;   - type add, logical or shift with dot "no";
;;   - type insert with size "64".
;; NOTE(review): this copy of the form ends after the condition -- the
;; reservation string and the closing parenthesis are missing.  Restore them
;; from the upstream file before use.
168 (define_insn_reservation "cell-integer" 2
169 (and (ior (eq_attr "type" "integer,trap,cntlz,exts,isel")
170 (and (eq_attr "type" "add,logical,shift")
171 (eq_attr "dot" "no"))
172 (and (eq_attr "type" "insert")
173 (eq_attr "size" "64")))
174 (eq_attr "cpu" "cell"))
177 ;; Two integer latency is 4 cycles
;; Type "two" on cpu "cell".  Reservation: either issue slot in the first
;; cycle, then fxu_cell for 1 + 2 consecutive cycles.
178 (define_insn_reservation "cell-two" 4
179 (and (eq_attr "type" "two")
180 (eq_attr "cpu" "cell"))
181 "slot01,fxu_cell,fxu_cell*2")
183 ;; Three integer latency is 6 cycles
;; Type "three" on cpu "cell".  Reservation: either issue slot in the first
;; cycle, then fxu_cell for 1 + 4 consecutive cycles.
184 (define_insn_reservation "cell-three" 6
185 (and (eq_attr "type" "three")
186 (eq_attr "cpu" "cell"))
187 "slot01,fxu_cell,fxu_cell*4")
;; Type "insert" with size "32" on cpu "cell" (the size "64" case is matched
;; by "cell-integer").  Default latency is 2 cycles.
;; NOTE(review): this copy of the form ends after the condition -- the
;; reservation string and the closing parenthesis are missing.  Restore them
;; from the upstream file before use.
190 (define_insn_reservation "cell-insert" 2
191 (and (eq_attr "type" "insert")
192 (eq_attr "size" "32")
193 (eq_attr "cpu" "cell"))
196 ;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0
;; Type "cmp" on cpu "cell".  Default latency is 1 cycle.
;; NOTE(review): this copy of the form ends after the condition -- the
;; reservation string and the closing parenthesis are missing.  Restore them
;; from the upstream file before use.
197 (define_insn_reservation "cell-cmp" 1
198 (and (eq_attr "type" "cmp")
199 (eq_attr "cpu" "cell"))
202 ;; add, addo, sub, subo, alter cr0, rldcli, rlwinm
;; Matches, on cpu "cell" with cell_micro "not", either type "compare" or
;; type add/logical/shift with dot "yes".  Default latency is 2 cycles.
;; NOTE(review): this copy of the form ends after the condition -- the
;; reservation string and the closing parenthesis are missing.  Restore them
;; from the upstream file before use.
203 (define_insn_reservation "cell-fast-cmp" 2
204 (and (ior (eq_attr "type" "compare")
205 (and (eq_attr "type" "add,logical,shift")
206 (eq_attr "dot" "yes")))
207 (eq_attr "cpu" "cell")
208 (eq_attr "cell_micro" "not"))
;; Same insn selection as "cell-fast-cmp" (type "compare", or
;; add/logical/shift with dot "yes"), but for cell_micro "always".
;; Default latency is 9 cycles.  Reservation: both issue slots in the first
;; cycle, then fxu_cell for 1 + 7 consecutive cycles.
;; NOTE(review): an insn of these types whose cell_micro value is neither
;; "not" nor "always" is matched by neither this form nor "cell-fast-cmp" --
;; confirm that this is intended.
211 (define_insn_reservation "cell-cmp-microcoded" 9
212 (and (ior (eq_attr "type" "compare")
213 (and (eq_attr "type" "add,logical,shift")
214 (eq_attr "dot" "yes")))
215 (eq_attr "cpu" "cell")
216 (eq_attr "cell_micro" "always"))
217 "slot0+slot1,fxu_cell,fxu_cell*7")
;; 64-bit multiply (type "mul", size "64") on cpu "cell".  Default latency is
;; 15 cycles.  Reservation: slot1 in the first cycle, then the "nonpipeline"
;; reservation for 1 + 13 consecutive cycles.
;; NOTE(review): as written, this condition does not test "dot", so it also
;; matches the insns selected by "cell-lmul-cmp" (dot "yes").  Confirm
;; whether a (eq_attr "dot" "no") test is missing from this copy.
220 (define_insn_reservation "cell-lmul" 15
221 (and (eq_attr "type" "mul")
223 (eq_attr "size" "64")
224 (eq_attr "cpu" "cell"))
225 "slot1,nonpipeline,nonpipeline*13")
227 ;; mulld. is microcoded
;; 64-bit multiply with dot "yes" on cpu "cell".  Default latency is
;; 22 cycles.  Reservation: both issue slots in the first cycle, then the
;; "nonpipeline" reservation for 1 + 20 consecutive cycles.
228 (define_insn_reservation "cell-lmul-cmp" 22
229 (and (eq_attr "type" "mul")
230 (eq_attr "dot" "yes")
231 (eq_attr "size" "64")
232 (eq_attr "cpu" "cell"))
233 "slot0+slot1,nonpipeline,nonpipeline*20")
;; Multiply with size "8" or "16" on cpu "cell".  Default latency is
;; 6 cycles.  Reservation: slot1 in the first cycle, then the "nonpipeline"
;; reservation for 1 + 4 consecutive cycles.
236 (define_insn_reservation "cell-imul23" 6
237 (and (eq_attr "type" "mul")
238 (eq_attr "size" "8,16")
239 (eq_attr "cpu" "cell"))
240 "slot1,nonpipeline,nonpipeline*4")
;; Multiply with size "32" on cpu "cell".  Default latency is 9 cycles.
;; Reservation: slot1 in the first cycle, then the "nonpipeline" reservation
;; for 1 + 7 consecutive cycles.
243 (define_insn_reservation "cell-imul" 9
244 (and (eq_attr "type" "mul")
246 (eq_attr "size" "32")
247 (eq_attr "cpu" "cell"))
248 "slot1,nonpipeline,nonpipeline*7")
;; Divide with size "32" on cpu "cell".  Default latency is 32 cycles.
;; Reservation: slot1 in the first cycle, then the "nonpipeline" reservation
;; for 1 + 30 consecutive cycles.
251 (define_insn_reservation "cell-idiv" 32
252 (and (eq_attr "type" "div")
253 (eq_attr "size" "32")
254 (eq_attr "cpu" "cell"))
255 "slot1,nonpipeline,nonpipeline*30")
;; Divide with size "64" on cpu "cell".  Default latency is 64 cycles.
;; Reservation: slot1 in the first cycle, then the "nonpipeline" reservation
;; for 1 + 62 consecutive cycles.
257 (define_insn_reservation "cell-ldiv" 64
258 (and (eq_attr "type" "div")
259 (eq_attr "size" "64")
260 (eq_attr "cpu" "cell"))
261 "slot1,nonpipeline,nonpipeline*62")
263 ;; mflr and mfctr are pipelined
;; Type "mfjmpr" on cpu "cell".  Default latency is 1 cycle.
;; NOTE(review): this copy of the form ends after the condition -- the
;; reservation string and the closing parenthesis are missing.  Restore them
;; from the upstream file before use.
264 (define_insn_reservation "cell-mfjmpr" 1
265 (and (eq_attr "type" "mfjmpr")
266 (eq_attr "cpu" "cell"))
270 ;; mtspr fully pipelined
;; Type "mtjmpr" on cpu "cell".  Default latency is 1 cycle.
;; NOTE(review): this copy of the form ends after the condition -- the
;; reservation string and the closing parenthesis are missing.  Restore them
;; from the upstream file before use.
271 (define_insn_reservation "cell-mtjmpr" 1
272 (and (eq_attr "type" "mtjmpr")
273 (eq_attr "cpu" "cell"))
277 ;; b, ba, bl, bla, unconditional branch always predicts correctly n/a latency
278 ;; bcctr, bcctrl, latency 2, actually adjust by be to 4
;; Type "branch" on cpu "cell".  Modelled with a default latency of 1 cycle.
;; NOTE(review): this copy of the form ends after the condition -- the
;; reservation string and the closing parenthesis are missing.  Restore them
;; from the upstream file before use.
279 (define_insn_reservation "cell-branch" 1
280 (and (eq_attr "type" "branch")
281 (eq_attr "cpu" "cell"))
;; Type "jmpreg" on cpu "cell".  Default latency is 1 cycle.
;; NOTE(review): this copy of the form ends after the condition -- the
;; reservation string and the closing parenthesis are missing.  Restore them
;; from the upstream file before use.
284 (define_insn_reservation "cell-branchreg" 1
285 (and (eq_attr "type" "jmpreg")
286 (eq_attr "cpu" "cell"))
290 ;; page 90, special cases for CR hazard, only one instr can access cr per cycle
291 ;; if insn reads CR following a stwcx, pipeline stall till stwcx finish
;; Types "cr_logical" and "delayed_cr" on cpu "cell".  Default latency is
;; 1 cycle.
;; NOTE(review): this copy of the form ends after the condition -- the
;; reservation string and the closing parenthesis are missing.  Restore them
;; from the upstream file before use.
292 (define_insn_reservation "cell-crlogical" 1
293 (and (eq_attr "type" "cr_logical,delayed_cr")
294 (eq_attr "cpu" "cell"))
297 ;; mfcrf and mfcr take about 34 cycles and are nonpipelined
;; Types "mfcrf" and "mfcr" on cpu "cell".  Default latency is 34 cycles.
;; Reservation: slot1 in the first cycle, then the "nonpipeline" reservation
;; for 1 + 32 consecutive cycles.
298 (define_insn_reservation "cell-mfcr" 34
299 (and (eq_attr "type" "mfcrf,mfcr")
300 (eq_attr "cpu" "cell"))
301 "slot1,nonpipeline,nonpipeline*32")
;; Type "mtcr" on cpu "cell".  Default latency is 1 cycle.
;; NOTE(review): this copy of the form ends after the condition -- the
;; reservation string and the closing parenthesis are missing.  Restore them
;; from the upstream file before use.
304 (define_insn_reservation "cell-mtcrf" 1
305 (and (eq_attr "type" "mtcr")
306 (eq_attr "cpu" "cell"))
309 ; Basic FP latency is 10 cycles, throughput is 1/cycle
;; Types "fp" and "dmul" on cpu "cell".  Reservation: either issue slot in
;; the first cycle, then vsu1_cell for 1 + 8 consecutive cycles.
310 (define_insn_reservation "cell-fp" 10
311 (and (eq_attr "type" "fp,dmul")
312 (eq_attr "cpu" "cell"))
313 "slot01,vsu1_cell,vsu1_cell*8")
;; Type "fpcompare" on cpu "cell".  Default latency is 1 cycle; consumers of
;; the result listed in the define_bypass for "cell-fpcompare" use the longer
;; latency given there instead.
;; NOTE(review): this copy of the form ends after the condition -- the
;; reservation string and the closing parenthesis are missing.  Restore them
;; from the upstream file before use.
315 (define_insn_reservation "cell-fpcompare" 1
316 (and (eq_attr "type" "fpcompare")
317 (eq_attr "cpu" "cell"))
320 ;; sdiv throughput 1/74, not pipelined but only in the FPU
;; Types "sdiv" and "ddiv" on cpu "cell".  Default latency is 74 cycles.
;; Reservation: slot1 in the first cycle, then the "nonpipeline" reservation
;; for 1 + 72 consecutive cycles.
;; NOTE(review): the comment above says only the FPU is non-pipelined, but
;; "nonpipeline" holds fxu_cell, lsu_cell, vsu1_cell and vsu2_cell -- confirm
;; that blocking all four units is the intended approximation.
321 (define_insn_reservation "cell-sdiv" 74
322 (and (eq_attr "type" "sdiv,ddiv")
323 (eq_attr "cpu" "cell"))
324 "slot1,nonpipeline,nonpipeline*72")
326 ;; fsqrt throughput 1/84, not pipelined but only in the FPU
;; Types "ssqrt" and "dsqrt" on cpu "cell".  Default latency is 84 cycles.
;; Reservation: slot1 in the first cycle, then the "nonpipeline" reservation
;; for 1 + 82 consecutive cycles.
;; NOTE(review): the comment above says only the FPU is non-pipelined, but
;; "nonpipeline" holds fxu_cell, lsu_cell, vsu1_cell and vsu2_cell -- confirm
;; that blocking all four units is the intended approximation.
327 (define_insn_reservation "cell-sqrt" 84
328 (and (eq_attr "type" "ssqrt,dsqrt")
329 (eq_attr "cpu" "cell"))
330 "slot1,nonpipeline,nonpipeline*82")
;; Type "vecsimple" on cpu "cell".  Default latency is 4 cycles.
;; Reservation: either issue slot in the first cycle, then vsu1_cell for
;; 1 + 2 consecutive cycles.
333 (define_insn_reservation "cell-vecsimple" 4
334 (and (eq_attr "type" "vecsimple")
335 (eq_attr "cpu" "cell"))
336 "slot01,vsu1_cell,vsu1_cell*2")
;; Type "veccomplex" on cpu "cell".  Default latency is 10 cycles.
;; Reservation: either issue slot in the first cycle, then vsu1_cell for
;; 1 + 8 consecutive cycles.
339 (define_insn_reservation "cell-veccomplex" 10
340 (and (eq_attr "type" "veccomplex")
341 (eq_attr "cpu" "cell"))
342 "slot01,vsu1_cell,vsu1_cell*8")
344 ;; TODO: add support for recording instructions
;; Type "veccmp" on cpu "cell".  Default latency is 4 cycles.
;; Reservation: either issue slot in the first cycle, then vsu1_cell for
;; 1 + 2 consecutive cycles.
345 (define_insn_reservation "cell-veccmp" 4
346 (and (eq_attr "type" "veccmp")
347 (eq_attr "cpu" "cell"))
348 "slot01,vsu1_cell,vsu1_cell*2")
;; Type "vecfloat" on cpu "cell".  Default latency is 12 cycles.
;; Reservation: either issue slot in the first cycle, then vsu1_cell for
;; 1 + 10 consecutive cycles.
350 (define_insn_reservation "cell-vecfloat" 12
351 (and (eq_attr "type" "vecfloat")
352 (eq_attr "cpu" "cell"))
353 "slot01,vsu1_cell,vsu1_cell*10")
;; Type "vecperm" on cpu "cell".  Default latency is 4 cycles.
;; Reservation: either issue slot in the first cycle, then vsu2_cell (not
;; vsu1_cell) for 1 + 2 consecutive cycles.
355 (define_insn_reservation "cell-vecperm" 4
356 (and (eq_attr "type" "vecperm")
357 (eq_attr "cpu" "cell"))
358 "slot01,vsu2_cell,vsu2_cell*2")
360 ;; New for 4.2, syncs
;; Type "sync" on cpu "cell".  Default latency is 11 cycles.
;; Reservation: either issue slot in the first cycle, then lsu_cell for
;; 1 + 9 consecutive cycles.
362 (define_insn_reservation "cell-sync" 11
363 (and (eq_attr "type" "sync")
364 (eq_attr "cpu" "cell"))
365 "slot01,lsu_cell,lsu_cell*9")
;; Type "isync" on cpu "cell".  Default latency is 11 cycles.
;; Reservation: either issue slot in the first cycle, then lsu_cell for
;; 1 + 9 consecutive cycles.
367 (define_insn_reservation "cell-isync" 11
368 (and (eq_attr "type" "isync")
369 (eq_attr "cpu" "cell"))
370 "slot01,lsu_cell,lsu_cell*9")
;; Type "load_l" on cpu "cell".  Default latency is 11 cycles.
;; Reservation: either issue slot in the first cycle, then lsu_cell for
;; 1 + 9 consecutive cycles.
372 (define_insn_reservation "cell-load_l" 11
373 (and (eq_attr "type" "load_l")
374 (eq_attr "cpu" "cell"))
375 "slot01,lsu_cell,lsu_cell*9")
;; Type "store_c" on cpu "cell".  Default latency is 11 cycles.
;; Reservation: either issue slot in the first cycle, then lsu_cell for
;; 1 + 9 consecutive cycles.
377 (define_insn_reservation "cell-store_c" 11
378 (and (eq_attr "type" "store_c")
379 (eq_attr "cpu" "cell"))
380 "slot01,lsu_cell,lsu_cell*9")
382 ;; RAW register dependency
;; Each define_bypass below replaces the default latency between a producer
;; reservation (second operand) and a dependent consumer reservation (third
;; operand) with the cycle count given as the first operand.
386 ;; there is a 5 cycle delay for r3 bypassing
387 ;; there is a 5 cycle delay for a dependent load after a load
388 (define_bypass 5 "cell-integer" "cell-load")
389 (define_bypass 5 "cell-integer" "cell-load-ext")
390 (define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext")
392 ;; there is a 6 cycle delay after a fp compare until you can use the cr.
393 (define_bypass 6 "cell-fpcompare" "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical")
;; vecfloat feeding vecfloat: 11 cycles instead of the default 12.
396 (define_bypass 11 "cell-vecfloat" "cell-vecfloat")
;; veccomplex feeding vecsimple: 6 cycles instead of the default 10.
399 (define_bypass 6 "cell-veccomplex" "cell-vecsimple")
;; NOTE(review): the disabled bypass below names "cell-veccompare"; the
;; vector compare reservation defined in this file is "cell-veccmp".
400 ;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg")
;; vecfloat feeding veccomplex: 3 cycles instead of the default 12.
401 (define_bypass 3 "cell-vecfloat" "cell-veccomplex")
402 ; this is not correct,
403 ;; this is a stall in general and not dependent on result
404 (define_bypass 13 "cell-vecstore" "cell-fpstore")
405 ; this is not correct, this can never be true, not dependent on result
406 (define_bypass 7 "cell-fp" "cell-fpload")
407 ;; vsu1 should avoid writing to the same target register as vsu2 insn
412 ;; the target of VSU estimate should not be reused within 10 dispatch groups
413 ;; the target of VSU float should not be reused within 8 dispatch groups
414 ;; the target of VSU complex should not be reused within 5 dispatch groups
415 ;; FP LOAD should not reuse an FPU Arithmetic target within 6 dispatch groups
417 ;; mtctr-bcctr/bcctrl, branch target ctr register shadow update at
418 ;; ex4 stage(10 cycles)
419 (define_bypass 10 "cell-mtjmpr" "cell-branchreg")
421 ;; Things that are not simulated:
422 ;; update instruction, update address gpr are not simulated
423 ;; vrefp, vrsqrtefp have latency(14), currently simulated as 12 cycle float