1 ;; Scheduling for the Intel P6 family of processors
2 ;; Copyright (C) 2004-2015 Free Software Foundation, Inc.
4 ;; This file is part of GCC.
6 ;; GCC is free software; you can redistribute it and/or modify
7 ;; it under the terms of the GNU General Public License as published by
8 ;; the Free Software Foundation; either version 3, or (at your option)
11 ;; GCC is distributed in the hope that it will be useful,
12 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ;; GNU General Public License for more details.
16 ;; You should have received a copy of the GNU General Public License
17 ;; along with GCC; see the file COPYING3. If not see
18 ;; <http://www.gnu.org/licenses/>. */
20 ;; The P6 family includes the Pentium Pro, Pentium II, Pentium III, Celeron
21 ;; and Xeon lines of CPUs. The DFA scheduler description in this file is
22 ;; based on information that can be found in the following three documents:
24 ;; "P6 Family of Processors Hardware Developer's Manual",
25 ;; Intel, September 1999.
27 ;; "Intel Architecture Optimization Manual",
28 ;; Intel, 1999 (Order Number: 245127-001).
30 ;; "How to optimize for the Pentium family of microprocessors",
33 ;; The P6 pipeline has three major components:
34 ;; 1) the FETCH/DECODE unit, an in-order issue front-end
35 ;; 2) the DISPATCH/EXECUTE unit, which is the out-of-order core
36 ;; 3) the RETIRE unit, an in-order retirement unit
38 ;; So, the P6 CPUs have out-of-order cores, but the instruction decoder and
39 ;; retirement unit are naturally in-order.
43 ;; L1 ICACHE L1 DCACHE
45 ;; DECODER0 DECODER1 DECODER2 DISP/EXEC RETIRE
47 ;; INSTRUCTION POOL __________|_______/
48 ;; (inc. reorder buffer)
50 ;; Since the P6 CPUs execute instructions out-of-order, the most important
51 ;; consideration in performance tuning is making sure enough micro-ops are
52 ;; ready for execution in the out-of-order core, while not stalling the
56 ;; - Find a less crude way to model complex instructions, in
57 ;; particular how many cycles they take to be decoded.
58 ;; - Include decoder latencies in the total reservation latencies.
59 ;; This isn't necessary right now because we assume for every
60 ;; instruction that it never blocks a decoder.
61 ;; - Figure out where the p0 and p1 reservations come from. These
62 ;; appear not to be in the manual
63 ;; - Lots more because I'm sure this is still far from optimal :-)
65 ;; The ppro_idiv and ppro_fdiv automata are used to model issue
66 ;; latencies of idiv and fdiv type insns.
;; Declare the six automata used by this description: one each for the
;; decoders, the core ports, integer divide, FP divide, loads and stores.
(define_automaton
  "ppro_decoder,ppro_core,ppro_idiv,ppro_fdiv,ppro_load,ppro_store")
69 ;; Simple instructions of the register-register form have only one uop.
70 ;; Load instructions are also only one uop. Store instructions decode to
71 ;; two uops, and simple read-modify instructions also take two uops.
72 ;; Simple instructions of the register-memory form have two to three uops.
73 ;; Simple read-modify-write instructions have four uops. The rules for
74 ;; the decoder are simple:
75 ;; - an instruction with 1 uop can be decoded by any of the three
76 ;; decoders in one cycle.
77 ;; - an instruction with 2 to 4 uops can be decoded only by decoder 0
78 ;; but still in only one cycle.
79 ;; - a complex (microcode) instruction can also only be decoded by
80 ;; decoder 0, and this takes an unspecified number of cycles.
82 ;; The goal is to schedule such that we have a few-one-one uops sequence
83 ;; in each cycle, to decode as many instructions per cycle as possible.
;; The three instruction decoders.  All of them belong to the
;; ppro_decoder automaton.
(define_cpu_unit "decoder0"
                 "ppro_decoder")
(define_cpu_unit "decoder1"
                 "ppro_decoder")
(define_cpu_unit "decoder2"
                 "ppro_decoder")
88 ;; We first wish to find an instruction for decoder0, so exclude
89 ;; decoder1 and decoder2 from being reserved until decoder 0 is
;; decoder1 and decoder2 may be reserved only in a cycle in which decoder0
;; is reserved, so decoder0 is always filled first.
(presence_set "decoder1"
              "decoder0")
(presence_set "decoder2"
              "decoder0")
94 ;; Most instructions can be decoded on any of the three decoders.
;; "decodern" stands for whichever one of the three decoders is available.
(define_reservation "decodern"
  "(decoder0|decoder1|decoder2)")
97 ;; The out-of-order core has five pipelines. During each cycle, the core
98 ;; may dispatch zero or one uop on the port of any of the five pipelines
99 ;; so the maximum number of dispatched uops per cycle is 5. In practice,
100 ;; 3 uops per cycle is more realistic.
102 ;; Two of the five pipelines contain several execution units:
104 ;; Port 0 Port 1 Port 2 Port 3 Port 4
105 ;; ALU ALU LOAD SAC SDA
111 ;; (SAC=Store Address Calculation, SDA=Store Data Unit, P3FPU = SSE unit,
112 ;; JUE = Jump Execution Unit, AGU = Address Generation Unit)
;; Execution resources, grouped by automaton:
;;   p0, p1 - ports 0 and 1 (ppro_core)
;;   p2     - port 2, the load port (ppro_load)
;;   p3, p4 - ports 3 and 4, the store ports (ppro_store)
;;   idiv   - integer divider occupancy (ppro_idiv)
;;   fdiv   - floating point divider occupancy (ppro_fdiv)
(define_cpu_unit "p0,p1"
                 "ppro_core")
(define_cpu_unit "p2"
                 "ppro_load")
(define_cpu_unit "p3,p4"
                 "ppro_store")
(define_cpu_unit "idiv"
                 "ppro_idiv")
(define_cpu_unit "fdiv"
                 "ppro_fdiv")
120 ;; Only the irregular instructions have to be modeled here. A load
121 ;; increases the latency by 2 or 3, or by nothing if the manual gives
122 ;; a latency already. Store latencies are not accounted for.
124 ;; The simple instructions follow a very regular pattern of 1 uop per
125 ;; reg-reg operation, 1 uop per load on port 2, and 2 uops per store
126 ;; on port 4 and port 3. These instructions are modelled at the bottom
129 ;; For microcoded instructions we don't know how many uops are produced.
130 ;; These instructions are the "complex" ones in the Intel manuals. All
131 ;; we _do_ know is that they typically produce four or more uops, so
132 ;; they can only be decoded on decoder0. Modelling their latencies
133 ;; doesn't make sense because we don't know how these instructions are
134 ;; executed in the core. So we just model that they can only be decoded
135 ;; on decoder 0, and say that it takes a little while before the result
137 (define_insn_reservation "ppro_complex_insn" 6
138 (and (eq_attr "cpu" "pentiumpro")
139 (eq_attr "type" "other,multi,call,callv,str"))
142 ;; imov with memory operands does not use the integer units.
143 (define_insn_reservation "ppro_imov" 1
144 (and (eq_attr "cpu" "pentiumpro")
145 (and (eq_attr "memory" "none")
146 (eq_attr "type" "imov")))
149 (define_insn_reservation "ppro_imov_load" 4
150 (and (eq_attr "cpu" "pentiumpro")
151 (and (eq_attr "memory" "load")
152 (eq_attr "type" "imov")))
155 (define_insn_reservation "ppro_imov_store" 1
156 (and (eq_attr "cpu" "pentiumpro")
157 (and (eq_attr "memory" "store")
158 (eq_attr "type" "imov")))
161 ;; imovx always decodes to one uop, and also doesn't use the integer
162 ;; units if it has memory operands.
163 (define_insn_reservation "ppro_imovx" 1
164 (and (eq_attr "cpu" "pentiumpro")
165 (and (eq_attr "memory" "none")
166 (eq_attr "type" "imovx")))
169 (define_insn_reservation "ppro_imovx_load" 4
170 (and (eq_attr "cpu" "pentiumpro")
171 (and (eq_attr "memory" "load")
172 (eq_attr "type" "imovx")))
175 ;; lea executes on port 0 with latency one and throughput 1.
176 (define_insn_reservation "ppro_lea" 1
177 (and (eq_attr "cpu" "pentiumpro")
178 (and (eq_attr "memory" "none")
179 (eq_attr "type" "lea")))
182 ;; Shift and rotate execute on port 0 with latency and throughput 1.
183 ;; The load and store units need to be reserved when memory operands
185 (define_insn_reservation "ppro_shift_rotate" 1
186 (and (eq_attr "cpu" "pentiumpro")
187 (and (eq_attr "memory" "none")
188 (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
;; Shift/rotate with any memory operand (memory is not "none"), latency 4.
;; Decoded on decoder0; the next cycle takes the load port p2 together
;; with p0, and the cycle after that takes both store ports p4 and p3.
(define_insn_reservation "ppro_shift_rotate_mem" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "type" "ishift,ishift1,rotate,rotate1")
            (eq_attr "memory" "!none")))
  "decoder0,p2+p0,p4+p3")
198 ;; The P6 has a sophisticated branch prediction mechanism to minimize
199 ;; latencies due to branching. In particular, it has a fast way to
200 ;; execute branches that are taken multiple times (such as in loops).
201 ;; Branches not taken suffer no penalty, and correctly predicted
202 ;; branches cost only one fetch cycle. Mispredicted branches are very
203 ;; costly: typically 15 cycles and possibly as many as 26 cycles.
205 ;; Unfortunately all this makes it quite difficult to properly model
206 ;; the latencies for the compiler. Here I've made the choice to be
207 ;; optimistic and assume branches are often predicted correctly, so
208 ;; they have latency 1, and the decoders are not blocked.
210 ;; In addition, the model assumes a branch always decodes to only 1 uop,
211 ;; which is not exactly true because there are a few instructions that
212 ;; decode to 2 uops or microcode. But this probably gives the best
213 ;; results because we can assume these instructions can decode on all
215 (define_insn_reservation "ppro_branch" 1
216 (and (eq_attr "cpu" "pentiumpro")
217 (and (eq_attr "memory" "none")
218 (eq_attr "type" "ibr")))
221 ;; ??? Indirect branches probably have worse latency than this.
222 (define_insn_reservation "ppro_indirect_branch" 6
223 (and (eq_attr "cpu" "pentiumpro")
224 (and (eq_attr "memory" "!none")
225 (eq_attr "type" "ibr")))
;; leave: latency 4, decoded on decoder0.  One cycle on the load port p2
;; plus either p0 or p1, followed by one more cycle on either p0 or p1.
(define_insn_reservation "ppro_leave" 4
  (and
    (eq_attr "cpu" "pentiumpro")
    (eq_attr "type" "leave"))
  "decoder0,p2+(p0|p1),(p0|p1)")
233 ;; imul has throughput one, but latency 4, and can only execute on port 0.
234 (define_insn_reservation "ppro_imul" 4
235 (and (eq_attr "cpu" "pentiumpro")
236 (and (eq_attr "memory" "none")
237 (eq_attr "type" "imul")))
240 (define_insn_reservation "ppro_imul_mem" 4
241 (and (eq_attr "cpu" "pentiumpro")
242 (and (eq_attr "memory" "!none")
243 (eq_attr "type" "imul")))
246 ;; div and idiv are very similar, so we model them the same.
247 ;; QI, HI, and SI have issue latency 12, 21, and 37, respectively.
248 ;; These issue latencies are modelled via the ppro_idiv automaton.
;; QImode integer divide, register operands; result latency 19.
;; Decoded on decoder0.  The idiv unit is held for 2 + 1 + 9 = 12 cycles
;; (the QI issue latency): two cycles together with p0, one cycle with
;; either p0 or p1, then nine cycles on its own.
(define_insn_reservation "ppro_idiv_QI" 19
  (and (eq_attr "cpu" "pentiumpro")
       (and (and (eq_attr "type" "idiv")
                 (eq_attr "mode" "QI"))
            (eq_attr "memory" "none")))
  "decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*9")
;; QImode integer divide with a memory source; result latency 19.
;; Decoded on decoder0.  The first execution cycle also takes the load
;; port p2.  The idiv unit is held for 1 + 1 + 1 + 9 = 12 cycles.
(define_insn_reservation "ppro_idiv_QI_load" 19
  (and (eq_attr "cpu" "pentiumpro")
       (and (and (eq_attr "type" "idiv")
                 (eq_attr "mode" "QI"))
            (eq_attr "memory" "load")))
  "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*9")
;; HImode integer divide, register operands; result latency 23.
;; Decoded on decoder0.  The idiv unit is held for 3 + 1 + 17 = 21 cycles
;; (the HI issue latency): three cycles together with p0, one cycle with
;; either p0 or p1, then seventeen cycles on its own.
(define_insn_reservation "ppro_idiv_HI" 23
  (and (eq_attr "cpu" "pentiumpro")
       (and (and (eq_attr "type" "idiv")
                 (eq_attr "mode" "HI"))
            (eq_attr "memory" "none")))
  "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*17")
;; HImode integer divide with a memory source; result latency 23.
;; Decoded on decoder0.  The first execution cycle also takes the load
;; port p2.  The idiv unit is held for 1 + 1 + 1 + 18 = 21 cycles.
(define_insn_reservation "ppro_idiv_HI_load" 23
  (and (eq_attr "cpu" "pentiumpro")
       (and (and (eq_attr "type" "idiv")
                 (eq_attr "mode" "HI"))
            (eq_attr "memory" "load")))
  "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*18")
;; SImode integer divide, register operands; result latency 39.
;; Decoded on decoder0.  The idiv unit is held for 3 + 1 + 33 = 37 cycles
;; (the SI issue latency): three cycles together with p0, one cycle with
;; either p0 or p1, then thirty-three cycles on its own.
(define_insn_reservation "ppro_idiv_SI" 39
  (and (eq_attr "cpu" "pentiumpro")
       (and (and (eq_attr "type" "idiv")
                 (eq_attr "mode" "SI"))
            (eq_attr "memory" "none")))
  "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*33")
;; SImode integer divide with a memory source; result latency 39.
;; Decoded on decoder0.  The first execution cycle also takes the load
;; port p2.  The idiv unit is held for 1 + 1 + 1 + 34 = 37 cycles.
(define_insn_reservation "ppro_idiv_SI_load" 39
  (and (eq_attr "cpu" "pentiumpro")
       (and (and (eq_attr "type" "idiv")
                 (eq_attr "mode" "SI"))
            (eq_attr "memory" "load")))
  "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*34")
291 ;; Floating point operations always execute on port 0.
292 ;; ??? where do these latencies come from? fadd has latency 3 and
293 ;; has throughput "1/cycle (align with FADD)". What do they
294 ;; mean and how can we model that?
295 (define_insn_reservation "ppro_fop" 3
296 (and (eq_attr "cpu" "pentiumpro")
297 (and (eq_attr "memory" "none,unknown")
298 (eq_attr "type" "fop")))
301 (define_insn_reservation "ppro_fop_load" 5
302 (and (eq_attr "cpu" "pentiumpro")
303 (and (eq_attr "memory" "load")
304 (eq_attr "type" "fop")))
;; fop writing its result to memory; latency 3.
;; Decoded on decoder0, then p0 for three consecutive cycles; in the last
;; of those cycles the store ports p4 and p3 are taken as well.
(define_insn_reservation "ppro_fop_store" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "type" "fop")
            (eq_attr "memory" "store")))
  "decoder0,p0,p0,p0+p4+p3")
;; fop that both loads and stores; latency 5.
;; Decoded on decoder0, one cycle on the load port p2 with p0, then one
;; cycle on p0 with the store ports p4 and p3.
(define_insn_reservation "ppro_fop_both" 5
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "type" "fop")
            (eq_attr "memory" "both")))
  "decoder0,p2+p0,p0+p4+p3")
319 (define_insn_reservation "ppro_fsgn" 1
320 (and (eq_attr "cpu" "pentiumpro")
321 (eq_attr "type" "fsgn"))
;; fistp: latency 5.  Decoded on decoder0, two cycles on p0, then one
;; cycle on the store ports p4 and p3.
(define_insn_reservation "ppro_fistp" 5
  (and
    (eq_attr "cpu" "pentiumpro")
    (eq_attr "type" "fistp"))
  "decoder0,p0*2,p4+p3")
329 (define_insn_reservation "ppro_fcmov" 2
330 (and (eq_attr "cpu" "pentiumpro")
331 (eq_attr "type" "fcmov"))
334 (define_insn_reservation "ppro_fcmp" 1
335 (and (eq_attr "cpu" "pentiumpro")
336 (and (eq_attr "memory" "none")
337 (eq_attr "type" "fcmp")))
340 (define_insn_reservation "ppro_fcmp_load" 4
341 (and (eq_attr "cpu" "pentiumpro")
342 (and (eq_attr "memory" "load")
343 (eq_attr "type" "fcmp")))
346 (define_insn_reservation "ppro_fmov" 1
347 (and (eq_attr "cpu" "pentiumpro")
348 (and (eq_attr "memory" "none")
349 (eq_attr "type" "fmov")))
352 (define_insn_reservation "ppro_fmov_load" 1
353 (and (eq_attr "cpu" "pentiumpro")
354 (and (eq_attr "memory" "load")
355 (and (eq_attr "mode" "!XF")
356 (eq_attr "type" "fmov"))))
;; XFmode fmov from memory; latency 3.  Decoded on decoder0, then the load
;; port p2 together with p0 for two cycles.
(define_insn_reservation "ppro_fmov_XF_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (and (eq_attr "type" "fmov")
                 (eq_attr "mode" "XF"))
            (eq_attr "memory" "load")))
  "decoder0,(p2+p0)*2")
366 (define_insn_reservation "ppro_fmov_store" 1
367 (and (eq_attr "cpu" "pentiumpro")
368 (and (eq_attr "memory" "store")
369 (and (eq_attr "mode" "!XF")
370 (eq_attr "type" "fmov"))))
;; XFmode fmov to memory; latency 3.  Decoded on decoder0, then p0 with
;; p4 in one cycle and p0 with p3 in the next.
(define_insn_reservation "ppro_fmov_XF_store" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (and (eq_attr "type" "fmov")
                 (eq_attr "mode" "XF"))
            (eq_attr "memory" "store")))
  "decoder0,(p0+p4),(p0+p3)")
380 ;; fmul executes on port 0 with latency 5. It has issue latency 2,
381 ;; but we don't model this.
382 (define_insn_reservation "ppro_fmul" 5
383 (and (eq_attr "cpu" "pentiumpro")
384 (and (eq_attr "memory" "none")
385 (eq_attr "type" "fmul")))
388 (define_insn_reservation "ppro_fmul_load" 6
389 (and (eq_attr "cpu" "pentiumpro")
390 (and (eq_attr "memory" "load")
391 (eq_attr "type" "fmul")))
394 ;; fdiv latencies depend on the mode of the operands. XFmode gives
395 ;; a latency of 38 cycles, DFmode gives 32, and SFmode gives latency 18.
396 ;; Division by a power of 2 takes only 9 cycles, but we cannot model
397 ;; that. Throughput is equal to latency - 1, which we model using the
398 ;; ppro_fdiv automaton.
;; SFmode fdiv/fpspc, register operands; latency 18.
;; Any decoder.  The fdiv unit is held for 1 + 16 = 17 cycles (latency
;; minus one); p0 is needed only in the first of them.
(define_insn_reservation "ppro_fdiv_SF" 18
  (and (eq_attr "cpu" "pentiumpro")
       (and (and (eq_attr "type" "fdiv,fpspc")
                 (eq_attr "mode" "SF"))
            (eq_attr "memory" "none")))
  "decodern,p0+fdiv,fdiv*16")
;; SFmode fdiv/fpspc with a memory source; latency 19.
;; Decoded on decoder0.  The first execution cycle takes the load port p2,
;; p0 and fdiv; fdiv then stays busy for 16 more cycles.
(define_insn_reservation "ppro_fdiv_SF_load" 19
  (and (eq_attr "cpu" "pentiumpro")
       (and (and (eq_attr "type" "fdiv,fpspc")
                 (eq_attr "mode" "SF"))
            (eq_attr "memory" "load")))
  "decoder0,p2+p0+fdiv,fdiv*16")
;; DFmode fdiv/fpspc, register operands; latency 32.
;; Any decoder.  The fdiv unit is held for 1 + 30 = 31 cycles (latency
;; minus one); p0 is needed only in the first of them.
(define_insn_reservation "ppro_fdiv_DF" 32
  (and (eq_attr "cpu" "pentiumpro")
       (and (and (eq_attr "type" "fdiv,fpspc")
                 (eq_attr "mode" "DF"))
            (eq_attr "memory" "none")))
  "decodern,p0+fdiv,fdiv*30")
;; DFmode fdiv/fpspc with a memory source; latency 33.
;; Decoded on decoder0.  The first execution cycle takes the load port p2,
;; p0 and fdiv; fdiv then stays busy for 30 more cycles.
(define_insn_reservation "ppro_fdiv_DF_load" 33
  (and (eq_attr "cpu" "pentiumpro")
       (and (and (eq_attr "type" "fdiv,fpspc")
                 (eq_attr "mode" "DF"))
            (eq_attr "memory" "load")))
  "decoder0,p2+p0+fdiv,fdiv*30")
;; XFmode fdiv/fpspc, register operands; latency 38.
;; Any decoder.  The fdiv unit is held for 1 + 36 = 37 cycles (latency
;; minus one); p0 is needed only in the first of them.
(define_insn_reservation "ppro_fdiv_XF" 38
  (and (eq_attr "cpu" "pentiumpro")
       (and (and (eq_attr "type" "fdiv,fpspc")
                 (eq_attr "mode" "XF"))
            (eq_attr "memory" "none")))
  "decodern,p0+fdiv,fdiv*36")
;; XFmode fdiv/fpspc with a memory source; latency 39.
;; Decoded on decoder0.  The first execution cycle takes the load port p2,
;; p0 and fdiv; fdiv then stays busy for 36 more cycles.
(define_insn_reservation "ppro_fdiv_XF_load" 39
  (and (eq_attr "cpu" "pentiumpro")
       (and (and (eq_attr "type" "fdiv,fpspc")
                 (eq_attr "mode" "XF"))
            (eq_attr "memory" "load")))
  "decoder0,p2+p0+fdiv,fdiv*36")
441 ;; MMX instructions can execute on either port 0 or port 1 with a
442 ;; throughput of 1/cycle.
443 ;; on port 0: - ALU (latency 1)
444 ;; - Multiplier Unit (latency 3)
445 ;; on port 1: - ALU (latency 1)
446 ;; - Shift Unit (latency 1)
448 ;; MMX instructions are either of the type reg-reg, or read-modify, and
449 ;; except for mmxshft and mmxmul they can execute on port 0 or port 1,
450 ;; so they behave as "simple" instructions that need no special modelling.
451 ;; We only have to model mmxshft and mmxmul.
452 (define_insn_reservation "ppro_mmx_shft" 1
453 (and (eq_attr "cpu" "pentiumpro")
454 (and (eq_attr "memory" "none")
455 (eq_attr "type" "mmxshft")))
;; MMX shift taking its operand from memory; latency 2.
;; The memory test must be "load".  With "none" the condition was
;; identical to that of ppro_mmx_shft, so this entry never described the
;; load form of the instruction.
;; NOTE(review): the reservation string and closing paren of this form are
;; not visible in this listing -- confirm they follow in the real file.
(define_insn_reservation "ppro_mmx_shft_load" 2
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "mmxshft")))
464 (define_insn_reservation "ppro_mmx_mul" 3
465 (and (eq_attr "cpu" "pentiumpro")
466 (and (eq_attr "memory" "none")
467 (eq_attr "type" "mmxmul")))
;; MMX multiply taking its operand from memory; latency 3.
;; The memory test must be "load".  With "none" the condition was
;; identical to that of ppro_mmx_mul, so this entry never described the
;; load form of the instruction.
;; NOTE(review): the reservation string and closing paren of this form are
;; not visible in this listing -- confirm they follow in the real file.
(define_insn_reservation "ppro_mmx_mul_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "mmxmul")))
476 (define_insn_reservation "ppro_sse_mmxcvt" 4
477 (and (eq_attr "cpu" "pentiumpro")
478 (and (eq_attr "mode" "DI")
479 (eq_attr "type" "mmxcvt")))
482 ;; FIXME: These are Pentium III only, but we cannot tell here if
483 ;; we're generating code for PentiumPro/Pentium II or Pentium III
484 ;; (define_insn_reservation "ppro_sse_mmxshft" 2
485 ;; (and (eq_attr "cpu" "pentiumpro")
486 ;; (and (eq_attr "mode" "DI")
487 ;; (eq_attr "type" "mmxshft")))
490 ;; SSE is very complicated, and takes a bit more effort.
491 ;; ??? I assumed that all SSE instructions decode on decoder0,
492 ;; but is this correct?
494 ;; The sfence instruction.
495 (define_insn_reservation "ppro_sse_sfence" 3
496 (and (eq_attr "cpu" "pentiumpro")
497 (and (eq_attr "memory" "unknown")
498 (eq_attr "type" "sse")))
501 ;; FIXME: This reservation is all wrong when we're scheduling sqrtss.
502 (define_insn_reservation "ppro_sse_SF" 3
503 (and (eq_attr "cpu" "pentiumpro")
504 (and (eq_attr "mode" "SF")
505 (eq_attr "type" "sse")))
508 (define_insn_reservation "ppro_sse_add_SF" 3
509 (and (eq_attr "cpu" "pentiumpro")
510 (and (eq_attr "memory" "none")
511 (and (eq_attr "mode" "SF")
512 (eq_attr "type" "sseadd,sseadd1"))))
515 (define_insn_reservation "ppro_sse_add_SF_load" 3
516 (and (eq_attr "cpu" "pentiumpro")
517 (and (eq_attr "memory" "load")
518 (and (eq_attr "mode" "SF")
519 (eq_attr "type" "sseadd,sseadd1"))))
522 (define_insn_reservation "ppro_sse_cmp_SF" 3
523 (and (eq_attr "cpu" "pentiumpro")
524 (and (eq_attr "memory" "none")
525 (and (eq_attr "mode" "SF")
526 (eq_attr "type" "ssecmp"))))
529 (define_insn_reservation "ppro_sse_cmp_SF_load" 3
530 (and (eq_attr "cpu" "pentiumpro")
531 (and (eq_attr "memory" "load")
532 (and (eq_attr "mode" "SF")
533 (eq_attr "type" "ssecmp"))))
536 (define_insn_reservation "ppro_sse_comi_SF" 1
537 (and (eq_attr "cpu" "pentiumpro")
538 (and (eq_attr "memory" "none")
539 (and (eq_attr "mode" "SF")
540 (eq_attr "type" "ssecomi"))))
543 (define_insn_reservation "ppro_sse_comi_SF_load" 1
544 (and (eq_attr "cpu" "pentiumpro")
545 (and (eq_attr "memory" "load")
546 (and (eq_attr "mode" "SF")
547 (eq_attr "type" "ssecomi"))))
550 (define_insn_reservation "ppro_sse_mul_SF" 4
551 (and (eq_attr "cpu" "pentiumpro")
552 (and (eq_attr "memory" "none")
553 (and (eq_attr "mode" "SF")
554 (eq_attr "type" "ssemul"))))
557 (define_insn_reservation "ppro_sse_mul_SF_load" 4
558 (and (eq_attr "cpu" "pentiumpro")
559 (and (eq_attr "memory" "load")
560 (and (eq_attr "mode" "SF")
561 (eq_attr "type" "ssemul"))))
564 ;; FIXME: ssediv doesn't close p0 for 17 cycles, surely???
565 (define_insn_reservation "ppro_sse_div_SF" 18
566 (and (eq_attr "cpu" "pentiumpro")
567 (and (eq_attr "memory" "none")
568 (and (eq_attr "mode" "SF")
569 (eq_attr "type" "ssediv"))))
;; SFmode SSE divide with a memory source; latency 18.
;; Decoded on decoder0; one cycle on the load port p2 together with p0,
;; then p0 alone for 16 more cycles.
;; The memory test is "load" (it used to be "none", which duplicated the
;; condition of ppro_sse_div_SF and left the load form without a matching
;; reservation even though this one reserves p2).
(define_insn_reservation "ppro_sse_div_SF_load" 18
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssediv"))))
  "decoder0,(p2+p0),p0*16")
;; SFmode sseicvt; latency 4.  Decoded on decoder0, then the load port p2
;; together with p1 for two cycles.  No memory test is applied.
(define_insn_reservation "ppro_sse_icvt_SF" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "type" "sseicvt")
            (eq_attr "mode" "SF")))
  "decoder0,(p2+p1)*2")
585 (define_insn_reservation "ppro_sse_icvt_SI" 3
586 (and (eq_attr "cpu" "pentiumpro")
587 (and (eq_attr "mode" "SI")
588 (eq_attr "type" "sseicvt")))
591 (define_insn_reservation "ppro_sse_mov_SF" 3
592 (and (eq_attr "cpu" "pentiumpro")
593 (and (eq_attr "memory" "none")
594 (and (eq_attr "mode" "SF")
595 (eq_attr "type" "ssemov"))))
;; SFmode ssemov from memory; latency 3.  Decoded on decoder0, then one
;; cycle on the load port p2 plus either p0 or p1.
(define_insn_reservation "ppro_sse_mov_SF_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (and (eq_attr "type" "ssemov")
                 (eq_attr "mode" "SF"))
            (eq_attr "memory" "load")))
  "decoder0,p2+(p0|p1)")
605 (define_insn_reservation "ppro_sse_mov_SF_store" 3
606 (and (eq_attr "cpu" "pentiumpro")
607 (and (eq_attr "memory" "store")
608 (and (eq_attr "mode" "SF")
609 (eq_attr "type" "ssemov"))))
612 (define_insn_reservation "ppro_sse_V4SF" 4
613 (and (eq_attr "cpu" "pentiumpro")
614 (and (eq_attr "mode" "V4SF")
615 (eq_attr "type" "sse")))
618 (define_insn_reservation "ppro_sse_add_V4SF" 3
619 (and (eq_attr "cpu" "pentiumpro")
620 (and (eq_attr "memory" "none")
621 (and (eq_attr "mode" "V4SF")
622 (eq_attr "type" "sseadd,sseadd1"))))
;; V4SFmode sseadd/sseadd1 with a memory source; latency 3.
;; Decoded on decoder0, then the load port p2 with p1 for two cycles.
(define_insn_reservation "ppro_sse_add_V4SF_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (and (eq_attr "type" "sseadd,sseadd1")
                 (eq_attr "mode" "V4SF"))
            (eq_attr "memory" "load")))
  "decoder0,(p2+p1)*2")
632 (define_insn_reservation "ppro_sse_cmp_V4SF" 3
633 (and (eq_attr "cpu" "pentiumpro")
634 (and (eq_attr "memory" "none")
635 (and (eq_attr "mode" "V4SF")
636 (eq_attr "type" "ssecmp"))))
;; V4SFmode ssecmp with a memory source; latency 3.
;; Decoded on decoder0, then the load port p2 with p1 for two cycles.
(define_insn_reservation "ppro_sse_cmp_V4SF_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (and (eq_attr "type" "ssecmp")
                 (eq_attr "mode" "V4SF"))
            (eq_attr "memory" "load")))
  "decoder0,(p2+p1)*2")
646 (define_insn_reservation "ppro_sse_cvt_V4SF" 3
647 (and (eq_attr "cpu" "pentiumpro")
648 (and (eq_attr "memory" "none,unknown")
649 (and (eq_attr "mode" "V4SF")
650 (eq_attr "type" "ssecvt"))))
;; V4SFmode ssecvt whose memory attribute is neither "none" nor "unknown";
;; latency 4.  This is the complement of ppro_sse_cvt_V4SF, so the type
;; test must be "ssecvt"; it previously read "ssecmp", which left the
;; memory forms of ssecvt uncovered.
;; NOTE(review): the reservation string and closing paren of this form are
;; not visible in this listing -- confirm they follow in the real file.
(define_insn_reservation "ppro_sse_cvt_V4SF_other" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "memory" "!none,unknown")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssecvt"))))
660 (define_insn_reservation "ppro_sse_mul_V4SF" 5
661 (and (eq_attr "cpu" "pentiumpro")
662 (and (eq_attr "memory" "none")
663 (and (eq_attr "mode" "V4SF")
664 (eq_attr "type" "ssemul"))))
;; V4SFmode ssemul with a memory source; latency 5.
;; Decoded on decoder0, then the load port p2 with p0 for two cycles.
(define_insn_reservation "ppro_sse_mul_V4SF_load" 5
  (and (eq_attr "cpu" "pentiumpro")
       (and (and (eq_attr "type" "ssemul")
                 (eq_attr "mode" "V4SF"))
            (eq_attr "memory" "load")))
  "decoder0,(p2+p0)*2")
674 ;; FIXME: p0 really closed this long???
675 (define_insn_reservation "ppro_sse_div_V4SF" 48
676 (and (eq_attr "cpu" "pentiumpro")
677 (and (eq_attr "memory" "none")
678 (and (eq_attr "mode" "V4SF")
679 (eq_attr "type" "ssediv"))))
;; V4SFmode ssediv with a memory source; latency 48.
;; Decoded on decoder0, two cycles on the load port p2 with p0, then p0
;; alone for 32 more cycles.
(define_insn_reservation "ppro_sse_div_V4SF_load" 48
  (and (eq_attr "cpu" "pentiumpro")
       (and (and (eq_attr "type" "ssediv")
                 (eq_attr "mode" "V4SF"))
            (eq_attr "memory" "load")))
  "decoder0,(p2+p0)*2,p0*32")
689 (define_insn_reservation "ppro_sse_log_V4SF" 2
690 (and (eq_attr "cpu" "pentiumpro")
691 (and (eq_attr "memory" "none")
692 (and (eq_attr "mode" "V4SF")
693 (eq_attr "type" "sselog,sselog1,sseshuf,sseshuf1"))))
696 (define_insn_reservation "ppro_sse_log_V4SF_load" 2
697 (and (eq_attr "cpu" "pentiumpro")
698 (and (eq_attr "memory" "load")
699 (and (eq_attr "mode" "V4SF")
700 (eq_attr "type" "sselog,sselog1,sseshuf,sseshuf1"))))
;; V4SFmode register-to-register ssemov; latency 1.
;; Decoded on decoder0, then either p0 or p1 for two cycles.
(define_insn_reservation "ppro_sse_mov_V4SF" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (and (eq_attr "type" "ssemov")
                 (eq_attr "mode" "V4SF"))
            (eq_attr "memory" "none")))
  "decoder0,(p0|p1)*2")
710 (define_insn_reservation "ppro_sse_mov_V4SF_load" 2
711 (and (eq_attr "cpu" "pentiumpro")
712 (and (eq_attr "memory" "load")
713 (and (eq_attr "mode" "V4SF")
714 (eq_attr "type" "ssemov"))))
;; V4SFmode ssemov to memory; latency 3.
;; Decoded on decoder0, then both store ports p4 and p3 for two cycles.
(define_insn_reservation "ppro_sse_mov_V4SF_store" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (and (eq_attr "type" "ssemov")
                 (eq_attr "mode" "V4SF"))
            (eq_attr "memory" "store")))
  "decoder0,(p4+p3)*2")
724 ;; All other instructions are modelled as simple instructions.
725 ;; We have already modelled all i387 floating point instructions, so all
726 ;; other instructions execute on either port 0 or port 1. This includes
727 ;; the ALU units, and the MMX units.
729 ;; reg-reg instructions produce 1 uop so they can be decoded on any of
730 ;; the three decoders.
731 (define_insn_reservation "ppro_insn" 1
732 (and (eq_attr "cpu" "pentiumpro")
733 (and (eq_attr "memory" "none,unknown")
734 (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")))
737 ;; read-modify and register-memory instructions have two or three uops,
738 ;; so they have to be decoded on decoder0.
;; Simple instruction with a memory source; latency 3.
;; Decoded on decoder0, then one cycle on the load port p2 plus either
;; p0 or p1.
(define_insn_reservation "ppro_insn_load" 3
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")
            (eq_attr "memory" "load")))
  "decoder0,p2+(p0|p1)")
;; Simple instruction with a memory destination; latency 1.
;; Decoded on decoder0, one cycle on either p0 or p1, then one cycle on
;; both store ports p4 and p3.
(define_insn_reservation "ppro_insn_store" 1
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")
            (eq_attr "memory" "store")))
  "decoder0,(p0|p1),p4+p3")
751 ;; read-modify-store instructions produce 4 uops so they have to be
752 ;; decoded on decoder0 as well.
;; Simple read-modify-write instruction (memory is "both"); latency 4.
;; Decoded on decoder0, one cycle on the load port p2 plus either p0 or
;; p1, then one cycle on both store ports p4 and p3.
(define_insn_reservation "ppro_insn_both" 4
  (and (eq_attr "cpu" "pentiumpro")
       (and (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")
            (eq_attr "memory" "both")))
  "decoder0,p2+(p0|p1),p4+p3")