[AVX-512] Enable QI-mode mask logic patterns on non-AVX-512DQ targets.
[official-gcc.git] / gcc / config / i386 / ppro.md
blob979acc1e33e410c39b2a2b9e3a0ac64f77b2a3f1
1 ;; Scheduling for the Intel P6 family of processors
2 ;; Copyright (C) 2004-2015 Free Software Foundation, Inc.
3 ;;
4 ;; This file is part of GCC.
5 ;;
6 ;; GCC is free software; you can redistribute it and/or modify
7 ;; it under the terms of the GNU General Public License as published by
8 ;; the Free Software Foundation; either version 3, or (at your option)
9 ;; any later version.
11 ;; GCC is distributed in the hope that it will be useful,
12 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 ;; GNU General Public License for more details.
16 ;; You should have received a copy of the GNU General Public License
17 ;; along with GCC; see the file COPYING3.  If not see
18 ;; <http://www.gnu.org/licenses/>.
20 ;; The P6 family includes the Pentium Pro, Pentium II, Pentium III, Celeron
21 ;; and Xeon lines of CPUs.  The DFA scheduler description in this file is
22 ;; based on information that can be found in the following three documents:
24 ;;    "P6 Family of Processors Hardware Developer's Manual",
25 ;;    Intel, September 1999.
27 ;;    "Intel Architecture Optimization Manual",
28 ;;    Intel, 1999 (Order Number: 245127-001).
30 ;;    "How to optimize for the Pentium family of microprocessors",
31 ;;    by Agner Fog, PhD.
33 ;; The P6 pipeline has three major components:
34 ;;   1) the FETCH/DECODE unit, an in-order issue front-end
35 ;;   2) the DISPATCH/EXECUTE unit, which is the out-of-order core
36 ;;   3) the RETIRE unit, an in-order retirement unit
38 ;; So, the P6 CPUs have out-of-order cores, but the instruction decoder and
39 ;; retirement unit are naturally in-order.
41 ;;                       BUS INTERFACE UNIT
42 ;;                     /                   \
43 ;;                L1 ICACHE             L1 DCACHE
44 ;;              /     |     \              |     \
45 ;;       DECODER0  DECODER1  DECODER2  DISP/EXEC  RETIRE
46 ;;              \     |     /              |        |
47 ;;            INSTRUCTION POOL   __________|_______/
48 ;;          (inc. reorder buffer)
50 ;; Since the P6 CPUs execute instructions out-of-order, the most important
51 ;; consideration in performance tuning is making sure enough micro-ops are
52 ;; ready for execution in the out-of-order core, while not stalling the
53 ;; decoder.
55 ;; TODO:
56 ;; - Find a less crude way to model complex instructions, in
57 ;;   particular how many cycles they take to be decoded.
58 ;; - Include decoder latencies in the total reservation latencies.
59 ;;   This isn't necessary right now because we assume for every
60 ;;   instruction that it never blocks a decoder.
61 ;; - Figure out where the p0 and p1 reservations come from.  These
62 ;;   appear not to be in the manual
63 ;; - Lots more because I'm sure this is still far from optimal :-)
65 ;; The ppro_idiv and ppro_fdiv automata are used to model issue
66 ;; latencies of idiv and fdiv type insns.
;; The other automata declared here hold the three decoders
;; (ppro_decoder), execution ports p0/p1 (ppro_core), the load port p2
;; (ppro_load) and the store ports p3/p4 (ppro_store); see the
;; define_cpu_unit declarations in this file.
67 (define_automaton "ppro_decoder,ppro_core,ppro_idiv,ppro_fdiv,ppro_load,ppro_store")
69 ;; Simple instructions of the register-register form have only one uop.
70 ;; Load instructions are also only one uop.  Store instructions decode to
71 ;; two uops, and simple read-modify instructions also take two uops.
72 ;; Simple instructions of the register-memory form have two to three uops.
73 ;; Simple read-modify-write instructions have four uops.  The rules for
74 ;; the decoder are simple:
75 ;;  - an instruction with 1 uop can be decoded by any of the three
76 ;;    decoders in one cycle.
77 ;;  - an instruction with 2 to 4 uops can be decoded only by decoder 0
78 ;;    but still in only one cycle.
79 ;;  - a complex (microcode) instruction can also only be decoded by
80 ;;    decoder 0, and this takes an unspecified number of cycles.
82 ;; The goal is to schedule such that we have a few-one-one uops sequence
83 ;; in each cycle, to decode as many instructions per cycle as possible.
;; The three instruction decoders, all members of the ppro_decoder
;; automaton.
84 (define_cpu_unit "decoder0" "ppro_decoder")
85 (define_cpu_unit "decoder1" "ppro_decoder")
86 (define_cpu_unit "decoder2" "ppro_decoder")
88 ;; We first wish to find an instruction for decoder0, so exclude
89 ;; decoder1 and decoder2 from being reserved until decoder 0 is
90 ;; reserved.
91 (presence_set "decoder1" "decoder0")
92 (presence_set "decoder2" "decoder0")
94 ;; Most instructions can be decoded on any of the three decoders.
;; "decodern" is the shorthand used by the reservations in this file
;; for "reserve any one of decoder0, decoder1 or decoder2".
95 (define_reservation "decodern" "(decoder0|decoder1|decoder2)")
97 ;; The out-of-order core has five pipelines.  During each cycle, the core
98 ;; may dispatch zero or one uop on the port of any of the five pipelines
99 ;; so the maximum number of dispatched uops per cycle is 5.  In practice,
100 ;; 3 uops per cycle is more realistic.
102 ;; Two of the five pipelines contain several execution units:
104 ;; Port 0       Port 1          Port 2          Port 3          Port 4
105 ;; ALU          ALU             LOAD            SAC             SDA
106 ;; FPU          JUE
107 ;; AGU          MMX
108 ;; MMX          P3FPU
109 ;; P3FPU
111 ;; (SAC=Store Address Calculation, SDA=Store Data Unit, P3FPU = SSE unit,
112 ;;  JUE = Jump Execution Unit, AGU = Address Generation Unit)
;; One unit per dispatch port: p0 and p1 (execution) in ppro_core, p2
;; (load) in ppro_load, p3 and p4 (store) in ppro_store.  The idiv and
;; fdiv units sit in their own automata and are used by the divide
;; reservations to model divide issue latency.
114 (define_cpu_unit "p0,p1" "ppro_core")
115 (define_cpu_unit "p2" "ppro_load")
116 (define_cpu_unit "p3,p4" "ppro_store")
117 (define_cpu_unit "idiv" "ppro_idiv")
118 (define_cpu_unit "fdiv" "ppro_fdiv")
120 ;; Only the irregular instructions have to be modeled here.  A load
121 ;; increases the latency by 2 or 3, or by nothing if the manual gives
122 ;; a latency already.  Store latencies are not accounted for.
124 ;; The simple instructions follow a very regular pattern of 1 uop per
125 ;; reg-reg operation, 1 uop per load on port 2, and 2 uops per store
126 ;; on port 4 and port 3.  These instructions are modelled at the bottom
127 ;; of this file.
129 ;; For microcoded instructions we don't know how many uops are produced.
130 ;; These instructions are the "complex" ones in the Intel manuals.  All
131 ;; we _do_ know is that they typically produce four or more uops, so
132 ;; they can only be decoded on decoder0.  Modelling their latencies
133 ;; doesn't make sense because we don't know how these instructions are
134 ;; executed in the core.  So we just model that they can only be decoded
135 ;; on decoder 0, and say that it takes a little while before the result
136 ;; is available.
;; Insn types other, multi, call, callv and str: decoder0 only, no
;; execution port reserved, default latency 6.
137 (define_insn_reservation "ppro_complex_insn" 6
138                          (and (eq_attr "cpu" "pentiumpro")
139                               (eq_attr "type" "other,multi,call,callv,str"))
140                          "decoder0")
142 ;; imov with memory operands does not use the integer units.
;; Register-only move: any decoder, then port 0 or port 1; latency 1.
143 (define_insn_reservation "ppro_imov" 1
144                          (and (eq_attr "cpu" "pentiumpro")
145                               (and (eq_attr "memory" "none")
146                                    (eq_attr "type" "imov")))
147                          "decodern,(p0|p1)")
;; Load: any decoder, then the load port p2; latency 4.
149 (define_insn_reservation "ppro_imov_load" 4
150                          (and (eq_attr "cpu" "pentiumpro")
151                               (and (eq_attr "memory" "load")
152                                    (eq_attr "type" "imov")))
153                          "decodern,p2")
;; Store: decoder0 only, then p4 and p3 in the same cycle; latency 1.
155 (define_insn_reservation "ppro_imov_store" 1
156                          (and (eq_attr "cpu" "pentiumpro")
157                               (and (eq_attr "memory" "store")
158                                    (eq_attr "type" "imov")))
159                          "decoder0,p4+p3")
161 ;; imovx always decodes to one uop, and also doesn't use the integer
162 ;; units if it has memory operands.
;; Register-only form: any decoder, then port 0 or port 1; latency 1.
163 (define_insn_reservation "ppro_imovx" 1
164                          (and (eq_attr "cpu" "pentiumpro")
165                               (and (eq_attr "memory" "none")
166                                    (eq_attr "type" "imovx")))
167                          "decodern,(p0|p1)")
;; Load form: any decoder, then the load port p2; latency 4.
169 (define_insn_reservation "ppro_imovx_load" 4
170                          (and (eq_attr "cpu" "pentiumpro")
171                               (and (eq_attr "memory" "load")
172                                    (eq_attr "type" "imovx")))
173                          "decodern,p2")
175 ;; lea executes on port 0 with latency one and throughput 1.
;; Any decoder, then one cycle on p0.
176 (define_insn_reservation "ppro_lea" 1
177                          (and (eq_attr "cpu" "pentiumpro")
178                               (and (eq_attr "memory" "none")
179                                    (eq_attr "type" "lea")))
180                          "decodern,p0")
182 ;; Shift and rotate execute on port 0 with latency and throughput 1.
183 ;; The load and store units need to be reserved when memory operands
184 ;; are involved.
;; Register-only form: any decoder, then p0; latency 1.
185 (define_insn_reservation "ppro_shift_rotate" 1
186                          (and (eq_attr "cpu" "pentiumpro")
187                               (and (eq_attr "memory" "none")
188                                    (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
189                          "decodern,p0")
;; Any "memory" attribute other than "none": decoder0 only, then p2
;; together with p0, then p4 together with p3; latency 4.
191 (define_insn_reservation "ppro_shift_rotate_mem" 4
192                          (and (eq_attr "cpu" "pentiumpro")
193                               (and (eq_attr "memory" "!none")
194                                    (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
195                          "decoder0,p2+p0,p4+p3")
198 ;; The P6 has a sophisticated branch prediction mechanism to minimize
199 ;; latencies due to branching.  In particular, it has a fast way to
200 ;; execute branches that are taken multiple times (such as in loops).
201 ;; Branches not taken suffer no penalty, and correctly predicted
202 ;; branches cost only one fetch cycle.  Mispredicted branches are very
203 ;; costly: typically 15 cycles and possibly as many as 26 cycles.
205 ;; Unfortunately all this makes it quite difficult to properly model
206 ;; the latencies for the compiler.  Here I've made the choice to be
207 ;; optimistic and assume branches are often predicted correctly, so
208 ;; they have latency 1, and the decoders are not blocked.
210 ;; In addition, the model assumes a branch always decodes to only 1 uop,
211 ;; which is not exactly true because there are a few instructions that
212 ;; decode to 2 uops or microcode.  But this probably gives the best
213 ;; results because we can assume these instructions can decode on all
214 ;; decoders.
;; Branch without a memory operand: any decoder, then p1; latency 1.
215 (define_insn_reservation "ppro_branch" 1
216                          (and (eq_attr "cpu" "pentiumpro")
217                               (and (eq_attr "memory" "none")
218                                    (eq_attr "type" "ibr")))
219                          "decodern,p1")
221 ;; ??? Indirect branches probably have worse latency than this.
;; Branch with any "memory" attribute other than "none": decoder0
;; only, then p2 together with p1; latency 6.
222 (define_insn_reservation "ppro_indirect_branch" 6
223                          (and (eq_attr "cpu" "pentiumpro")
224                               (and (eq_attr "memory" "!none")
225                                    (eq_attr "type" "ibr")))
226                          "decoder0,p2+p1")
;; leave: decoder0 only, then p2 together with p0 or p1, then p0 or p1
;; again in the following cycle; latency 4.
228 (define_insn_reservation "ppro_leave" 4
229                          (and (eq_attr "cpu" "pentiumpro")
230                               (eq_attr "type" "leave"))
231                          "decoder0,p2+(p0|p1),(p0|p1)")
233 ;; imul has throughput one, but latency 4, and can only execute on port 0.
;; Register-only form: any decoder, then one cycle on p0.
234 (define_insn_reservation "ppro_imul" 4
235                          (and (eq_attr "cpu" "pentiumpro")
236                               (and (eq_attr "memory" "none")
237                                    (eq_attr "type" "imul")))
238                          "decodern,p0")
;; Any "memory" attribute other than "none": decoder0 only, then p2
;; together with p0.  The latency is the same 4 as the register form.
240 (define_insn_reservation "ppro_imul_mem" 4
241                          (and (eq_attr "cpu" "pentiumpro")
242                               (and (eq_attr "memory" "!none")
243                                    (eq_attr "type" "imul")))
244                          "decoder0,p2+p0")
246 ;; div and idiv are very similar, so we model them the same.
247 ;; QI, HI, and SI have issue latency 12, 21, and 37, respectively.
248 ;; These issue latencies are modelled via the ppro_idiv automaton:
;; every reservation below keeps the "idiv" unit busy for that many
;; consecutive cycles after the decode cycle.  All forms decode on
;; decoder0 only.
;; QImode, register operand: idiv busy for 2+1+9 = 12 cycles.
249 (define_insn_reservation "ppro_idiv_QI" 19
250                          (and (eq_attr "cpu" "pentiumpro")
251                               (and (eq_attr "memory" "none")
252                                    (and (eq_attr "mode" "QI")
253                                         (eq_attr "type" "idiv"))))
254                          "decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*9")
;; QImode, load operand: p2 is reserved in the first execute cycle;
;; idiv busy for 1+1+1+9 = 12 cycles.
256 (define_insn_reservation "ppro_idiv_QI_load" 19
257                          (and (eq_attr "cpu" "pentiumpro")
258                               (and (eq_attr "memory" "load")
259                                    (and (eq_attr "mode" "QI")
260                                         (eq_attr "type" "idiv"))))
261                          "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*9")
;; HImode, register operand: idiv busy for 3+1+17 = 21 cycles.
263 (define_insn_reservation "ppro_idiv_HI" 23
264                          (and (eq_attr "cpu" "pentiumpro")
265                               (and (eq_attr "memory" "none")
266                                    (and (eq_attr "mode" "HI")
267                                         (eq_attr "type" "idiv"))))
268                          "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*17")
;; HImode, load operand: idiv busy for 1+1+1+18 = 21 cycles.
270 (define_insn_reservation "ppro_idiv_HI_load" 23
271                          (and (eq_attr "cpu" "pentiumpro")
272                               (and (eq_attr "memory" "load")
273                                    (and (eq_attr "mode" "HI")
274                                         (eq_attr "type" "idiv"))))
275                          "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*18")
;; SImode, register operand: idiv busy for 3+1+33 = 37 cycles.
277 (define_insn_reservation "ppro_idiv_SI" 39
278                          (and (eq_attr "cpu" "pentiumpro")
279                               (and (eq_attr "memory" "none")
280                                    (and (eq_attr "mode" "SI")
281                                         (eq_attr "type" "idiv"))))
282                          "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*33")
;; SImode, load operand: idiv busy for 1+1+1+34 = 37 cycles.
284 (define_insn_reservation "ppro_idiv_SI_load" 39
285                          (and (eq_attr "cpu" "pentiumpro")
286                               (and (eq_attr "memory" "load")
287                                    (and (eq_attr "mode" "SI")
288                                         (eq_attr "type" "idiv"))))
289                          "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*34")
291 ;; Floating point operations always execute on port 0.
292 ;; ??? where do these latencies come from? fadd has latency 3 and
293 ;;     has throughput "1/cycle (align with FADD)".  What do they
294 ;;     mean and how can we model that?
;; "memory" none or unknown: any decoder, then p0; latency 3.
295 (define_insn_reservation "ppro_fop" 3
296                          (and (eq_attr "cpu" "pentiumpro")
297                               (and (eq_attr "memory" "none,unknown")
298                                    (eq_attr "type" "fop")))
299                          "decodern,p0")
;; Load operand: decoder0 only, p2 together with p0, then p0 for one
;; more cycle; latency 5.
301 (define_insn_reservation "ppro_fop_load" 5
302                          (and (eq_attr "cpu" "pentiumpro")
303                               (and (eq_attr "memory" "load")
304                                    (eq_attr "type" "fop")))
305                          "decoder0,p2+p0,p0")
;; Store operand: decoder0 only, p0 for three cycles with the store
;; ports p4 and p3 joining in the third; latency 3.
307 (define_insn_reservation "ppro_fop_store" 3
308                          (and (eq_attr "cpu" "pentiumpro")
309                               (and (eq_attr "memory" "store")
310                                    (eq_attr "type" "fop")))
311                          "decoder0,p0,p0,p0+p4+p3")
;; Load and store ("both"): decoder0 only, p2 together with p0, then
;; p0 together with p4 and p3; latency 5.
313 (define_insn_reservation "ppro_fop_both" 5
314                          (and (eq_attr "cpu" "pentiumpro")
315                               (and (eq_attr "memory" "both")
316                                    (eq_attr "type" "fop")))
317                          "decoder0,p2+p0,p0+p4+p3")
;; fsgn: any decoder, then p0; latency 1.  No "memory" test, so this
;; applies whatever the memory attribute is.
319 (define_insn_reservation "ppro_fsgn" 1
320                          (and (eq_attr "cpu" "pentiumpro")
321                               (eq_attr "type" "fsgn"))
322                          "decodern,p0")
;; fistp: decoder0 only, p0 for two cycles, then the store ports p4
;; and p3 together; latency 5.
324 (define_insn_reservation "ppro_fistp" 5
325                          (and (eq_attr "cpu" "pentiumpro")
326                               (eq_attr "type" "fistp"))
327                          "decoder0,p0*2,p4+p3")
;; fcmov: decoder0 only, then p0 for two cycles; latency 2.
329 (define_insn_reservation "ppro_fcmov" 2
330                          (and (eq_attr "cpu" "pentiumpro")
331                               (eq_attr "type" "fcmov"))
332                          "decoder0,p0*2")
;; FP compare, register operands: any decoder, then p0; latency 1.
334 (define_insn_reservation "ppro_fcmp" 1
335                          (and (eq_attr "cpu" "pentiumpro")
336                               (and (eq_attr "memory" "none")
337                                    (eq_attr "type" "fcmp")))
338                          "decodern,p0")
;; FP compare with a load operand: decoder0 only, then p2 together
;; with p0; latency 4.
340 (define_insn_reservation "ppro_fcmp_load" 4
341                          (and (eq_attr "cpu" "pentiumpro")
342                               (and (eq_attr "memory" "load")
343                                    (eq_attr "type" "fcmp")))
344                          "decoder0,p2+p0")
;; FP register-to-register move: any decoder, then p0; latency 1.
346 (define_insn_reservation "ppro_fmov" 1
347                          (and (eq_attr "cpu" "pentiumpro")
348                               (and (eq_attr "memory" "none")
349                                    (eq_attr "type" "fmov")))
350                          "decodern,p0")
;; FP load in any mode except XF: any decoder, then the load port p2.
;; NOTE(review): the latency here is 1, whereas the integer load
;; reservations in this file use 4 -- confirm this is intended.
352 (define_insn_reservation "ppro_fmov_load" 1
353                          (and (eq_attr "cpu" "pentiumpro")
354                               (and (eq_attr "memory" "load")
355                                    (and (eq_attr "mode" "!XF")
356                                         (eq_attr "type" "fmov"))))
357                          "decodern,p2")
;; XFmode load: decoder0 only, then p2 together with p0 for two
;; cycles; latency 3.
359 (define_insn_reservation "ppro_fmov_XF_load" 3
360                          (and (eq_attr "cpu" "pentiumpro")
361                               (and (eq_attr "memory" "load")
362                                    (and (eq_attr "mode" "XF")
363                                         (eq_attr "type" "fmov"))))
364                          "decoder0,(p2+p0)*2")
;; FP store in any mode except XF: any decoder, then p0; latency 1.
;; NOTE(review): unlike the other store reservations in this file,
;; this one reserves p0 rather than the store ports p4+p3 -- confirm
;; this is intended.
366 (define_insn_reservation "ppro_fmov_store" 1
367                          (and (eq_attr "cpu" "pentiumpro")
368                               (and (eq_attr "memory" "store")
369                                    (and (eq_attr "mode" "!XF")
370                                         (eq_attr "type" "fmov"))))
371                          "decodern,p0")
;; XFmode store: decoder0 only, then p0 with p4, then p0 with p3;
;; latency 3.
373 (define_insn_reservation "ppro_fmov_XF_store" 3
374                          (and (eq_attr "cpu" "pentiumpro")
375                               (and (eq_attr "memory" "store")
376                                    (and (eq_attr "mode" "XF")
377                                         (eq_attr "type" "fmov"))))
378                          "decoder0,(p0+p4),(p0+p3)")
380 ;; fmul executes on port 0 with latency 5.  It has issue latency 2,
381 ;; but we don't model this.
;; NOTE(review): the reservations below do keep p0 busy for two
;; consecutive cycles (p0*2, and p2+p0 followed by p0), which looks
;; like a model of that issue latency -- confirm whether the sentence
;; above is stale.
;; Register-only form: decoder0 only, then p0 for two cycles.
382 (define_insn_reservation "ppro_fmul" 5
383                          (and (eq_attr "cpu" "pentiumpro")
384                               (and (eq_attr "memory" "none")
385                                    (eq_attr "type" "fmul")))
386                          "decoder0,p0*2")
;; Load form: decoder0 only, p2 together with p0, then p0 again;
;; latency 6.
388 (define_insn_reservation "ppro_fmul_load" 6
389                          (and (eq_attr "cpu" "pentiumpro")
390                               (and (eq_attr "memory" "load")
391                                    (eq_attr "type" "fmul")))
392                          "decoder0,p2+p0,p0")
394 ;; fdiv latencies depend on the mode of the operands.  XFmode gives
395 ;; a latency of 38 cycles, DFmode gives 32, and SFmode gives latency 18.
396 ;; Division by a power of 2 takes only 9 cycles, but we cannot model
397 ;; that.  Throughput is equal to latency - 1, which we model using the
398 ;; ppro_fdiv automaton: each reservation keeps the "fdiv" unit busy
;; for (register-form latency - 1) cycles.  The same reservations also
;; cover insns of type fpspc.  Load forms decode on decoder0 only,
;; reserve p2 in the first execute cycle and add one cycle of latency.
;; SFmode, register operands: fdiv busy for 1+16 = 17 cycles.
399 (define_insn_reservation "ppro_fdiv_SF" 18
400                          (and (eq_attr "cpu" "pentiumpro")
401                               (and (eq_attr "memory" "none")
402                                    (and (eq_attr "mode" "SF")
403                                         (eq_attr "type" "fdiv,fpspc"))))
404                          "decodern,p0+fdiv,fdiv*16")
;; SFmode, load operand.
406 (define_insn_reservation "ppro_fdiv_SF_load" 19
407                          (and (eq_attr "cpu" "pentiumpro")
408                               (and (eq_attr "memory" "load")
409                                    (and (eq_attr "mode" "SF")
410                                         (eq_attr "type" "fdiv,fpspc"))))
411                          "decoder0,p2+p0+fdiv,fdiv*16")
;; DFmode, register operands: fdiv busy for 1+30 = 31 cycles.
413 (define_insn_reservation "ppro_fdiv_DF" 32
414                          (and (eq_attr "cpu" "pentiumpro")
415                               (and (eq_attr "memory" "none")
416                                    (and (eq_attr "mode" "DF")
417                                         (eq_attr "type" "fdiv,fpspc"))))
418                          "decodern,p0+fdiv,fdiv*30")
;; DFmode, load operand.
420 (define_insn_reservation "ppro_fdiv_DF_load" 33
421                          (and (eq_attr "cpu" "pentiumpro")
422                               (and (eq_attr "memory" "load")
423                                    (and (eq_attr "mode" "DF")
424                                         (eq_attr "type" "fdiv,fpspc"))))
425                          "decoder0,p2+p0+fdiv,fdiv*30")
;; XFmode, register operands: fdiv busy for 1+36 = 37 cycles.
427 (define_insn_reservation "ppro_fdiv_XF" 38
428                          (and (eq_attr "cpu" "pentiumpro")
429                               (and (eq_attr "memory" "none")
430                                    (and (eq_attr "mode" "XF")
431                                         (eq_attr "type" "fdiv,fpspc"))))
432                          "decodern,p0+fdiv,fdiv*36")
;; XFmode, load operand.
434 (define_insn_reservation "ppro_fdiv_XF_load" 39
435                          (and (eq_attr "cpu" "pentiumpro")
436                               (and (eq_attr "memory" "load")
437                                    (and (eq_attr "mode" "XF")
438                                         (eq_attr "type" "fdiv,fpspc"))))
439                          "decoder0,p2+p0+fdiv,fdiv*36")
441 ;; MMX instructions can execute on either port 0 or port 1 with a
442 ;; throughput of 1/cycle.
443 ;;   on port 0: - ALU (latency 1)
444 ;;              - Multiplier Unit (latency 3)
445 ;;   on port 1: - ALU (latency 1)
446 ;;              - Shift Unit (latency 1)
448 ;; MMX instructions are either of the type reg-reg, or read-modify, and
449 ;; except for mmxshft and mmxmul they can execute on port 0 or port 1,
450 ;; so they behave as "simple" instructions that need no special modelling.
451 ;; We only have to model mmxshft and mmxmul.
;; MMX shift, register operands: any decoder, then p1; latency 1.
452 (define_insn_reservation "ppro_mmx_shft" 1
453                          (and (eq_attr "cpu" "pentiumpro")
454                               (and (eq_attr "memory" "none")
455                                    (eq_attr "type" "mmxshft")))
456                          "decodern,p1")
;; MMX shift with a load operand: decoder0 only, then the load port p2
;; together with p1; latency 2.  The condition must test for "load";
;; testing for "none" would merely repeat the condition of
;; ppro_mmx_shft and leave the load form unmatched here.
458 (define_insn_reservation "ppro_mmx_shft_load" 2
459                          (and (eq_attr "cpu" "pentiumpro")
460                               (and (eq_attr "memory" "load")
461                                    (eq_attr "type" "mmxshft")))
462                          "decoder0,p2+p1")
;; MMX multiply, register operands: any decoder, then p0; latency 3.
464 (define_insn_reservation "ppro_mmx_mul" 3
465                          (and (eq_attr "cpu" "pentiumpro")
466                               (and (eq_attr "memory" "none")
467                                    (eq_attr "type" "mmxmul")))
468                          "decodern,p0")
;; MMX multiply with a load operand: decoder0 only, then the load port
;; p2 together with p0; latency 3.  The condition must test for
;; "load"; testing for "none" would merely repeat the condition of
;; ppro_mmx_mul and leave the load form unmatched here.
470 (define_insn_reservation "ppro_mmx_mul_load" 3
471                          (and (eq_attr "cpu" "pentiumpro")
472                               (and (eq_attr "memory" "load")
473                                    (eq_attr "type" "mmxmul")))
474                          "decoder0,p2+p0")
;; DImode mmxcvt insns: any decoder, then p1; latency 4.  There is no
;; "memory" test, so this applies whatever the memory attribute is.
476 (define_insn_reservation "ppro_sse_mmxcvt" 4
477                          (and (eq_attr "cpu" "pentiumpro")
478                               (and (eq_attr "mode" "DI")
479                                    (eq_attr "type" "mmxcvt")))
480                          "decodern,p1")
482 ;; FIXME: These are Pentium III only, but we cannot tell here if
483 ;; we're generating code for PentiumPro/Pentium II or Pentium III
484 ;; (define_insn_reservation "ppro_sse_mmxshft" 2
485 ;;                       (and (eq_attr "cpu" "pentiumpro")
486 ;;                            (and (eq_attr "mode" "DI")
487 ;;                                 (eq_attr "type" "mmxshft")))
488 ;;                       "decodern,p0")
490 ;; SSE is very complicated, and takes a bit more effort.
491 ;; ??? I assumed that all SSE instructions decode on decoder0,
492 ;;     but is this correct?
494 ;; The sfence instruction.
;; It is recognised as type "sse" with "memory" unknown: decoder0
;; only, then the store ports p4 and p3 together; latency 3.
495 (define_insn_reservation "ppro_sse_sfence" 3
496                          (and (eq_attr "cpu" "pentiumpro")
497                               (and (eq_attr "memory" "unknown")
498                                    (eq_attr "type" "sse")))
499                          "decoder0,p4+p3")
501 ;; FIXME: This reservation is all wrong when we're scheduling sqrtss.
;; SFmode insns of type "sse": any decoder, then p0; latency 3.  There
;; is no "memory" test in this condition.
502 (define_insn_reservation "ppro_sse_SF" 3
503                          (and (eq_attr "cpu" "pentiumpro")
504                               (and (eq_attr "mode" "SF")
505                                    (eq_attr "type" "sse")))
506                          "decodern,p0")
;; Scalar (SFmode) SSE arithmetic.  Each operation has a register form
;; and a load form; the load forms decode on decoder0 only and reserve
;; the load port p2 together with the execution port, with the same
;; latency as the register form.
;; Add/subtract, register operands: any decoder, then p1; latency 3.
508 (define_insn_reservation "ppro_sse_add_SF" 3
509                          (and (eq_attr "cpu" "pentiumpro")
510                               (and (eq_attr "memory" "none")
511                                    (and (eq_attr "mode" "SF")
512                                         (eq_attr "type" "sseadd,sseadd1"))))
513                          "decodern,p1")
;; Add/subtract, load operand.
515 (define_insn_reservation "ppro_sse_add_SF_load" 3
516                          (and (eq_attr "cpu" "pentiumpro")
517                               (and (eq_attr "memory" "load")
518                                    (and (eq_attr "mode" "SF")
519                                         (eq_attr "type" "sseadd,sseadd1"))))
520                          "decoder0,p2+p1")
;; Compare, register operands: decoder0 only, then p1; latency 3.
522 (define_insn_reservation "ppro_sse_cmp_SF" 3
523                          (and (eq_attr "cpu" "pentiumpro")
524                               (and (eq_attr "memory" "none")
525                                    (and (eq_attr "mode" "SF")
526                                         (eq_attr "type" "ssecmp"))))
527                          "decoder0,p1")
;; Compare, load operand.
529 (define_insn_reservation "ppro_sse_cmp_SF_load" 3
530                          (and (eq_attr "cpu" "pentiumpro")
531                               (and (eq_attr "memory" "load")
532                                    (and (eq_attr "mode" "SF")
533                                         (eq_attr "type" "ssecmp"))))
534                          "decoder0,p2+p1")
;; ssecomi, register operands: any decoder, then p0; latency 1.
536 (define_insn_reservation "ppro_sse_comi_SF" 1
537                          (and (eq_attr "cpu" "pentiumpro")
538                               (and (eq_attr "memory" "none")
539                                    (and (eq_attr "mode" "SF")
540                                         (eq_attr "type" "ssecomi"))))
541                          "decodern,p0")
;; ssecomi, load operand.
543 (define_insn_reservation "ppro_sse_comi_SF_load" 1
544                          (and (eq_attr "cpu" "pentiumpro")
545                               (and (eq_attr "memory" "load")
546                                    (and (eq_attr "mode" "SF")
547                                         (eq_attr "type" "ssecomi"))))
548                          "decoder0,p2+p0")
;; Multiply, register operands: any decoder, then p0; latency 4.
550 (define_insn_reservation "ppro_sse_mul_SF" 4
551                          (and (eq_attr "cpu" "pentiumpro")
552                               (and (eq_attr "memory" "none")
553                                    (and (eq_attr "mode" "SF")
554                                         (eq_attr "type" "ssemul"))))
555                         "decodern,p0")
;; Multiply, load operand.
557 (define_insn_reservation "ppro_sse_mul_SF_load" 4
558                          (and (eq_attr "cpu" "pentiumpro")
559                               (and (eq_attr "memory" "load")
560                                    (and (eq_attr "mode" "SF")
561                                         (eq_attr "type" "ssemul"))))
562                         "decoder0,p2+p0")
564 ;; FIXME: ssediv doesn't close p0 for 17 cycles, surely???
;; SFmode SSE divide, register operands: decoder0 only, then p0 held
;; for 17 consecutive cycles; latency 18.
565 (define_insn_reservation "ppro_sse_div_SF" 18
566                          (and (eq_attr "cpu" "pentiumpro")
567                               (and (eq_attr "memory" "none")
568                                    (and (eq_attr "mode" "SF")
569                                         (eq_attr "type" "ssediv"))))
570                          "decoder0,p0*17")
;; SFmode SSE divide with a load operand: decoder0 only, the load port
;; p2 together with p0 in the first execute cycle, then p0 for 16 more
;; cycles; latency 18.  The condition must test for "load"; testing
;; for "none" would merely repeat the condition of ppro_sse_div_SF and
;; leave the load form unmatched here.
572 (define_insn_reservation "ppro_sse_div_SF_load" 18
573                          (and (eq_attr "cpu" "pentiumpro")
574                               (and (eq_attr "memory" "load")
575                                    (and (eq_attr "mode" "SF")
576                                         (eq_attr "type" "ssediv"))))
577                          "decoder0,(p2+p0),p0*16")
;; sseicvt in SFmode: decoder0 only, then p2 together with p1 for two
;; cycles; latency 4.  No "memory" test.
579 (define_insn_reservation "ppro_sse_icvt_SF" 4
580                          (and (eq_attr "cpu" "pentiumpro")
581                               (and (eq_attr "mode" "SF")
582                                    (eq_attr "type" "sseicvt")))
583                          "decoder0,(p2+p1)*2")
;; sseicvt in SImode: decoder0 only, then p2 together with p1 for one
;; cycle; latency 3.  No "memory" test.
585 (define_insn_reservation "ppro_sse_icvt_SI" 3
586                          (and (eq_attr "cpu" "pentiumpro")
587                               (and (eq_attr "mode" "SI")
588                                    (eq_attr "type" "sseicvt")))
589                          "decoder0,(p2+p1)")
;; SFmode SSE move, register operands: decoder0 only, then p0 or p1;
;; latency 3.
591 (define_insn_reservation "ppro_sse_mov_SF" 3
592                          (and (eq_attr "cpu" "pentiumpro")
593                               (and (eq_attr "memory" "none")
594                                    (and (eq_attr "mode" "SF")
595                                         (eq_attr "type" "ssemov"))))
596                          "decoder0,(p0|p1)")
;; SFmode SSE load: decoder0 only, then p2 together with p0 or p1;
;; latency 3.
598 (define_insn_reservation "ppro_sse_mov_SF_load" 3
599                          (and (eq_attr "cpu" "pentiumpro")
600                               (and (eq_attr "memory" "load")
601                                    (and (eq_attr "mode" "SF")
602                                         (eq_attr "type" "ssemov"))))
603                          "decoder0,p2+(p0|p1)")
;; SFmode SSE store: decoder0 only, then the store ports p4 and p3
;; together; latency 3.
605 (define_insn_reservation "ppro_sse_mov_SF_store" 3
606                          (and (eq_attr "cpu" "pentiumpro")
607                               (and (eq_attr "memory" "store")
608                                    (and (eq_attr "mode" "SF")
609                                         (eq_attr "type" "ssemov"))))
610                          "decoder0,p4+p3")
612 (define_insn_reservation "ppro_sse_V4SF" 4
613                          (and (eq_attr "cpu" "pentiumpro")
614                               (and (eq_attr "mode" "V4SF")
615                                    (eq_attr "type" "sse")))
616                          "decoder0,p1*2")
618 (define_insn_reservation "ppro_sse_add_V4SF" 3
619                          (and (eq_attr "cpu" "pentiumpro")
620                               (and (eq_attr "memory" "none")
621                                    (and (eq_attr "mode" "V4SF")
622                                         (eq_attr "type" "sseadd,sseadd1"))))
623                          "decoder0,p1*2")
625 (define_insn_reservation "ppro_sse_add_V4SF_load" 3
626                          (and (eq_attr "cpu" "pentiumpro")
627                               (and (eq_attr "memory" "load")
628                                    (and (eq_attr "mode" "V4SF")
629                                         (eq_attr "type" "sseadd,sseadd1"))))
630                          "decoder0,(p2+p1)*2")
;; V4SF insns of type "ssecmp" with no memory operand ("memory" is "none").
;; Default latency is 3.  Reservation: decoder0, then p1 for two
;; consecutive cycles.
632 (define_insn_reservation "ppro_sse_cmp_V4SF" 3
633                          (and (eq_attr "cpu" "pentiumpro")
634                               (and (eq_attr "memory" "none")
635                                    (and (eq_attr "mode" "V4SF")
636                                         (eq_attr "type" "ssecmp"))))
637                          "decoder0,p1*2")
;; V4SF insns of type "ssecmp" whose "memory" attribute is "load".
;; Default latency is 3.  Reservation: decoder0, then p2 and p1 together
;; for two consecutive cycles.
639 (define_insn_reservation "ppro_sse_cmp_V4SF_load" 3
640                          (and (eq_attr "cpu" "pentiumpro")
641                               (and (eq_attr "memory" "load")
642                                    (and (eq_attr "mode" "V4SF")
643                                         (eq_attr "type" "ssecmp"))))
644                          "decoder0,(p2+p1)*2")
;; V4SF insns of type "ssecvt" whose "memory" attribute is "none" or
;; "unknown".  Default latency is 3.  Reservation: decoder0, then p1 for
;; two consecutive cycles.
646 (define_insn_reservation "ppro_sse_cvt_V4SF" 3
647                          (and (eq_attr "cpu" "pentiumpro")
648                               (and (eq_attr "memory" "none,unknown")
649                                    (and (eq_attr "mode" "V4SF")
650                                         (eq_attr "type" "ssecvt"))))
651                          "decoder0,p1*2")
;; V4SF insns of type "ssecvt" whose "memory" attribute is anything other
;; than "none" or "unknown" (the leading "!" negates the whole value list).
;; Default latency is 4.  Reservation: decoder0, then p1, then p4 and p3
;; together.
;; The type test is "ssecvt", matching the reservation's name; it used to
;; read "ssecmp", which left V4SF conversions with a memory operand without
;; any reservation and duplicated the "ssecmp" load case.
653 (define_insn_reservation "ppro_sse_cvt_V4SF_other" 4
654                          (and (eq_attr "cpu" "pentiumpro")
655                               (and (eq_attr "memory" "!none,unknown")
656                                    (and (eq_attr "mode" "V4SF")
657                                         (eq_attr "type" "ssecvt"))))
658                          "decoder0,p1,p4+p3")
;; V4SF insns of type "ssemul" with no memory operand ("memory" is "none").
;; Default latency is 5.  Reservation: decoder0, then p0 for two
;; consecutive cycles.
660 (define_insn_reservation "ppro_sse_mul_V4SF" 5
661                          (and (eq_attr "cpu" "pentiumpro")
662                               (and (eq_attr "memory" "none")
663                                    (and (eq_attr "mode" "V4SF")
664                                         (eq_attr "type" "ssemul"))))
665                         "decoder0,p0*2")
;; V4SF insns of type "ssemul" whose "memory" attribute is "load".
;; Default latency is 5.  Reservation: decoder0, then p2 and p0 together
;; for two consecutive cycles.
667 (define_insn_reservation "ppro_sse_mul_V4SF_load" 5
668                          (and (eq_attr "cpu" "pentiumpro")
669                               (and (eq_attr "memory" "load")
670                                    (and (eq_attr "mode" "V4SF")
671                                         (eq_attr "type" "ssemul"))))
672                         "decoder0,(p2+p0)*2")
674 ;; FIXME: Is p0 really occupied (closed to other insns) for this long?
;; V4SF insns of type "ssediv" with no memory operand ("memory" is "none").
;; Default latency is 48.  Reservation: decoder0, then p0 for 34
;; consecutive cycles.
675 (define_insn_reservation "ppro_sse_div_V4SF" 48
676                          (and (eq_attr "cpu" "pentiumpro")
677                               (and (eq_attr "memory" "none")
678                                    (and (eq_attr "mode" "V4SF")
679                                         (eq_attr "type" "ssediv"))))
680                          "decoder0,p0*34")
;; V4SF insns of type "ssediv" whose "memory" attribute is "load".
;; Default latency is 48.  Reservation: decoder0, then p2 and p0 together
;; for two cycles, then p0 alone for 32 more cycles (p0 is held for 34
;; cycles in total, as in the no-memory form).
682 (define_insn_reservation "ppro_sse_div_V4SF_load" 48
683                          (and (eq_attr "cpu" "pentiumpro")
684                               (and (eq_attr "memory" "load")
685                                    (and (eq_attr "mode" "V4SF")
686                                         (eq_attr "type" "ssediv"))))
687                          "decoder0,(p2+p0)*2,p0*32")
;; V4SF insns of type "sselog", "sselog1", "sseshuf" or "sseshuf1" with no
;; memory operand ("memory" is "none").  Default latency is 2.
;; Reservation: decodern, then p1 for one cycle.
;; NOTE(review): the other V4SF reservations nearby all reserve decoder0
;; rather than decodern, and hold their port for two cycles -- confirm that
;; "decodern,p1" is intended here.
689 (define_insn_reservation "ppro_sse_log_V4SF" 2
690                          (and (eq_attr "cpu" "pentiumpro")
691                               (and (eq_attr "memory" "none")
692                                    (and (eq_attr "mode" "V4SF")
693                                         (eq_attr "type" "sselog,sselog1,sseshuf,sseshuf1"))))
694                          "decodern,p1")
;; V4SF insns of type "sselog", "sselog1", "sseshuf" or "sseshuf1" whose
;; "memory" attribute is "load".  Default latency is 2.  Reservation:
;; decoder0, then p2 and p1 together for one cycle.
696 (define_insn_reservation "ppro_sse_log_V4SF_load" 2
697                          (and (eq_attr "cpu" "pentiumpro")
698                               (and (eq_attr "memory" "load")
699                                    (and (eq_attr "mode" "V4SF")
700                                         (eq_attr "type" "sselog,sselog1,sseshuf,sseshuf1"))))
701                          "decoder0,(p2+p1)")
;; V4SF insns of type "ssemov" with no memory operand ("memory" is "none").
;; Default latency is 1.  Reservation: decoder0, then two consecutive
;; cycles each of which takes either p0 or p1.
703 (define_insn_reservation "ppro_sse_mov_V4SF" 1
704                          (and (eq_attr "cpu" "pentiumpro")
705                               (and (eq_attr "memory" "none")
706                                    (and (eq_attr "mode" "V4SF")
707                                         (eq_attr "type" "ssemov"))))
708                          "decoder0,(p0|p1)*2")
;; V4SF insns of type "ssemov" whose "memory" attribute is "load".
;; Default latency is 2.  Reservation: decoder0, then p2 for two
;; consecutive cycles; neither p0 nor p1 is reserved.
710 (define_insn_reservation "ppro_sse_mov_V4SF_load" 2
711                          (and (eq_attr "cpu" "pentiumpro")
712                               (and (eq_attr "memory" "load")
713                                    (and (eq_attr "mode" "V4SF")
714                                         (eq_attr "type" "ssemov"))))
715                          "decoder0,p2*2")
;; V4SF insns of type "ssemov" whose "memory" attribute is "store".
;; Default latency is 3.  Reservation: decoder0, then p4 and p3 together
;; for two consecutive cycles.
717 (define_insn_reservation "ppro_sse_mov_V4SF_store" 3
718                          (and (eq_attr "cpu" "pentiumpro")
719                               (and (eq_attr "memory" "store")
720                                    (and (eq_attr "mode" "V4SF")
721                                         (eq_attr "type" "ssemov"))))
722                          "decoder0,(p4+p3)*2")
724 ;; All other instructions are modelled as simple instructions.
725 ;; We have already modelled all i387 floating point instructions, so all
726 ;; other instructions execute on either port 0 or port 1.  This includes
727 ;; the ALU units, and the MMX units.
729 ;; reg-reg instructions produce 1 uop so they can be decoded on any of
730 ;; the three decoders.
;; Catch-all for the listed integer ALU, integer-SSE and MMX insn types when
;; the "memory" attribute is "none" or "unknown".  Default latency is 1.
;; Reservation: decodern, then either p0 or p1 for one cycle.
731 (define_insn_reservation "ppro_insn" 1
732                          (and (eq_attr "cpu" "pentiumpro")
733                               (and (eq_attr "memory" "none,unknown")
734                                    (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")))
735                          "decodern,(p0|p1)")
737 ;; Read-modify and register-memory instructions have 2 or 3 uops,
738 ;; so they have to be decoded on decoder0.
;; Same insn types as "ppro_insn", but with the "memory" attribute equal to
;; "load".  Default latency is 3.  Reservation: decoder0, then p2 together
;; with either p0 or p1 on the following cycle.
739 (define_insn_reservation "ppro_insn_load" 3
740                          (and (eq_attr "cpu" "pentiumpro")
741                               (and (eq_attr "memory" "load")
742                                    (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")))
743                          "decoder0,p2+(p0|p1)")
;; Same insn types as "ppro_insn", but with the "memory" attribute equal to
;; "store".  Reservation: decoder0, then either p0 or p1, then p4 and p3
;; together.  The default latency is 1 even though the reservation spans
;; three cycles.
745 (define_insn_reservation "ppro_insn_store" 1
746                          (and (eq_attr "cpu" "pentiumpro")
747                               (and (eq_attr "memory" "store")
748                                    (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")))
749                          "decoder0,(p0|p1),p4+p3")
751 ;; read-modify-store instructions produce 4 uops so they have to be
752 ;; decoded on decoder0 as well.
;; Same insn types as "ppro_insn", but with the "memory" attribute equal to
;; "both".  Default latency is 4.  Reservation: decoder0, then p2 together
;; with either p0 or p1, then p4 and p3 together.
753 (define_insn_reservation "ppro_insn_both" 4
754                          (and (eq_attr "cpu" "pentiumpro")
755                               (and (eq_attr "memory" "both")
756                                    (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseishft1,sseimul,mmx,mmxadd,mmxcmp")))
757                          "decoder0,p2+(p0|p1),p4+p3")