1 ;; AMD K6/K6-2 Scheduling
2 ;; Copyright (C) 2002, 2004
3 ;; Free Software Foundation, Inc.
5 ;; This file is part of GCC.
7 ;; GCC is free software; you can redistribute it and/or modify
8 ;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.
12 ;; GCC is distributed in the hope that it will be useful,
13 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ;; GNU General Public License for more details.
17 ;; You should have received a copy of the GNU General Public License
18 ;; along with GCC; see the file COPYING. If not, write to
19 ;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
20 ;; Boston, MA 02110-1301, USA.
;; The K6 architecture is quite similar to PPro.  An important difference is
;; that there are only two decoders and they seem to be much slower than
;; any of the execution units.  So we have to pay much more attention to
;; proper scheduling for the decoders.
26 ;; FIXME: We don't do that right now. A good start would be to sort the
27 ;; instructions based on length.
29 ;; This description is based on data from the following documents:
31 ;; "AMD-K6 Processor Data Sheet (Preliminary information)"
32 ;; Advanced Micro Devices, Inc., 1998.
34 ;; "AMD-K6 Processor Code Optimization Application Note"
35 ;; Advanced Micro Devices, Inc., 2000.
37 ;; CPU execution units of the K6:
39 ;; store describes the Store unit. This unit is not modelled
40 ;; completely and it is only used to model lea operation.
41 ;; Otherwise it lies outside of any critical path.
42 ;; load describes the Load unit
43 ;; alux describes the Integer X unit
44 ;; mm describes the Multimedia unit, which shares a pipe
45 ;; with the Integer X unit. This unit is used for MMX,
46 ;; which is not implemented for K6.
47 ;; aluy describes the Integer Y unit
48 ;; fpu describes the FPU unit
49 ;; branch describes the Branch unit
;; The fp unit is not pipelined, and it can only do one operation per two
;; cycles, including fxch.
54 ;; Generally this is a very poor description, but at least no worse than
55 ;; the old description, and a lot easier to extend to something more
56 ;; reasonable if anyone still cares enough about this architecture in 2004.
58 ;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real.
;; One automaton per independently modelled resource group: the decoders,
;; the load unit, the store unit, the two integer units, the FPU and the
;; branch unit.  Every define_cpu_unit below is assigned to one of these.
(define_automaton
  "k6_decoder,k6_load_unit,k6_store_unit,k6_integer_units,k6_fpu_unit,k6_branch_unit")
62 ;; The K6 instruction decoding begins before the on-chip instruction cache is
63 ;; filled. Depending on the length of the instruction, two simple instructions
64 ;; can be decoded in two parallel short decoders, or one complex instruction can
65 ;; be decoded in either the long or the vector decoder. For all practical
66 ;; purposes, the long and vector decoder can be modelled as one decoder.
;; Decoder resources, all living in the k6_decoder automaton: two short
;; decoders plus a single unit standing in for the long/vector decoder.
(define_cpu_unit "k6_decode_short0,k6_decode_short1" "k6_decoder")
(define_cpu_unit "k6_decode_long" "k6_decoder")

;; The long decoder can never be reserved in the same cycle as either
;; short decoder.
(exclusion_set "k6_decode_long" "k6_decode_short0,k6_decode_short1")

;; Convenience names used by the insn reservations below:
;;   k6_decode_short   either one of the two short decoders
;;   k6_decode_vector  alias for the long decoder unit
(define_reservation "k6_decode_short" "k6_decode_short0|k6_decode_short1")
(define_reservation "k6_decode_vector" "k6_decode_long")
;; Execution resources.  Each unit is placed in the automaton named by the
;; second string; the two integer units share k6_integer_units.
(define_cpu_unit "k6_store"        "k6_store_unit")
(define_cpu_unit "k6_load"         "k6_load_unit")
(define_cpu_unit "k6_alux,k6_aluy" "k6_integer_units")
(define_cpu_unit "k6_fpu"          "k6_fpu_unit")
(define_cpu_unit "k6_branch"       "k6_branch_unit")
80 ;; Shift instructions and certain arithmetic are issued only on Integer X.
;; Register-only form (memory attribute "none"): latency 1, one short
;; decoder slot followed by one cycle on the Integer X unit.
(define_insn_reservation "k6_alux_only" 1
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "ishift,ishift1,rotate,rotate1,alu1,negnot,cld")
            (eq_attr "memory" "none")))
  "k6_decode_short,k6_alux")
;; Same insn types with memory attribute "load": latency 3; a short decoder
;; slot, then the load unit, then Integer X, one cycle each.
(define_insn_reservation "k6_alux_only_load" 3
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "ishift,ishift1,rotate,rotate1,alu1,negnot,cld")
            (eq_attr "memory" "load")))
  "k6_decode_short,k6_load,k6_alux")
;; Same insn types with memory attribute "store", "both" or "unknown":
;; latency 3.  These go through the long decoder and then reserve the load
;; unit, Integer X and the store unit on successive cycles.
(define_insn_reservation "k6_alux_only_store" 3
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "ishift,ishift1,rotate,rotate1,alu1,negnot,cld")
            (eq_attr "memory" "store,both,unknown")))
  "k6_decode_long,k6_load,k6_alux,k6_store")
99 ;; Integer divide and multiply can only be issued on Integer X, too.
;; Register-only integer multiply: latency 2, vector decode, then Integer X
;; is held for three cycles.
;; The memory test restricts this reservation to the register form so that
;; the k6_alu_imul_load and k6_alu_imul_store reservations can apply to the
;; memory forms; without it this condition matched every imul insn.
(define_insn_reservation "k6_alu_imul" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imul")
            (eq_attr "memory" "none")))
  "k6_decode_vector,k6_alux*3")
;; Integer multiply with memory attribute "load": latency 4; vector decode,
;; one cycle on the load unit, then three cycles on Integer X.
(define_insn_reservation "k6_alu_imul_load" 4
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imul")
            (eq_attr "memory" "load")))
  "k6_decode_vector,k6_load,k6_alux*3")
;; Integer multiply with memory attribute "store", "both" or "unknown":
;; latency 4; vector decode, load unit, three cycles on Integer X, and
;; finally the store unit.
(define_insn_reservation "k6_alu_imul_store" 4
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imul")
            (eq_attr "memory" "store,both,unknown")))
  "k6_decode_vector,k6_load,k6_alux*3,k6_store")
117 ;; ??? Guessed latencies based on the old pipeline description.
;; Register-only integer divide: latency 17; vector decode, then Integer X
;; stays reserved for all 17 cycles.
(define_insn_reservation "k6_alu_idiv" 17
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "idiv")
            (eq_attr "memory" "none")))
  "k6_decode_vector,k6_alux*17")
;; Integer divide with any memory attribute other than "none": latency 19;
;; vector decode, one cycle on the load unit, then 17 cycles on Integer X.
(define_insn_reservation "k6_alu_idiv_mem" 19
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "idiv")
            (eq_attr "memory" "!none")))
  "k6_decode_vector,k6_load,k6_alux*17")
130 ;; Basic word and doubleword ALU ops can be issued on both Integer units.
;; Register-only form: latency 1; a short decoder slot, then one cycle on
;; whichever of Integer X / Integer Y is free.
(define_insn_reservation "k6_alu" 1
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec,setcc")
            (eq_attr "memory" "none")))
  "k6_decode_short,k6_alux|k6_aluy")
;; Same insn types with memory attribute "load": latency 3; a short decoder
;; slot, the load unit, then either integer unit.
(define_insn_reservation "k6_alu_load" 3
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec,setcc")
            (eq_attr "memory" "load")))
  "k6_decode_short,k6_load,k6_alux|k6_aluy")
;; Same insn types with memory attribute "store", "both" or "unknown":
;; latency 3; long decoder, load unit, either integer unit, then the store
;; unit.
(define_insn_reservation "k6_alu_store" 3
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec,setcc")
            (eq_attr "memory" "store,both,unknown")))
  "k6_decode_long,k6_load,k6_alux|k6_aluy,k6_store")
149 ;; A "load immediate" operation does not require execution at all,
150 ;; it is available immediately after decoding. Special-case this.
;; Integer move with no memory access whose operand 1 satisfies
;; nonimmediate_operand: latency 1; a short decoder slot, then one cycle on
;; either integer unit.
(define_insn_reservation "k6_alu_imov" 1
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imov")
            (and (eq_attr "memory" "none")
                 (match_operand 1 "nonimmediate_operand"))))
  "k6_decode_short,k6_alux|k6_aluy")
;; Integer move with no memory access whose operand 1 satisfies
;; immediate_operand: latency 0.  Only a short decoder slot is reserved and
;; no execution unit, since the value needs no execution step.
;; NOTE(review): the reservation string and closing parenthesis were missing
;; from this definition; "k6_decode_short" is restored here -- confirm
;; against the upstream description.
(define_insn_reservation "k6_alu_imov_imm" 0
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imov")
            (and (eq_attr "memory" "none")
                 (match_operand 1 "immediate_operand"))))
  "k6_decode_short")
;; Integer move with memory attribute "load": latency 2; a short decoder
;; slot followed by one cycle on the load unit.  No integer unit is used.
(define_insn_reservation "k6_alu_imov_load" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imov")
            (eq_attr "memory" "load")))
  "k6_decode_short,k6_load")
;; Integer move with memory attribute "store": latency 1; a short decoder
;; slot followed by one cycle on the store unit.
(define_insn_reservation "k6_alu_imov_store" 1
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imov")
            (eq_attr "memory" "store")))
  "k6_decode_short,k6_store")
;; Integer move with memory attribute "both" or "unknown": latency 2; long
;; decoder, load unit, then either integer unit.
(define_insn_reservation "k6_alu_imov_both" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imov")
            (eq_attr "memory" "both,unknown")))
  "k6_decode_long,k6_load,k6_alux|k6_aluy")
;; Calls (types "call" and "callv"): latency 1; vector decode, then one
;; cycle on the branch unit.
(define_insn_reservation "k6_branch_call" 1
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "call,callv"))
  "k6_decode_vector,k6_branch")
;; Branches (type "ibr"): latency 1; a short decoder slot, then one cycle
;; on the branch unit.
(define_insn_reservation "k6_branch_branch" 1
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "ibr"))
  "k6_decode_short,k6_branch")
;; The load and store units have two pipeline stages.  The load latency is
;; two cycles.
;; Matches type "pop", and also any insn whose memory attribute is "load"
;; or "both" regardless of its type: latency 3; a short decoder slot, then
;; one cycle on the load unit.
(define_insn_reservation "k6_load_pop" 3
  (and (eq_attr "cpu" "k6")
       (ior (eq_attr "type" "pop")
            (eq_attr "memory" "load,both")))
  "k6_decode_short,k6_load")
;; Type "leave": latency 5; long decoder, one cycle on the load unit, then
;; two consecutive cycles on an integer unit (X or Y).
(define_insn_reservation "k6_load_leave" 5
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "leave"))
  "k6_decode_long,k6_load,(k6_alux|k6_aluy)*2")
207 ;; ??? From the old pipeline description. Egad!
208 ;; ??? Apparently we take care of this reservation in adjust_cost.
;; Type "str" with memory attribute "load" or "both": latency 10; vector
;; decode, then the load unit is held for ten cycles.
(define_insn_reservation "k6_load_str" 10
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "str")
            (eq_attr "memory" "load,both")))
  "k6_decode_vector,k6_load*10")
215 ;; The store unit handles lea and push. It is otherwise unmodelled.
;; Type "lea": latency 2; a short decoder slot, one cycle on the store
;; unit, then one cycle on either integer unit.
(define_insn_reservation "k6_store_lea" 2
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "lea"))
  "k6_decode_short,k6_store,k6_alux|k6_aluy")
;; Matches type "push", and also any insn whose memory attribute is "store"
;; or "both" regardless of its type: latency 2; a short decoder slot, then
;; one cycle on the store unit.
(define_insn_reservation "k6_store_push" 2
  (and (eq_attr "cpu" "k6")
       (ior (eq_attr "type" "push")
            (eq_attr "memory" "store,both")))
  "k6_decode_short,k6_store")
;; Type "str" (any memory attribute): latency 10, with the store unit held
;; for ten cycles.
;; NOTE(review): the reservation string and closing parenthesis were missing
;; from this definition.  "k6_store*10" is a reconstruction that holds the
;; store unit for the full latency, as k6_load_str does for the load unit;
;; no decoder stage is reserved here -- confirm against the upstream
;; description.
(define_insn_reservation "k6_store_str" 10
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "str"))
  "k6_store*10")
232 ;; Most FPU instructions have latency 2 and throughput 2.
;; Register-only FP ops (types fop, fmov, fcmp, fistp): latency 2; vector
;; decode, then the FPU is held for two cycles.
(define_insn_reservation "k6_fpu" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "fop,fmov,fcmp,fistp")
            (eq_attr "memory" "none")))
  "k6_decode_vector,k6_fpu*2")
;; Same FP types with memory attribute "load" or "both": latency 6; a short
;; decoder slot, one cycle on the load unit, then two cycles on the FPU.
(define_insn_reservation "k6_fpu_load" 6
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "fop,fmov,fcmp,fistp")
            (eq_attr "memory" "load,both")))
  "k6_decode_short,k6_load,k6_fpu*2")
;; Same FP types with memory attribute "store": latency 6; a short decoder
;; slot, one cycle on the store unit, then two cycles on the FPU.
(define_insn_reservation "k6_fpu_store" 6
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "fop,fmov,fcmp,fistp")
            (eq_attr "memory" "store")))
  "k6_decode_short,k6_store,k6_fpu*2")
;; Register-only FP multiply: latency 2; a short decoder slot, then the FPU
;; is held for two cycles.
(define_insn_reservation "k6_fpu_fmul" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "fmul")
            (eq_attr "memory" "none")))
  "k6_decode_short,k6_fpu*2")
;; FP multiply with memory attribute "load" or "both": latency 2; a short
;; decoder slot, one cycle on the load unit, then two cycles on the FPU.
;; NOTE(review): the latency equals that of the register-only k6_fpu_fmul
;; even though a load stage is reserved first -- confirm this is intended.
(define_insn_reservation "k6_fpu_fmul_load" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "fmul")
            (eq_attr "memory" "load,both")))
  "k6_decode_short,k6_load,k6_fpu*2")
263 ;; ??? Guessed latencies from the old pipeline description.
;; Types "fdiv" and "fpspc" (any memory attribute): latency 56; a short
;; decoder slot, then the FPU is held for all 56 cycles.
(define_insn_reservation "k6_fpu_expensive" 56
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "fdiv,fpspc"))
  "k6_decode_short,k6_fpu*56")