Import GCC-8 to a new vendor branch
[dragonfly.git] / contrib / gcc-8.0 / gcc / config / i386 / x86-tune-costs.h
blob8409a5f166ca24f386c9a040c4ea1c4808aebd18
1 /* Costs of operations of individual x86 CPUs.
2 Copyright (C) 1988-2018 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 Under Section 7 of GPL version 3, you are granted additional
17 permissions described in the GCC Runtime Library Exception, version
18 3.1, as published by the Free Software Foundation.
20 You should have received a copy of the GNU General Public License and
21 a copy of the GCC Runtime Library Exception along with this program;
22 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 <http://www.gnu.org/licenses/>. */
24 /* Processor costs (relative to an add) */
25 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
/* Size costs are counted in bytes: with COSTS_N_INSNS (N) assumed to be
   (N)*4 and an add taking 2 bytes, COSTS_N_BYTES (2) equals
   COSTS_N_INSNS (1).  */
26 #define COSTS_N_BYTES(N) ((N) * 2)
/* Placeholder string-operation strategy naming libcall throughout; used
   below as the second element of most per-CPU memcpy/memset tables.  */
28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
/* String-operation strategy tables referenced by ix86_size_cost.  Every
   entry names rep_prefix_1_byte, both as the leading algorithm and in the
   single {-1, ...} size entry.
   NOTE(review): presumably element 0 applies to 32-bit and element 1 to
   64-bit code, and -1 means "no upper size bound" -- confirm against the
   stringop_algs declaration.  */
30 static stringop_algs ix86_size_memcpy[2] = {
31 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
32 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
33 static stringop_algs ix86_size_memset[2] = {
34 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
35 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
/* Cost table used when tuning for size.  Unlike the per-CPU tables, the
   instruction costs here are expressed with COSTS_N_BYTES rather than
   COSTS_N_INSNS.  String operations use ix86_size_memcpy and
   ix86_size_memset.  */
37 const
38 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
39 COSTS_N_BYTES (2), /* cost of an add instruction */
40 COSTS_N_BYTES (3), /* cost of a lea instruction */
41 COSTS_N_BYTES (2), /* variable shift costs */
42 COSTS_N_BYTES (3), /* constant shift costs */
43 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
44 COSTS_N_BYTES (3), /* HI */
45 COSTS_N_BYTES (3), /* SI */
46 COSTS_N_BYTES (3), /* DI */
47 COSTS_N_BYTES (5)}, /* other */
48 0, /* cost of multiply per each bit set */
49 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
50 COSTS_N_BYTES (3), /* HI */
51 COSTS_N_BYTES (3), /* SI */
52 COSTS_N_BYTES (3), /* DI */
53 COSTS_N_BYTES (5)}, /* other */
54 COSTS_N_BYTES (3), /* cost of movsx */
55 COSTS_N_BYTES (3), /* cost of movzx */
56 0, /* "large" insn */
57 2, /* MOVE_RATIO */
59 /* All move costs are relative to integer->integer move times 2. */
60 2, /* cost for loading QImode using movzbl */
61 {2, 2, 2}, /* cost of loading integer registers
62 in QImode, HImode and SImode.
63 Relative to reg-reg move (2). */
64 {2, 2, 2}, /* cost of storing integer registers */
65 2, /* cost of reg,reg fld/fst */
66 {2, 2, 2}, /* cost of loading fp registers
67 in SFmode, DFmode and XFmode */
68 {2, 2, 2}, /* cost of storing fp registers
69 in SFmode, DFmode and XFmode */
70 3, /* cost of moving MMX register */
71 {3, 3}, /* cost of loading MMX registers
72 in SImode and DImode */
73 {3, 3}, /* cost of storing MMX registers
74 in SImode and DImode */
75 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
76 {3, 3, 3, 3, 3}, /* cost of loading SSE registers
77 in 32,64,128,256 and 512-bit */
78 {3, 3, 3, 3, 3}, /* cost of unaligned SSE load; five entries,
79 presumably in 32,64,128,256 and 512-bit like the
   aligned load table above -- confirm */
80 {3, 3, 3, 3, 3}, /* cost of storing SSE registers
81 in 32,64,128,256 and 512-bit */
82 {3, 3, 3, 3, 3}, /* cost of unaligned SSE store; five entries,
83 presumably in 32,64,128,256 and 512-bit like the
   aligned store table above -- confirm */
84 3, 3, /* SSE->integer and integer->SSE moves */
85 5, 0, /* Gather load static, per_elt. */
86 5, 0, /* Gather store static, per_elt. */
87 0, /* size of l1 cache */
88 0, /* size of l2 cache */
89 0, /* size of prefetch block */
90 0, /* number of parallel prefetches */
91 2, /* Branch cost */
92 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
93 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
94 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
95 COSTS_N_BYTES (2), /* cost of FABS instruction. */
96 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
97 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
99 COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */
100 COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */
101 COSTS_N_BYTES (2), /* cost of MULSS instruction. */
102 COSTS_N_BYTES (2), /* cost of MULSD instruction. */
103 COSTS_N_BYTES (2), /* cost of FMA SS instruction. */
104 COSTS_N_BYTES (2), /* cost of FMA SD instruction. */
105 COSTS_N_BYTES (2), /* cost of DIVSS instruction. */
106 COSTS_N_BYTES (2), /* cost of DIVSD instruction. */
107 COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */
108 COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */
109 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
110 ix86_size_memcpy,
111 ix86_size_memset,
112 COSTS_N_BYTES (1), /* cond_taken_branch_cost. */
113 COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */
116 /* Processor costs (relative to an add) */
/* String-operation strategy tables referenced by i386_cost.  The first
   element of each names rep_prefix_1_byte throughout; the second is the
   DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): presumably element 0 applies to 32-bit and element 1 to
   64-bit code -- confirm against the stringop_algs declaration.  */
117 static stringop_algs i386_memcpy[2] = {
118 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
119 DUMMY_STRINGOP_ALGS};
120 static stringop_algs i386_memset[2] = {
121 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
122 DUMMY_STRINGOP_ALGS};
/* Cost table for the 386.  Instruction costs are given via COSTS_N_INSNS;
   the move costs further down are plain integers relative to a reg-reg
   move of 2.  String operations use i386_memcpy and i386_memset.  */
124 static const
125 struct processor_costs i386_cost = { /* 386 specific costs */
126 COSTS_N_INSNS (1), /* cost of an add instruction */
127 COSTS_N_INSNS (1), /* cost of a lea instruction */
128 COSTS_N_INSNS (3), /* variable shift costs */
129 COSTS_N_INSNS (2), /* constant shift costs */
130 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
131 COSTS_N_INSNS (6), /* HI */
132 COSTS_N_INSNS (6), /* SI */
133 COSTS_N_INSNS (6), /* DI */
134 COSTS_N_INSNS (6)}, /* other */
135 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
136 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
137 COSTS_N_INSNS (23), /* HI */
138 COSTS_N_INSNS (23), /* SI */
139 COSTS_N_INSNS (23), /* DI */
140 COSTS_N_INSNS (23)}, /* other */
141 COSTS_N_INSNS (3), /* cost of movsx */
142 COSTS_N_INSNS (2), /* cost of movzx */
143 15, /* "large" insn */
144 3, /* MOVE_RATIO */
146 /* All move costs are relative to integer->integer move times 2 and thus
147 they are latency*2. */
148 4, /* cost for loading QImode using movzbl */
149 {2, 4, 2}, /* cost of loading integer registers
150 in QImode, HImode and SImode.
151 Relative to reg-reg move (2). */
152 {2, 4, 2}, /* cost of storing integer registers */
153 2, /* cost of reg,reg fld/fst */
154 {8, 8, 8}, /* cost of loading fp registers
155 in SFmode, DFmode and XFmode */
156 {8, 8, 8}, /* cost of storing fp registers
157 in SFmode, DFmode and XFmode */
158 2, /* cost of moving MMX register */
159 {4, 8}, /* cost of loading MMX registers
160 in SImode and DImode */
161 {4, 8}, /* cost of storing MMX registers
162 in SImode and DImode */
163 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
164 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
165 in 32,64,128,256 and 512-bit */
166 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
167 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
168 in 32,64,128,256 and 512-bit */
169 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
170 3, 3, /* SSE->integer and integer->SSE moves */
171 4, 4, /* Gather load static, per_elt. */
172 4, 4, /* Gather store static, per_elt. */
173 0, /* size of l1 cache */
174 0, /* size of l2 cache */
175 0, /* size of prefetch block */
176 0, /* number of parallel prefetches */
177 1, /* Branch cost */
178 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
179 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
180 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
181 COSTS_N_INSNS (22), /* cost of FABS instruction. */
182 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
183 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
185 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
186 COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */
187 COSTS_N_INSNS (27), /* cost of MULSS instruction. */
188 COSTS_N_INSNS (27), /* cost of MULSD instruction. */
189 COSTS_N_INSNS (27), /* cost of FMA SS instruction. */
190 COSTS_N_INSNS (27), /* cost of FMA SD instruction. */
191 COSTS_N_INSNS (88), /* cost of DIVSS instruction. */
192 COSTS_N_INSNS (88), /* cost of DIVSD instruction. */
193 COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */
194 COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */
195 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
196 i386_memcpy,
197 i386_memset,
198 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
199 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* String-operation strategy tables referenced by i486_cost.  The first
   element of each names rep_prefix_4_byte throughout; the second is the
   DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): presumably element 0 applies to 32-bit and element 1 to
   64-bit code -- confirm against the stringop_algs declaration.  */
202 static stringop_algs i486_memcpy[2] = {
203 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
204 DUMMY_STRINGOP_ALGS};
205 static stringop_algs i486_memset[2] = {
206 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
207 DUMMY_STRINGOP_ALGS};
/* Cost table for the 486.  Instruction costs are given via COSTS_N_INSNS;
   the move costs further down are plain integers relative to a reg-reg
   move of 2.  String operations use i486_memcpy and i486_memset.  */
209 static const
210 struct processor_costs i486_cost = { /* 486 specific costs */
211 COSTS_N_INSNS (1), /* cost of an add instruction */
212 COSTS_N_INSNS (1), /* cost of a lea instruction */
213 COSTS_N_INSNS (3), /* variable shift costs */
214 COSTS_N_INSNS (2), /* constant shift costs */
215 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
216 COSTS_N_INSNS (12), /* HI */
217 COSTS_N_INSNS (12), /* SI */
218 COSTS_N_INSNS (12), /* DI */
219 COSTS_N_INSNS (12)}, /* other */
220 1, /* cost of multiply per each bit set.
   NOTE(review): a raw 1 here, whereas i386_cost uses
   COSTS_N_INSNS (1) for this field -- confirm this
   is intended. */
221 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
222 COSTS_N_INSNS (40), /* HI */
223 COSTS_N_INSNS (40), /* SI */
224 COSTS_N_INSNS (40), /* DI */
225 COSTS_N_INSNS (40)}, /* other */
226 COSTS_N_INSNS (3), /* cost of movsx */
227 COSTS_N_INSNS (2), /* cost of movzx */
228 15, /* "large" insn */
229 3, /* MOVE_RATIO */
231 /* All move costs are relative to integer->integer move times 2 and thus
232 they are latency*2. */
233 4, /* cost for loading QImode using movzbl */
234 {2, 4, 2}, /* cost of loading integer registers
235 in QImode, HImode and SImode.
236 Relative to reg-reg move (2). */
237 {2, 4, 2}, /* cost of storing integer registers */
238 2, /* cost of reg,reg fld/fst */
239 {8, 8, 8}, /* cost of loading fp registers
240 in SFmode, DFmode and XFmode */
241 {8, 8, 8}, /* cost of storing fp registers
242 in SFmode, DFmode and XFmode */
243 2, /* cost of moving MMX register */
244 {4, 8}, /* cost of loading MMX registers
245 in SImode and DImode */
246 {4, 8}, /* cost of storing MMX registers
247 in SImode and DImode */
248 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
249 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
250 in 32,64,128,256 and 512-bit */
251 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
252 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
253 in 32,64,128,256 and 512-bit */
254 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
255 3, 3, /* SSE->integer and integer->SSE moves */
256 4, 4, /* Gather load static, per_elt. */
257 4, 4, /* Gather store static, per_elt. */
258 4, /* size of l1 cache. 486 has 8kB cache
259 shared for code and data, so 4kB is
260 not really precise. */
261 4, /* size of l2 cache */
262 0, /* size of prefetch block */
263 0, /* number of parallel prefetches */
264 1, /* Branch cost */
265 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
266 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
267 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
268 COSTS_N_INSNS (3), /* cost of FABS instruction. */
269 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
270 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
272 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
273 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
274 COSTS_N_INSNS (16), /* cost of MULSS instruction. */
275 COSTS_N_INSNS (16), /* cost of MULSD instruction. */
276 COSTS_N_INSNS (16), /* cost of FMA SS instruction. */
277 COSTS_N_INSNS (16), /* cost of FMA SD instruction. */
278 COSTS_N_INSNS (73), /* cost of DIVSS instruction. */
279 COSTS_N_INSNS (74), /* cost of DIVSD instruction. */
280 COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */
281 COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */
282 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
283 i486_memcpy,
284 i486_memset,
285 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
286 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* String-operation strategy tables referenced by pentium_cost and
   lakemont_cost.  memcpy lists a {256, rep_prefix_4_byte} entry followed
   by {-1, libcall}; memset lists a single {-1, rep_prefix_4_byte} entry.
   Both lead with libcall and use the DUMMY_STRINGOP_ALGS placeholder as
   their second element.
   NOTE(review): presumably the first number of each entry is a maximum
   block size in bytes with -1 meaning unbounded -- confirm against the
   stringop_algs declaration.  */
289 static stringop_algs pentium_memcpy[2] = {
290 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
291 DUMMY_STRINGOP_ALGS};
292 static stringop_algs pentium_memset[2] = {
293 {libcall, {{-1, rep_prefix_4_byte, false}}},
294 DUMMY_STRINGOP_ALGS};
/* Cost table for the Pentium.  Instruction costs are given via
   COSTS_N_INSNS; the move costs further down are plain integers relative
   to a reg-reg move of 2.  String operations use pentium_memcpy and
   pentium_memset.  */
296 static const
297 struct processor_costs pentium_cost = {
298 COSTS_N_INSNS (1), /* cost of an add instruction */
299 COSTS_N_INSNS (1), /* cost of a lea instruction */
300 COSTS_N_INSNS (4), /* variable shift costs */
301 COSTS_N_INSNS (1), /* constant shift costs */
302 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
303 COSTS_N_INSNS (11), /* HI */
304 COSTS_N_INSNS (11), /* SI */
305 COSTS_N_INSNS (11), /* DI */
306 COSTS_N_INSNS (11)}, /* other */
307 0, /* cost of multiply per each bit set */
308 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
309 COSTS_N_INSNS (25), /* HI */
310 COSTS_N_INSNS (25), /* SI */
311 COSTS_N_INSNS (25), /* DI */
312 COSTS_N_INSNS (25)}, /* other */
313 COSTS_N_INSNS (3), /* cost of movsx */
314 COSTS_N_INSNS (2), /* cost of movzx */
315 8, /* "large" insn */
316 6, /* MOVE_RATIO */
318 /* All move costs are relative to integer->integer move times 2 and thus
319 they are latency*2. */
320 6, /* cost for loading QImode using movzbl */
321 {2, 4, 2}, /* cost of loading integer registers
322 in QImode, HImode and SImode.
323 Relative to reg-reg move (2). */
324 {2, 4, 2}, /* cost of storing integer registers */
325 2, /* cost of reg,reg fld/fst */
326 {2, 2, 6}, /* cost of loading fp registers
327 in SFmode, DFmode and XFmode */
328 {4, 4, 6}, /* cost of storing fp registers
329 in SFmode, DFmode and XFmode */
330 8, /* cost of moving MMX register */
331 {8, 8}, /* cost of loading MMX registers
332 in SImode and DImode */
333 {8, 8}, /* cost of storing MMX registers
334 in SImode and DImode */
335 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
336 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
337 in 32,64,128,256 and 512-bit */
338 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
339 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
340 in 32,64,128,256 and 512-bit */
341 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
342 3, 3, /* SSE->integer and integer->SSE moves */
343 4, 4, /* Gather load static, per_elt. */
344 4, 4, /* Gather store static, per_elt. */
345 8, /* size of l1 cache. */
346 8, /* size of l2 cache */
347 0, /* size of prefetch block */
348 0, /* number of parallel prefetches */
349 2, /* Branch cost */
350 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
351 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
352 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
353 COSTS_N_INSNS (1), /* cost of FABS instruction. */
354 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
355 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
357 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
358 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
359 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
360 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
361 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
362 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
363 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
364 COSTS_N_INSNS (39), /* cost of DIVSD instruction. */
365 COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */
366 COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */
367 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
368 pentium_memcpy,
369 pentium_memset,
370 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
371 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* Cost table for Lakemont.  Instruction costs are given via
   COSTS_N_INSNS; the move costs further down are plain integers relative
   to a reg-reg move of 2.  There are no Lakemont-specific string-operation
   tables: this table points at pentium_memcpy and pentium_memset.  */
374 static const
375 struct processor_costs lakemont_cost = {
376 COSTS_N_INSNS (1), /* cost of an add instruction */
377 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction; one unit more
   than the add cost above */
378 COSTS_N_INSNS (1), /* variable shift costs */
379 COSTS_N_INSNS (1), /* constant shift costs */
380 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
381 COSTS_N_INSNS (11), /* HI */
382 COSTS_N_INSNS (11), /* SI */
383 COSTS_N_INSNS (11), /* DI */
384 COSTS_N_INSNS (11)}, /* other */
385 0, /* cost of multiply per each bit set */
386 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
387 COSTS_N_INSNS (25), /* HI */
388 COSTS_N_INSNS (25), /* SI */
389 COSTS_N_INSNS (25), /* DI */
390 COSTS_N_INSNS (25)}, /* other */
391 COSTS_N_INSNS (3), /* cost of movsx */
392 COSTS_N_INSNS (2), /* cost of movzx */
393 8, /* "large" insn */
394 17, /* MOVE_RATIO */
396 /* All move costs are relative to integer->integer move times 2 and thus
397 they are latency*2. */
398 6, /* cost for loading QImode using movzbl */
399 {2, 4, 2}, /* cost of loading integer registers
400 in QImode, HImode and SImode.
401 Relative to reg-reg move (2). */
402 {2, 4, 2}, /* cost of storing integer registers */
403 2, /* cost of reg,reg fld/fst */
404 {2, 2, 6}, /* cost of loading fp registers
405 in SFmode, DFmode and XFmode */
406 {4, 4, 6}, /* cost of storing fp registers
407 in SFmode, DFmode and XFmode */
408 8, /* cost of moving MMX register */
409 {8, 8}, /* cost of loading MMX registers
410 in SImode and DImode */
411 {8, 8}, /* cost of storing MMX registers
412 in SImode and DImode */
413 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
414 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
415 in 32,64,128,256 and 512-bit */
416 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
417 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
418 in 32,64,128,256 and 512-bit */
419 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
420 3, 3, /* SSE->integer and integer->SSE moves */
421 4, 4, /* Gather load static, per_elt. */
422 4, 4, /* Gather store static, per_elt. */
423 8, /* size of l1 cache. */
424 8, /* size of l2 cache */
425 0, /* size of prefetch block */
426 0, /* number of parallel prefetches */
427 2, /* Branch cost */
428 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
429 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
430 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
431 COSTS_N_INSNS (1), /* cost of FABS instruction. */
432 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
433 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
435 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
436 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
437 COSTS_N_INSNS (5), /* cost of MULSS instruction. */
438 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
439 COSTS_N_INSNS (10), /* cost of FMA SS instruction. */
440 COSTS_N_INSNS (10), /* cost of FMA SD instruction. */
441 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
442 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
443 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
444 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
445 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
446 pentium_memcpy,
447 pentium_memset,
448 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
449 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
452 /* PentiumPro has optimized rep instructions for blocks aligned to 8 bytes
453 (we ensure the alignment). For small blocks an inline loop is still a
454 noticeable win; for bigger blocks either rep movsl or rep movsb is the
455 way to go. Rep movsb apparently has a more expensive startup time in the CPU,
456 but after 4K the difference is down in the noise. */
/* Both tables are referenced by pentiumpro_cost and use the
   DUMMY_STRINGOP_ALGS placeholder as their second element.  */
457 static stringop_algs pentiumpro_memcpy[2] = {
458 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
459 {8192, rep_prefix_4_byte, false},
460 {-1, rep_prefix_1_byte, false}}},
461 DUMMY_STRINGOP_ALGS};
462 static stringop_algs pentiumpro_memset[2] = {
463 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
464 {8192, rep_prefix_4_byte, false},
465 {-1, libcall, false}}},
466 DUMMY_STRINGOP_ALGS};
/* Cost table for the PentiumPro.  Instruction costs are given via
   COSTS_N_INSNS; the move costs further down are plain integers relative
   to a reg-reg move of 2.  String operations use pentiumpro_memcpy and
   pentiumpro_memset.  */
467 static const
468 struct processor_costs pentiumpro_cost = {
469 COSTS_N_INSNS (1), /* cost of an add instruction */
470 COSTS_N_INSNS (1), /* cost of a lea instruction */
471 COSTS_N_INSNS (1), /* variable shift costs */
472 COSTS_N_INSNS (1), /* constant shift costs */
473 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
474 COSTS_N_INSNS (4), /* HI */
475 COSTS_N_INSNS (4), /* SI */
476 COSTS_N_INSNS (4), /* DI */
477 COSTS_N_INSNS (4)}, /* other */
478 0, /* cost of multiply per each bit set */
479 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
480 COSTS_N_INSNS (17), /* HI */
481 COSTS_N_INSNS (17), /* SI */
482 COSTS_N_INSNS (17), /* DI */
483 COSTS_N_INSNS (17)}, /* other */
484 COSTS_N_INSNS (1), /* cost of movsx */
485 COSTS_N_INSNS (1), /* cost of movzx */
486 8, /* "large" insn */
487 6, /* MOVE_RATIO */
489 /* All move costs are relative to integer->integer move times 2 and thus
490 they are latency*2. */
491 2, /* cost for loading QImode using movzbl */
492 {4, 4, 4}, /* cost of loading integer registers
493 in QImode, HImode and SImode.
494 Relative to reg-reg move (2). */
495 {2, 2, 2}, /* cost of storing integer registers */
496 2, /* cost of reg,reg fld/fst */
497 {2, 2, 6}, /* cost of loading fp registers
498 in SFmode, DFmode and XFmode */
499 {4, 4, 6}, /* cost of storing fp registers
500 in SFmode, DFmode and XFmode */
501 2, /* cost of moving MMX register */
502 {2, 2}, /* cost of loading MMX registers
503 in SImode and DImode */
504 {2, 2}, /* cost of storing MMX registers
505 in SImode and DImode */
506 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
507 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
508 in 32,64,128,256 and 512-bit */
509 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
510 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
511 in 32,64,128,256 and 512-bit */
512 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
513 3, 3, /* SSE->integer and integer->SSE moves */
514 4, 4, /* Gather load static, per_elt. */
515 4, 4, /* Gather store static, per_elt. */
516 8, /* size of l1 cache. */
517 256, /* size of l2 cache */
518 32, /* size of prefetch block */
519 6, /* number of parallel prefetches */
520 2, /* Branch cost */
521 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
522 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
523 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
524 COSTS_N_INSNS (2), /* cost of FABS instruction. */
525 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
526 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
528 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
529 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
530 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
531 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
532 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
533 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
534 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
535 COSTS_N_INSNS (18), /* cost of DIVSD instruction. */
536 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
537 COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */
538 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
539 pentiumpro_memcpy,
540 pentiumpro_memset,
541 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
542 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* String-operation strategy tables referenced by geode_cost.  The two
   tables are identical: they lead with libcall, list a
   {256, rep_prefix_4_byte} entry followed by {-1, libcall}, and use the
   DUMMY_STRINGOP_ALGS placeholder as their second element.
   NOTE(review): presumably the first number of each entry is a maximum
   block size in bytes with -1 meaning unbounded -- confirm against the
   stringop_algs declaration.  */
545 static stringop_algs geode_memcpy[2] = {
546 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
547 DUMMY_STRINGOP_ALGS};
548 static stringop_algs geode_memset[2] = {
549 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
550 DUMMY_STRINGOP_ALGS};
/* Cost table for the Geode.  Instruction costs are given via
   COSTS_N_INSNS; the move costs further down are plain integers relative
   to a reg-reg move of 2.  String operations use geode_memcpy and
   geode_memset.  */
551 static const
552 struct processor_costs geode_cost = {
553 COSTS_N_INSNS (1), /* cost of an add instruction */
554 COSTS_N_INSNS (1), /* cost of a lea instruction */
555 COSTS_N_INSNS (2), /* variable shift costs */
556 COSTS_N_INSNS (1), /* constant shift costs */
557 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
558 COSTS_N_INSNS (4), /* HI */
559 COSTS_N_INSNS (7), /* SI */
560 COSTS_N_INSNS (7), /* DI */
561 COSTS_N_INSNS (7)}, /* other */
562 0, /* cost of multiply per each bit set */
563 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
564 COSTS_N_INSNS (23), /* HI */
565 COSTS_N_INSNS (39), /* SI */
566 COSTS_N_INSNS (39), /* DI */
567 COSTS_N_INSNS (39)}, /* other */
568 COSTS_N_INSNS (1), /* cost of movsx */
569 COSTS_N_INSNS (1), /* cost of movzx */
570 8, /* "large" insn */
571 4, /* MOVE_RATIO */
573 /* All move costs are relative to integer->integer move times 2 and thus
574 they are latency*2. */
575 2, /* cost for loading QImode using movzbl */
576 {2, 2, 2}, /* cost of loading integer registers
577 in QImode, HImode and SImode.
578 Relative to reg-reg move (2). */
579 {2, 2, 2}, /* cost of storing integer registers */
580 2, /* cost of reg,reg fld/fst */
581 {2, 2, 2}, /* cost of loading fp registers
582 in SFmode, DFmode and XFmode */
583 {4, 6, 6}, /* cost of storing fp registers
584 in SFmode, DFmode and XFmode */
586 2, /* cost of moving MMX register */
587 {2, 2}, /* cost of loading MMX registers
588 in SImode and DImode */
589 {2, 2}, /* cost of storing MMX registers
590 in SImode and DImode */
591 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
592 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
593 in 32,64,128,256 and 512-bit */
594 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
595 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
596 in 32,64,128,256 and 512-bit */
597 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
598 6, 6, /* SSE->integer and integer->SSE moves */
599 2, 2, /* Gather load static, per_elt. */
600 2, 2, /* Gather store static, per_elt. */
601 64, /* size of l1 cache. */
602 128, /* size of l2 cache. */
603 32, /* size of prefetch block */
604 1, /* number of parallel prefetches */
605 1, /* Branch cost */
606 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
607 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
608 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
609 COSTS_N_INSNS (1), /* cost of FABS instruction. */
610 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
611 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
613 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
614 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
615 COSTS_N_INSNS (11), /* cost of MULSS instruction. */
616 COSTS_N_INSNS (11), /* cost of MULSD instruction. */
617 COSTS_N_INSNS (17), /* cost of FMA SS instruction. */
618 COSTS_N_INSNS (17), /* cost of FMA SD instruction. */
619 COSTS_N_INSNS (47), /* cost of DIVSS instruction. */
620 COSTS_N_INSNS (47), /* cost of DIVSD instruction. */
621 COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */
622 COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */
623 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
624 geode_memcpy,
625 geode_memset,
626 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
627 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* String-operation strategy tables referenced by k6_cost.  The two tables
   are identical: they lead with libcall, list a {256, rep_prefix_4_byte}
   entry followed by {-1, libcall}, and use the DUMMY_STRINGOP_ALGS
   placeholder as their second element.
   NOTE(review): presumably the first number of each entry is a maximum
   block size in bytes with -1 meaning unbounded -- confirm against the
   stringop_algs declaration.  */
630 static stringop_algs k6_memcpy[2] = {
631 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
632 DUMMY_STRINGOP_ALGS};
633 static stringop_algs k6_memset[2] = {
634 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
635 DUMMY_STRINGOP_ALGS};
/* Cost table for the K6.  Instruction costs are given via COSTS_N_INSNS;
   the move costs further down are plain integers relative to a reg-reg
   move of 2.  String operations use k6_memcpy and k6_memset.  */
636 static const
637 struct processor_costs k6_cost = {
638 COSTS_N_INSNS (1), /* cost of an add instruction */
639 COSTS_N_INSNS (2), /* cost of a lea instruction */
640 COSTS_N_INSNS (1), /* variable shift costs */
641 COSTS_N_INSNS (1), /* constant shift costs */
642 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
643 COSTS_N_INSNS (3), /* HI */
644 COSTS_N_INSNS (3), /* SI */
645 COSTS_N_INSNS (3), /* DI */
646 COSTS_N_INSNS (3)}, /* other */
647 0, /* cost of multiply per each bit set */
648 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
649 COSTS_N_INSNS (18), /* HI */
650 COSTS_N_INSNS (18), /* SI */
651 COSTS_N_INSNS (18), /* DI */
652 COSTS_N_INSNS (18)}, /* other */
653 COSTS_N_INSNS (2), /* cost of movsx */
654 COSTS_N_INSNS (2), /* cost of movzx */
655 8, /* "large" insn */
656 4, /* MOVE_RATIO */
658 /* All move costs are relative to integer->integer move times 2 and thus
659 they are latency*2. */
660 3, /* cost for loading QImode using movzbl */
661 {4, 5, 4}, /* cost of loading integer registers
662 in QImode, HImode and SImode.
663 Relative to reg-reg move (2). */
664 {2, 3, 2}, /* cost of storing integer registers */
665 4, /* cost of reg,reg fld/fst */
666 {6, 6, 6}, /* cost of loading fp registers
667 in SFmode, DFmode and XFmode */
668 {4, 4, 4}, /* cost of storing fp registers
669 in SFmode, DFmode and XFmode */
670 2, /* cost of moving MMX register */
671 {2, 2}, /* cost of loading MMX registers
672 in SImode and DImode */
673 {2, 2}, /* cost of storing MMX registers
674 in SImode and DImode */
675 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
676 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
677 in 32,64,128,256 and 512-bit */
678 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
679 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
680 in 32,64,128,256 and 512-bit */
681 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
682 6, 6, /* SSE->integer and integer->SSE moves */
683 2, 2, /* Gather load static, per_elt. */
684 2, 2, /* Gather store static, per_elt. */
685 32, /* size of l1 cache. */
686 32, /* size of l2 cache. Some models
687 have integrated l2 cache, but
688 optimizing for k6 is not important
689 enough to worry about that. */
690 32, /* size of prefetch block */
691 1, /* number of parallel prefetches */
692 1, /* Branch cost */
693 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
694 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
695 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
696 COSTS_N_INSNS (2), /* cost of FABS instruction. */
697 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
698 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
700 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
701 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
702 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
703 COSTS_N_INSNS (2), /* cost of MULSD instruction. */
704 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
705 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
706 COSTS_N_INSNS (56), /* cost of DIVSS instruction. */
707 COSTS_N_INSNS (56), /* cost of DIVSD instruction. */
708 COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */
709 COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */
710 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
711 k6_memcpy,
712 k6_memset,
713 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
714 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
717 /* For some reason, Athlon deals better with REP prefix (relative to loops)
718 compared to K8. Alignment becomes important after 8 bytes for memcpy and
719 128 bytes for memset. */
/* The two tables below are identical: they lead with libcall, list a
   {2048, rep_prefix_4_byte} entry followed by {-1, libcall}, and use the
   DUMMY_STRINGOP_ALGS placeholder as their second element.
   NOTE(review): presumably the first number of each entry is a maximum
   block size in bytes with -1 meaning unbounded -- confirm against the
   stringop_algs declaration.  */
720 static stringop_algs athlon_memcpy[2] = {
721 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
722 DUMMY_STRINGOP_ALGS};
723 static stringop_algs athlon_memset[2] = {
724 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
725 DUMMY_STRINGOP_ALGS};
/* Operation cost table named athlon_cost.  The values are positional
   initializers, so their order must match the field order of struct
   processor_costs (declared elsewhere); the trailing comment on each line
   names the quantity it fills in.  Per the file header, costs are relative
   to an add.  The table ends by pointing at athlon_memcpy and
   athlon_memset.  */
726 static const
727 struct processor_costs athlon_cost = {
728 COSTS_N_INSNS (1), /* cost of an add instruction */
729 COSTS_N_INSNS (2), /* cost of a lea instruction */
730 COSTS_N_INSNS (1), /* variable shift costs */
731 COSTS_N_INSNS (1), /* constant shift costs */
732 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
733 COSTS_N_INSNS (5), /* HI */
734 COSTS_N_INSNS (5), /* SI */
735 COSTS_N_INSNS (5), /* DI */
736 COSTS_N_INSNS (5)}, /* other */
737 0, /* cost of multiply per each bit set */
738 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
739 COSTS_N_INSNS (26), /* HI */
740 COSTS_N_INSNS (42), /* SI */
741 COSTS_N_INSNS (74), /* DI */
742 COSTS_N_INSNS (74)}, /* other */
743 COSTS_N_INSNS (1), /* cost of movsx */
744 COSTS_N_INSNS (1), /* cost of movzx */
745 8, /* "large" insn */
746 9, /* MOVE_RATIO */
748 /* All move costs are relative to integer->integer move times 2 and thus
749 they are latency*2. */
750 4, /* cost for loading QImode using movzbl */
751 {3, 4, 3}, /* cost of loading integer registers
752 in QImode, HImode and SImode.
753 Relative to reg-reg move (2). */
754 {3, 4, 3}, /* cost of storing integer registers */
755 4, /* cost of reg,reg fld/fst */
756 {4, 4, 12}, /* cost of loading fp registers
757 in SFmode, DFmode and XFmode */
758 {6, 6, 8}, /* cost of storing fp registers
759 in SFmode, DFmode and XFmode */
760 2, /* cost of moving MMX register */
761 {4, 4}, /* cost of loading MMX registers
762 in SImode and DImode */
763 {4, 4}, /* cost of storing MMX registers
764 in SImode and DImode */
765 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
766 {4, 4, 6, 12, 24}, /* cost of loading SSE registers
767 in 32,64,128,256 and 512-bit */
768 {4, 4, 6, 12, 24}, /* cost of unaligned loads. */
769 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
770 in 32,64,128,256 and 512-bit */
771 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
772 5, 5, /* SSE->integer and integer->SSE moves */
773 4, 4, /* Gather load static, per_elt. */
774 4, 4, /* Gather store static, per_elt. */
775 64, /* size of l1 cache. */
776 256, /* size of l2 cache. */
777 64, /* size of prefetch block */
778 6, /* number of parallel prefetches */
779 5, /* Branch cost */
780 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
781 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
782 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
783 COSTS_N_INSNS (2), /* cost of FABS instruction. */
784 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
785 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
787 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
788 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
789 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
790 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
791 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
792 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
793 /* 11-16; NOTE(review): presumably a latency range whose upper bound is used for DIVSS below -- confirm. */
794 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
795 COSTS_N_INSNS (24), /* cost of DIVSD instruction. */
796 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
797 COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */
798 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
799 athlon_memcpy,
800 athlon_memset,
801 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
802 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
805 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
806 small blocks it is better to use a loop. For large blocks, a libcall can
807 use non-temporal accesses and beat inline code considerably. */
/* memcpy strategy table referenced by k8_cost.  Both entries start with
   libcall and then list {bound, algorithm, flag} triples ending with a -1
   bound: entry 0 uses loop (6), unrolled_loop (14) and rep_prefix_4_byte
   (-1); entry 1 uses loop (16), rep_prefix_8_byte (8192) and libcall (-1).
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, the leading algorithm covers unknown sizes, and each bound is the
   largest block size handled by that algorithm -- confirm against the
   stringop_algs declaration.  */
808 static stringop_algs k8_memcpy[2] = {
809 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
810 {-1, rep_prefix_4_byte, false}}},
811 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
812 {-1, libcall, false}}}};
/* memset strategy table referenced by k8_cost.  Both entries start with
   libcall and then list {bound, algorithm, flag} triples ending with a -1
   bound: entry 0 uses loop (8), unrolled_loop (24), rep_prefix_4_byte
   (2048) and libcall (-1); entry 1 uses unrolled_loop (48),
   rep_prefix_8_byte (8192) and libcall (-1).
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each bound is the largest block size handled by that
   algorithm -- confirm against the stringop_algs declaration.  */
813 static stringop_algs k8_memset[2] = {
814 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
815 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
816 {libcall, {{48, unrolled_loop, false},
817 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Operation cost table named k8_cost.  The values are positional
   initializers, so their order must match the field order of struct
   processor_costs (declared elsewhere); the trailing comment on each line
   names the quantity it fills in.  Per the file header, costs are relative
   to an add.  The table ends by pointing at k8_memcpy and k8_memset.  */
818 static const
819 struct processor_costs k8_cost = {
820 COSTS_N_INSNS (1), /* cost of an add instruction */
821 COSTS_N_INSNS (2), /* cost of a lea instruction */
822 COSTS_N_INSNS (1), /* variable shift costs */
823 COSTS_N_INSNS (1), /* constant shift costs */
824 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
825 COSTS_N_INSNS (4), /* HI */
826 COSTS_N_INSNS (3), /* SI */
827 COSTS_N_INSNS (4), /* DI */
828 COSTS_N_INSNS (5)}, /* other */
829 0, /* cost of multiply per each bit set */
830 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
831 COSTS_N_INSNS (26), /* HI */
832 COSTS_N_INSNS (42), /* SI */
833 COSTS_N_INSNS (74), /* DI */
834 COSTS_N_INSNS (74)}, /* other */
835 COSTS_N_INSNS (1), /* cost of movsx */
836 COSTS_N_INSNS (1), /* cost of movzx */
837 8, /* "large" insn */
838 9, /* MOVE_RATIO */
840 /* All move costs are relative to integer->integer move times 2 and thus
841 they are latency*2. */
842 4, /* cost for loading QImode using movzbl */
843 {3, 4, 3}, /* cost of loading integer registers
844 in QImode, HImode and SImode.
845 Relative to reg-reg move (2). */
846 {3, 4, 3}, /* cost of storing integer registers */
847 4, /* cost of reg,reg fld/fst */
848 {4, 4, 12}, /* cost of loading fp registers
849 in SFmode, DFmode and XFmode */
850 {6, 6, 8}, /* cost of storing fp registers
851 in SFmode, DFmode and XFmode */
852 2, /* cost of moving MMX register */
853 {3, 3}, /* cost of loading MMX registers
854 in SImode and DImode */
855 {4, 4}, /* cost of storing MMX registers
856 in SImode and DImode */
857 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
858 {4, 3, 6, 12, 24}, /* cost of loading SSE registers
859 in 32,64,128,256 and 512-bit */
860 {4, 3, 6, 12, 24}, /* cost of unaligned loads. */
861 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
862 in 32,64,128,256 and 512-bit */
863 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
864 5, 5, /* SSE->integer and integer->SSE moves */
865 4, 4, /* Gather load static, per_elt. */
866 4, 4, /* Gather store static, per_elt. */
867 64, /* size of l1 cache. */
868 512, /* size of l2 cache. */
869 64, /* size of prefetch block */
870 /* New AMD processors never drop prefetches; if they cannot be performed
871 immediately, they are queued. We set number of simultaneous prefetches
872 to a large constant to reflect this (it probably is not a good idea not
873 to limit number of prefetches at all, as their execution also takes some
874 time). */
875 100, /* number of parallel prefetches */
876 3, /* Branch cost */
877 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
878 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
879 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
880 COSTS_N_INSNS (2), /* cost of FABS instruction. */
881 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
882 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
884 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
885 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
886 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
887 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
888 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
889 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
890 /* 11-16; NOTE(review): presumably a latency range whose upper bound is used for DIVSS below -- confirm. */
891 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
892 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
893 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
894 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
895 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
896 k8_memcpy,
897 k8_memset,
898 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
899 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
902 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
903 very small blocks it is better to use a loop. For large blocks, a libcall can
904 use non-temporal accesses and beat inline code considerably. */
/* memcpy strategy table referenced by amdfam10_cost.  Both entries start
   with libcall and then list {bound, algorithm, flag} triples ending with
   a -1 bound: entry 0 uses loop (6), unrolled_loop (14) and
   rep_prefix_4_byte (-1); entry 1 uses loop (16), rep_prefix_8_byte (8192)
   and libcall (-1).
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each bound is the largest block size handled by that
   algorithm -- confirm against the stringop_algs declaration.  */
905 static stringop_algs amdfam10_memcpy[2] = {
906 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
907 {-1, rep_prefix_4_byte, false}}},
908 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
909 {-1, libcall, false}}}};
/* memset strategy table referenced by amdfam10_cost.  Both entries start
   with libcall and then list {bound, algorithm, flag} triples ending with
   a -1 bound: entry 0 uses loop (8), unrolled_loop (24), rep_prefix_4_byte
   (2048) and libcall (-1); entry 1 uses unrolled_loop (48),
   rep_prefix_8_byte (8192) and libcall (-1).
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each bound is the largest block size handled by that
   algorithm -- confirm against the stringop_algs declaration.  */
910 static stringop_algs amdfam10_memset[2] = {
911 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
912 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
913 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
914 {-1, libcall, false}}}};
/* Operation cost table named amdfam10_cost.  The values are positional
   initializers, so their order must match the field order of struct
   processor_costs (declared elsewhere); the trailing comment on each line
   names the quantity it fills in.  Per the file header, costs are relative
   to an add.  The table ends by pointing at amdfam10_memcpy and
   amdfam10_memset.
   NOTE(review): unlike athlon_cost and k8_cost, this table is declared
   without static or const -- confirm whether that is intentional.  */
915 struct processor_costs amdfam10_cost = {
916 COSTS_N_INSNS (1), /* cost of an add instruction */
917 COSTS_N_INSNS (2), /* cost of a lea instruction */
918 COSTS_N_INSNS (1), /* variable shift costs */
919 COSTS_N_INSNS (1), /* constant shift costs */
920 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
921 COSTS_N_INSNS (4), /* HI */
922 COSTS_N_INSNS (3), /* SI */
923 COSTS_N_INSNS (4), /* DI */
924 COSTS_N_INSNS (5)}, /* other */
925 0, /* cost of multiply per each bit set */
926 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
927 COSTS_N_INSNS (35), /* HI */
928 COSTS_N_INSNS (51), /* SI */
929 COSTS_N_INSNS (83), /* DI */
930 COSTS_N_INSNS (83)}, /* other */
931 COSTS_N_INSNS (1), /* cost of movsx */
932 COSTS_N_INSNS (1), /* cost of movzx */
933 8, /* "large" insn */
934 9, /* MOVE_RATIO */
936 /* All move costs are relative to integer->integer move times 2 and thus
937 they are latency*2. */
938 4, /* cost for loading QImode using movzbl */
939 {3, 4, 3}, /* cost of loading integer registers
940 in QImode, HImode and SImode.
941 Relative to reg-reg move (2). */
942 {3, 4, 3}, /* cost of storing integer registers */
943 4, /* cost of reg,reg fld/fst */
944 {4, 4, 12}, /* cost of loading fp registers
945 in SFmode, DFmode and XFmode */
946 {6, 6, 8}, /* cost of storing fp registers
947 in SFmode, DFmode and XFmode */
948 2, /* cost of moving MMX register */
949 {3, 3}, /* cost of loading MMX registers
950 in SImode and DImode */
951 {4, 4}, /* cost of storing MMX registers
952 in SImode and DImode */
953 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
954 {4, 4, 3, 6, 12}, /* cost of loading SSE registers
955 in 32,64,128,256 and 512-bit */
956 {4, 4, 3, 7, 12}, /* cost of unaligned loads. */
957 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
958 in 32,64,128,256 and 512-bit */
959 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
960 3, 3, /* SSE->integer and integer->SSE moves */
961 /* On K8:
962 MOVD reg64, xmmreg Double FSTORE 4
963 MOVD reg32, xmmreg Double FSTORE 4
964 On AMDFAM10:
965 MOVD reg64, xmmreg Double FADD 3
966 1/1 1/1
967 MOVD reg32, xmmreg Double FADD 3
968 1/1 1/1 */
969 4, 4, /* Gather load static, per_elt. */
970 4, 4, /* Gather store static, per_elt. */
971 64, /* size of l1 cache. */
972 512, /* size of l2 cache. */
973 64, /* size of prefetch block */
974 /* New AMD processors never drop prefetches; if they cannot be performed
975 immediately, they are queued. We set number of simultaneous prefetches
976 to a large constant to reflect this (it probably is not a good idea not
977 to limit number of prefetches at all, as their execution also takes some
978 time). */
979 100, /* number of parallel prefetches */
980 2, /* Branch cost */
981 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
982 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
983 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
984 COSTS_N_INSNS (2), /* cost of FABS instruction. */
985 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
986 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
988 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
989 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
990 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
991 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
992 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
993 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
994 /* 11-16; NOTE(review): presumably a latency range whose upper bound is used for DIVSS below -- confirm. */
995 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
996 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
997 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
998 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
999 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1000 amdfam10_memcpy,
1001 amdfam10_memset,
1002 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1003 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1006 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
1007 very small blocks it is better to use a loop. For large blocks, a libcall
1008 can use non-temporal accesses and beat inline code considerably. */
/* memcpy strategy table referenced by bdver1_cost.  Both entries start
   with libcall and then list {bound, algorithm, flag} triples ending with
   a -1 bound: entry 0 uses loop (6), unrolled_loop (14) and
   rep_prefix_4_byte (-1); entry 1 uses loop (16), rep_prefix_8_byte (8192)
   and libcall (-1).
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each bound is the largest block size handled by that
   algorithm -- confirm against the stringop_algs declaration.  */
1009 static stringop_algs bdver1_memcpy[2] = {
1010 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1011 {-1, rep_prefix_4_byte, false}}},
1012 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1013 {-1, libcall, false}}}};
/* memset strategy table referenced by bdver1_cost.  Both entries start
   with libcall and then list {bound, algorithm, flag} triples ending with
   a -1 bound: entry 0 uses loop (8), unrolled_loop (24), rep_prefix_4_byte
   (2048) and libcall (-1); entry 1 uses unrolled_loop (48),
   rep_prefix_8_byte (8192) and libcall (-1).
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each bound is the largest block size handled by that
   algorithm -- confirm against the stringop_algs declaration.  */
1014 static stringop_algs bdver1_memset[2] = {
1015 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1016 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1017 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1018 {-1, libcall, false}}}};
/* Operation cost table named bdver1_cost.  The values are positional
   initializers, so their order must match the field order of struct
   processor_costs (declared elsewhere); the trailing comment on each line
   names the quantity it fills in.  Per the file header, costs are relative
   to an add.  The table ends by pointing at bdver1_memcpy and
   bdver1_memset.
   NOTE(review): declared const but not static, unlike athlon_cost and
   k8_cost -- confirm whether that is intentional.  */
1020 const struct processor_costs bdver1_cost = {
1021 COSTS_N_INSNS (1), /* cost of an add instruction */
1022 COSTS_N_INSNS (1), /* cost of a lea instruction */
1023 COSTS_N_INSNS (1), /* variable shift costs */
1024 COSTS_N_INSNS (1), /* constant shift costs */
1025 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1026 COSTS_N_INSNS (4), /* HI */
1027 COSTS_N_INSNS (4), /* SI */
1028 COSTS_N_INSNS (6), /* DI */
1029 COSTS_N_INSNS (6)}, /* other */
1030 0, /* cost of multiply per each bit set */
1031 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1032 COSTS_N_INSNS (35), /* HI */
1033 COSTS_N_INSNS (51), /* SI */
1034 COSTS_N_INSNS (83), /* DI */
1035 COSTS_N_INSNS (83)}, /* other */
1036 COSTS_N_INSNS (1), /* cost of movsx */
1037 COSTS_N_INSNS (1), /* cost of movzx */
1038 8, /* "large" insn */
1039 9, /* MOVE_RATIO */
1041 /* All move costs are relative to integer->integer move times 2 and thus
1042 they are latency*2. */
1043 8, /* cost for loading QImode using movzbl */
1044 {8, 8, 8}, /* cost of loading integer registers
1045 in QImode, HImode and SImode.
1046 Relative to reg-reg move (2). */
1047 {8, 8, 8}, /* cost of storing integer registers */
1048 4, /* cost of reg,reg fld/fst */
1049 {12, 12, 28}, /* cost of loading fp registers
1050 in SFmode, DFmode and XFmode */
1051 {10, 10, 18}, /* cost of storing fp registers
1052 in SFmode, DFmode and XFmode */
1053 4, /* cost of moving MMX register */
1054 {12, 12}, /* cost of loading MMX registers
1055 in SImode and DImode */
1056 {10, 10}, /* cost of storing MMX registers
1057 in SImode and DImode */
1058 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1059 {12, 12, 10, 20, 30}, /* cost of loading SSE registers
1060 in 32,64,128,256 and 512-bit */
1061 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
1062 {10, 10, 10, 20, 30}, /* cost of storing SSE registers
1063 in 32,64,128,256 and 512-bit */
1064 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
1065 16, 20, /* SSE->integer and integer->SSE moves */
1066 12, 12, /* Gather load static, per_elt. */
1067 10, 10, /* Gather store static, per_elt. */
1068 16, /* size of l1 cache. */
1069 2048, /* size of l2 cache. */
1070 64, /* size of prefetch block */
1071 /* New AMD processors never drop prefetches; if they cannot be performed
1072 immediately, they are queued. We set number of simultaneous prefetches
1073 to a large constant to reflect this (it probably is not a good idea not
1074 to limit number of prefetches at all, as their execution also takes some
1075 time). */
1076 100, /* number of parallel prefetches */
1077 2, /* Branch cost */
1078 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1079 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1080 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1081 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1082 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1083 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1085 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1086 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1087 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1088 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1089 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1090 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1091 /* 9-24; NOTE(review): presumably a latency range whose upper bound is used for DIVSS below -- confirm. */
1092 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1093 /* 9-27; NOTE(review): presumably a latency range whose upper bound is used for DIVSD below -- confirm. */
1094 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1095 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1096 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1097 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1098 bdver1_memcpy,
1099 bdver1_memset,
1100 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1101 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1104 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1105 very small blocks it is better to use a loop. For large blocks, a libcall
1106 can use non-temporal accesses and beat inline code considerably. */
/* memcpy strategy table referenced by bdver2_cost.  Both entries start
   with libcall and then list {bound, algorithm, flag} triples ending with
   a -1 bound: entry 0 uses loop (6), unrolled_loop (14) and
   rep_prefix_4_byte (-1); entry 1 uses loop (16), rep_prefix_8_byte (8192)
   and libcall (-1).
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each bound is the largest block size handled by that
   algorithm -- confirm against the stringop_algs declaration.  */
1108 static stringop_algs bdver2_memcpy[2] = {
1109 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1110 {-1, rep_prefix_4_byte, false}}},
1111 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1112 {-1, libcall, false}}}};
/* memset strategy table referenced by bdver2_cost.  Both entries start
   with libcall and then list {bound, algorithm, flag} triples ending with
   a -1 bound: entry 0 uses loop (8), unrolled_loop (24), rep_prefix_4_byte
   (2048) and libcall (-1); entry 1 uses unrolled_loop (48),
   rep_prefix_8_byte (8192) and libcall (-1).
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each bound is the largest block size handled by that
   algorithm -- confirm against the stringop_algs declaration.  */
1113 static stringop_algs bdver2_memset[2] = {
1114 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1115 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1116 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1117 {-1, libcall, false}}}};
/* Operation cost table named bdver2_cost.  The values are positional
   initializers, so their order must match the field order of struct
   processor_costs (declared elsewhere); the trailing comment on each line
   names the quantity it fills in.  Per the file header, costs are relative
   to an add.  The table ends by pointing at bdver2_memcpy and
   bdver2_memset.
   NOTE(review): declared const but not static, unlike athlon_cost and
   k8_cost -- confirm whether that is intentional.  */
1119 const struct processor_costs bdver2_cost = {
1120 COSTS_N_INSNS (1), /* cost of an add instruction */
1121 COSTS_N_INSNS (1), /* cost of a lea instruction */
1122 COSTS_N_INSNS (1), /* variable shift costs */
1123 COSTS_N_INSNS (1), /* constant shift costs */
1124 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1125 COSTS_N_INSNS (4), /* HI */
1126 COSTS_N_INSNS (4), /* SI */
1127 COSTS_N_INSNS (6), /* DI */
1128 COSTS_N_INSNS (6)}, /* other */
1129 0, /* cost of multiply per each bit set */
1130 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1131 COSTS_N_INSNS (35), /* HI */
1132 COSTS_N_INSNS (51), /* SI */
1133 COSTS_N_INSNS (83), /* DI */
1134 COSTS_N_INSNS (83)}, /* other */
1135 COSTS_N_INSNS (1), /* cost of movsx */
1136 COSTS_N_INSNS (1), /* cost of movzx */
1137 8, /* "large" insn */
1138 9, /* MOVE_RATIO */
1140 /* All move costs are relative to integer->integer move times 2 and thus
1141 they are latency*2. */
1142 8, /* cost for loading QImode using movzbl */
1143 {8, 8, 8}, /* cost of loading integer registers
1144 in QImode, HImode and SImode.
1145 Relative to reg-reg move (2). */
1146 {8, 8, 8}, /* cost of storing integer registers */
1147 4, /* cost of reg,reg fld/fst */
1148 {12, 12, 28}, /* cost of loading fp registers
1149 in SFmode, DFmode and XFmode */
1150 {10, 10, 18}, /* cost of storing fp registers
1151 in SFmode, DFmode and XFmode */
1152 4, /* cost of moving MMX register */
1153 {12, 12}, /* cost of loading MMX registers
1154 in SImode and DImode */
1155 {10, 10}, /* cost of storing MMX registers
1156 in SImode and DImode */
1157 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1158 {12, 12, 10, 20, 30}, /* cost of loading SSE registers
1159 in 32,64,128,256 and 512-bit */
1160 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
1161 {10, 10, 10, 20, 30}, /* cost of storing SSE registers
1162 in 32,64,128,256 and 512-bit */
1163 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
1164 16, 20, /* SSE->integer and integer->SSE moves */
1165 12, 12, /* Gather load static, per_elt. */
1166 10, 10, /* Gather store static, per_elt. */
1167 16, /* size of l1 cache. */
1168 2048, /* size of l2 cache. */
1169 64, /* size of prefetch block */
1170 /* New AMD processors never drop prefetches; if they cannot be performed
1171 immediately, they are queued. We set number of simultaneous prefetches
1172 to a large constant to reflect this (it probably is not a good idea not
1173 to limit number of prefetches at all, as their execution also takes some
1174 time). */
1175 100, /* number of parallel prefetches */
1176 2, /* Branch cost */
1177 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1178 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1179 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1180 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1181 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1182 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1184 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1185 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1186 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1187 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1188 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1189 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1190 /* 9-24; NOTE(review): presumably a latency range whose upper bound is used for DIVSS below -- confirm. */
1191 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1192 /* 9-27; NOTE(review): presumably a latency range whose upper bound is used for DIVSD below -- confirm. */
1193 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1194 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1195 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1196 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1197 bdver2_memcpy,
1198 bdver2_memset,
1199 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1200 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1204 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1205 very small blocks it is better to use a loop. For large blocks, a libcall
1206 can use non-temporal accesses and beat inline code considerably. */
/* memcpy strategy table referenced by bdver3_cost.  Both entries start
   with libcall and then list {bound, algorithm, flag} triples ending with
   a -1 bound: entry 0 uses loop (6), unrolled_loop (14) and
   rep_prefix_4_byte (-1); entry 1 uses loop (16), rep_prefix_8_byte (8192)
   and libcall (-1).
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each bound is the largest block size handled by that
   algorithm -- confirm against the stringop_algs declaration.  */
1207 static stringop_algs bdver3_memcpy[2] = {
1208 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1209 {-1, rep_prefix_4_byte, false}}},
1210 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1211 {-1, libcall, false}}}};
/* memset strategy table referenced by bdver3_cost.  Both entries start
   with libcall and then list {bound, algorithm, flag} triples ending with
   a -1 bound: entry 0 uses loop (8), unrolled_loop (24), rep_prefix_4_byte
   (2048) and libcall (-1); entry 1 uses unrolled_loop (48),
   rep_prefix_8_byte (8192) and libcall (-1).
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each bound is the largest block size handled by that
   algorithm -- confirm against the stringop_algs declaration.  */
1212 static stringop_algs bdver3_memset[2] = {
1213 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1214 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1215 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1216 {-1, libcall, false}}}};
/* Operation cost table named bdver3_cost.  The values are positional
   initializers, so their order must match the field order of struct
   processor_costs (declared elsewhere); the trailing comment on each line
   names the quantity it fills in.  Per the file header, costs are relative
   to an add.  The table ends by pointing at bdver3_memcpy and
   bdver3_memset.
   NOTE(review): declared without static or const, unlike athlon_cost and
   k8_cost -- confirm whether that is intentional.  */
1217 struct processor_costs bdver3_cost = {
1218 COSTS_N_INSNS (1), /* cost of an add instruction */
1219 COSTS_N_INSNS (1), /* cost of a lea instruction */
1220 COSTS_N_INSNS (1), /* variable shift costs */
1221 COSTS_N_INSNS (1), /* constant shift costs */
1222 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1223 COSTS_N_INSNS (4), /* HI */
1224 COSTS_N_INSNS (4), /* SI */
1225 COSTS_N_INSNS (6), /* DI */
1226 COSTS_N_INSNS (6)}, /* other */
1227 0, /* cost of multiply per each bit set */
1228 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1229 COSTS_N_INSNS (35), /* HI */
1230 COSTS_N_INSNS (51), /* SI */
1231 COSTS_N_INSNS (83), /* DI */
1232 COSTS_N_INSNS (83)}, /* other */
1233 COSTS_N_INSNS (1), /* cost of movsx */
1234 COSTS_N_INSNS (1), /* cost of movzx */
1235 8, /* "large" insn */
1236 9, /* MOVE_RATIO */
1238 /* All move costs are relative to integer->integer move times 2 and thus
1239 they are latency*2. */
1240 8, /* cost for loading QImode using movzbl */
1241 {8, 8, 8}, /* cost of loading integer registers
1242 in QImode, HImode and SImode.
1243 Relative to reg-reg move (2). */
1244 {8, 8, 8}, /* cost of storing integer registers */
1245 4, /* cost of reg,reg fld/fst */
1246 {12, 12, 28}, /* cost of loading fp registers
1247 in SFmode, DFmode and XFmode */
1248 {10, 10, 18}, /* cost of storing fp registers
1249 in SFmode, DFmode and XFmode */
1250 4, /* cost of moving MMX register */
1251 {12, 12}, /* cost of loading MMX registers
1252 in SImode and DImode */
1253 {10, 10}, /* cost of storing MMX registers
1254 in SImode and DImode */
1255 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1256 {12, 12, 10, 20, 30}, /* cost of loading SSE registers
1257 in 32,64,128,256 and 512-bit */
1258 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
1259 {10, 10, 10, 20, 30}, /* cost of storing SSE registers
1260 in 32,64,128,256 and 512-bit */
1261 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
1262 16, 20, /* SSE->integer and integer->SSE moves */
1263 12, 12, /* Gather load static, per_elt. */
1264 10, 10, /* Gather store static, per_elt. */
1265 16, /* size of l1 cache. */
1266 2048, /* size of l2 cache. */
1267 64, /* size of prefetch block */
1268 /* New AMD processors never drop prefetches; if they cannot be performed
1269 immediately, they are queued. We set number of simultaneous prefetches
1270 to a large constant to reflect this (it probably is not a good idea not
1271 to limit number of prefetches at all, as their execution also takes some
1272 time). */
1273 100, /* number of parallel prefetches */
1274 2, /* Branch cost */
1275 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1276 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1277 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1278 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1279 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1280 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1282 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1283 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1284 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1285 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1286 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1287 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1288 /* 9-24; NOTE(review): presumably a latency range whose upper bound is used for DIVSS below -- confirm. */
1289 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1290 /* 9-27; NOTE(review): presumably a latency range whose upper bound is used for DIVSD below -- confirm. */
1291 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1292 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1293 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1294 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1295 bdver3_memcpy,
1296 bdver3_memset,
1297 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1298 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1301 /* BDVER4 has an optimized REP instruction for medium-sized blocks, but for
1302 very small blocks it is better to use a loop. For large blocks, a libcall
1303 can use non-temporal accesses and beat inline code considerably. */
/* memcpy strategy table referenced by bdver4_cost.  Both entries start
   with libcall and then list {bound, algorithm, flag} triples ending with
   a -1 bound: entry 0 uses loop (6), unrolled_loop (14) and
   rep_prefix_4_byte (-1); entry 1 uses loop (16), rep_prefix_8_byte (8192)
   and libcall (-1).
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each bound is the largest block size handled by that
   algorithm -- confirm against the stringop_algs declaration.  */
1304 static stringop_algs bdver4_memcpy[2] = {
1305 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1306 {-1, rep_prefix_4_byte, false}}},
1307 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1308 {-1, libcall, false}}}};
/* memset strategy table referenced by bdver4_cost.  Both entries start
   with libcall and then list {bound, algorithm, flag} triples ending with
   a -1 bound: entry 0 uses loop (8), unrolled_loop (24), rep_prefix_4_byte
   (2048) and libcall (-1); entry 1 uses unrolled_loop (48),
   rep_prefix_8_byte (8192) and libcall (-1).
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each bound is the largest block size handled by that
   algorithm -- confirm against the stringop_algs declaration.  */
1309 static stringop_algs bdver4_memset[2] = {
1310 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1311 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1312 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1313 {-1, libcall, false}}}};
/* Operation cost table named bdver4_cost.  The values are positional
   initializers, so their order must match the field order of struct
   processor_costs (declared elsewhere); the trailing comment on each line
   names the quantity it fills in.  Per the file header, costs are relative
   to an add.  The table ends by pointing at bdver4_memcpy and
   bdver4_memset.
   NOTE(review): declared without static or const, unlike athlon_cost and
   k8_cost -- confirm whether that is intentional.  */
1314 struct processor_costs bdver4_cost = {
1315 COSTS_N_INSNS (1), /* cost of an add instruction */
1316 COSTS_N_INSNS (1), /* cost of a lea instruction */
1317 COSTS_N_INSNS (1), /* variable shift costs */
1318 COSTS_N_INSNS (1), /* constant shift costs */
1319 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1320 COSTS_N_INSNS (4), /* HI */
1321 COSTS_N_INSNS (4), /* SI */
1322 COSTS_N_INSNS (6), /* DI */
1323 COSTS_N_INSNS (6)}, /* other */
1324 0, /* cost of multiply per each bit set */
1325 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1326 COSTS_N_INSNS (35), /* HI */
1327 COSTS_N_INSNS (51), /* SI */
1328 COSTS_N_INSNS (83), /* DI */
1329 COSTS_N_INSNS (83)}, /* other */
1330 COSTS_N_INSNS (1), /* cost of movsx */
1331 COSTS_N_INSNS (1), /* cost of movzx */
1332 8, /* "large" insn */
1333 9, /* MOVE_RATIO */
1335 /* All move costs are relative to integer->integer move times 2 and thus
1336 they are latency*2. */
1337 8, /* cost for loading QImode using movzbl */
1338 {8, 8, 8}, /* cost of loading integer registers
1339 in QImode, HImode and SImode.
1340 Relative to reg-reg move (2). */
1341 {8, 8, 8}, /* cost of storing integer registers */
1342 4, /* cost of reg,reg fld/fst */
1343 {12, 12, 28}, /* cost of loading fp registers
1344 in SFmode, DFmode and XFmode */
1345 {10, 10, 18}, /* cost of storing fp registers
1346 in SFmode, DFmode and XFmode */
1347 4, /* cost of moving MMX register */
1348 {12, 12}, /* cost of loading MMX registers
1349 in SImode and DImode */
1350 {10, 10}, /* cost of storing MMX registers
1351 in SImode and DImode */
1352 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1353 {12, 12, 10, 20, 30}, /* cost of loading SSE registers
1354 in 32,64,128,256 and 512-bit */
1355 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
1356 {10, 10, 10, 20, 30}, /* cost of storing SSE registers
1357 in 32,64,128,256 and 512-bit */
1358 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
1359 16, 20, /* SSE->integer and integer->SSE moves */
1360 12, 12, /* Gather load static, per_elt. */
1361 10, 10, /* Gather store static, per_elt. */
1362 16, /* size of l1 cache. */
1363 2048, /* size of l2 cache. */
1364 64, /* size of prefetch block */
1365 /* New AMD processors never drop prefetches; if they cannot be performed
1366 immediately, they are queued. We set number of simultaneous prefetches
1367 to a large constant to reflect this (it probably is not a good idea not
1368 to limit number of prefetches at all, as their execution also takes some
1369 time). */
1370 100, /* number of parallel prefetches */
1371 2, /* Branch cost */
1372 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1373 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1374 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1375 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1376 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1377 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1379 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1380 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1381 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1382 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1383 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1384 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1385 /* 9-24; NOTE(review): presumably a latency range whose upper bound is used for DIVSS below -- confirm. */
1386 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1387 /* 9-27; NOTE(review): presumably a latency range whose upper bound is used for DIVSD below -- confirm. */
1388 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1389 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1390 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1391 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1392 bdver4_memcpy,
1393 bdver4_memset,
1394 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1395 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1399 /* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1400 very small blocks it is better to use a loop. For large blocks, a libcall
1401 can use non-temporal accesses and beat inline code considerably. */
/* memcpy strategy table for znver1.  Both entries start with libcall and
   then list {bound, algorithm, flag} triples ending with a -1 bound:
   entry 0 uses loop (6), unrolled_loop (14) and rep_prefix_4_byte (-1);
   entry 1 uses loop (16), rep_prefix_8_byte (8192) and libcall (-1).
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each bound is the largest block size handled by that
   algorithm -- confirm against the stringop_algs declaration.  */
1402 static stringop_algs znver1_memcpy[2] = {
1403 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1404 {-1, rep_prefix_4_byte, false}}},
1405 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1406 {-1, libcall, false}}}};
/* memset strategy table for znver1.  Both entries start with libcall and
   then list {bound, algorithm, flag} triples ending with a -1 bound:
   entry 0 uses loop (8), unrolled_loop (24), rep_prefix_4_byte (2048) and
   libcall (-1); entry 1 uses unrolled_loop (48), rep_prefix_8_byte (8192)
   and libcall (-1).
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each bound is the largest block size handled by that
   algorithm -- confirm against the stringop_algs declaration.  */
1407 static stringop_algs znver1_memset[2] = {
1408 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1409 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1410 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1411 {-1, libcall, false}}}};
1412 struct processor_costs znver1_cost = {
1413 COSTS_N_INSNS (1), /* cost of an add instruction. */
1414 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1415 COSTS_N_INSNS (1), /* variable shift costs. */
1416 COSTS_N_INSNS (1), /* constant shift costs. */
1417 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1418 COSTS_N_INSNS (3), /* HI. */
1419 COSTS_N_INSNS (3), /* SI. */
1420 COSTS_N_INSNS (3), /* DI. */
1421 COSTS_N_INSNS (3)}, /* other. */
1422 0, /* cost of multiply per each bit
1423 set. */
1424 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1425 bound. */
1426 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1427 COSTS_N_INSNS (22), /* HI. */
1428 COSTS_N_INSNS (30), /* SI. */
1429 COSTS_N_INSNS (45), /* DI. */
1430 COSTS_N_INSNS (45)}, /* other. */
1431 COSTS_N_INSNS (1), /* cost of movsx. */
1432 COSTS_N_INSNS (1), /* cost of movzx. */
1433 8, /* "large" insn. */
1434 9, /* MOVE_RATIO. */
1436 /* All move costs are relative to integer->integer move times 2 and thus
1437 they are latency*2. */
1439 /* reg-reg moves are done by renaming and thus they are even cheaper than
1440 1 cycle. Because reg-reg move cost is 2 and the following tables correspond
1441 to doubles of latencies, we do not model this correctly. It does not
1442 seem to make a practical difference to bump prices up even more. */
1443 6, /* cost for loading QImode using
1444 movzbl. */
1445 {6, 6, 6}, /* cost of loading integer registers
1446 in QImode, HImode and SImode.
1447 Relative to reg-reg move (2). */
1448 {8, 8, 8}, /* cost of storing integer
1449 registers. */
1450 2, /* cost of reg,reg fld/fst. */
1451 {6, 6, 16}, /* cost of loading fp registers
1452 in SFmode, DFmode and XFmode. */
1453 {8, 8, 16}, /* cost of storing fp registers
1454 in SFmode, DFmode and XFmode. */
1455 2, /* cost of moving MMX register. */
1456 {6, 6}, /* cost of loading MMX registers
1457 in SImode and DImode. */
1458 {8, 8}, /* cost of storing MMX registers
1459 in SImode and DImode. */
1460 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
1461 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
1462 in 32,64,128,256 and 512-bit. */
1463 {6, 6, 6, 10, 20}, /* cost of unaligned loads. */
1464 {8, 8, 8, 8, 16}, /* cost of storing SSE registers
1465 in 32,64,128,256 and 512-bit. */
1466 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1467 6, 6, /* SSE->integer and integer->SSE moves. */
1468 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1469 throughput 12. Approx 9 uops do not depend on vector size and every load
1470 is 7 uops. */
1471 18, 8, /* Gather load static, per_elt. */
1472 18, 10, /* Gather store static, per_elt. */
1473 32, /* size of l1 cache. */
1474 512, /* size of l2 cache. */
1475 64, /* size of prefetch block. */
1476 /* New AMD processors never drop prefetches; if they cannot be performed
1477 immediately, they are queued. We set number of simultaneous prefetches
1478 to a large constant to reflect this (it probably is not a good idea not
1479 to limit number of prefetches at all, as their execution also takes some
1480 time). */
1481 100, /* number of parallel prefetches. */
1482 3, /* Branch cost. */
1483 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1484 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1485 /* Latency of fdiv is 8-15. */
1486 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1487 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1488 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1489 /* Latency of fsqrt is 4-10. */
1490 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1492 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1493 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1494 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1495 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1496 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1497 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1498 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1499 /* 9-13 */
1500 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1501 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1502 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1503 /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
1504 and it can execute 2 integer additions and 2 multiplications thus
1505 reassociation may make sense up to with of 6. SPEC2k6 bencharks suggests
1506 that 4 works better than 6 probably due to register pressure.
1508 Integer vector operations are taken by FP unit and execute 3 vector
1509 plus/minus operations per cycle but only one multiply. This is adjusted
1510 in ix86_reassociation_width. */
1511 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1512 znver1_memcpy,
1513 znver1_memset,
1514 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1515 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1518 /* skylake_cost should produce code tuned for the Skylake family of CPUs.
   skylake_memcpy is the memcpy algorithm table it refers to: two entries,
   each a default algorithm (libcall) plus {limit, algorithm, flag} steps
   closed by a step with limit -1.
   NOTE(review): the limits are presumably maximum block sizes in bytes and
   the two entries presumably the 32-bit and 64-bit variants -- confirm
   against the stringop_algs declaration. */
1519 static stringop_algs skylake_memcpy[2] = {
1520 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1521 {libcall, {{16, loop, false}, {512, rep_prefix_8_byte, false},
1522 {-1, libcall, false}}}};
/* memset algorithm table that skylake_cost refers to.  Two entries, each a
   default algorithm (libcall) plus {limit, algorithm, flag} steps closed by
   a step with limit -1; several steps here set the flag to true.
   NOTE(review): the limits are presumably maximum block sizes in bytes and
   the two entries presumably the 32-bit and 64-bit variants -- confirm
   against the stringop_algs declaration. */
1524 static stringop_algs skylake_memset[2] = {
1525 {libcall, {{6, loop_1_byte, true},
1526 {24, loop, true},
1527 {8192, rep_prefix_4_byte, true},
1528 {-1, libcall, false}}},
1529 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, false},
1530 {-1, libcall, false}}}};
/* Cost table skylake_cost.  The initializers are positional: the trailing
   comment on each value names the field it sets, so the order must match
   struct processor_costs (declared outside this view). */
1532 static const
1533 struct processor_costs skylake_cost = {
1534 COSTS_N_INSNS (1), /* cost of an add instruction */
1535 COSTS_N_INSNS (1)+1, /* cost of a lea instruction */
1536 COSTS_N_INSNS (1), /* variable shift costs */
1537 COSTS_N_INSNS (1), /* constant shift costs */
1538 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1539 COSTS_N_INSNS (4), /* HI */
1540 COSTS_N_INSNS (3), /* SI */
1541 COSTS_N_INSNS (3), /* DI */
1542 COSTS_N_INSNS (3)}, /* other */
1543 0, /* cost of multiply per each bit set */
1544 /* Expanding div/mod currently doesn't consider parallelism. So the cost
1545 model is not realistic. We compensate by increasing the latencies a bit. */
1546 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
1547 COSTS_N_INSNS (11), /* HI */
1548 COSTS_N_INSNS (14), /* SI */
1549 COSTS_N_INSNS (76), /* DI */
1550 COSTS_N_INSNS (76)}, /* other */
1551 COSTS_N_INSNS (1), /* cost of movsx */
1552 COSTS_N_INSNS (0), /* cost of movzx */
1553 8, /* "large" insn */
1554 17, /* MOVE_RATIO */
1556 6, /* cost for loading QImode using movzbl */
1557 {4, 4, 4}, /* cost of loading integer registers
1558 in QImode, HImode and SImode.
1559 Relative to reg-reg move (2). */
1560 {6, 6, 3}, /* cost of storing integer registers.
   NOTE(review): the third entry (3) is lower than the first two (6) and
   lower than every integer load cost above (4) -- confirm intentional. */
1561 2, /* cost of reg,reg fld/fst */
1562 {6, 6, 8}, /* cost of loading fp registers
1563 in SFmode, DFmode and XFmode */
1564 {6, 6, 10}, /* cost of storing fp registers
1565 in SFmode, DFmode and XFmode */
1566 2, /* cost of moving MMX register */
1567 {6, 6}, /* cost of loading MMX registers
1568 in SImode and DImode */
1569 {6, 6}, /* cost of storing MMX registers
1570 in SImode and DImode */
1571 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
1572 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
1573 in 32,64,128,256 and 512-bit */
1574 {6, 6, 6, 10, 20}, /* cost of unaligned loads. */
1575 {8, 8, 8, 12, 24}, /* cost of storing SSE registers
1576 in 32,64,128,256 and 512-bit */
1577 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1578 2, 2, /* SSE->integer and integer->SSE moves */
1579 20, 8, /* Gather load static, per_elt. */
1580 22, 10, /* Gather store static, per_elt. */
1581 64, /* size of l1 cache. */
1582 512, /* size of l2 cache. */
1583 64, /* size of prefetch block */
1584 6, /* number of parallel prefetches */
1585 3, /* Branch cost */
1586 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
1587 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1588 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1589 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1590 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1591 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
1593 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1594 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1595 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1596 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1597 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
1598 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
1599 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
1600 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
1601 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
1602 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
1603 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
1604 skylake_memcpy,
1605 skylake_memset,
1606 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1607 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1609 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1610 very small blocks it is better to use a loop. For large blocks, a libcall can
1611 do non-temporal accesses and beat inline code considerably.
   The table has two entries; each pairs a default algorithm (libcall) with
   a list of {limit, algorithm, flag} steps whose last step has limit -1.
   NOTE(review): the limits are presumably maximum block sizes in bytes and
   the two entries presumably the 32-bit and 64-bit variants (only the
   second uses rep_prefix_8_byte) -- confirm against the stringop_algs
   declaration. */
1612 static stringop_algs btver1_memcpy[2] = {
1613 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1614 {-1, rep_prefix_4_byte, false}}},
1615 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1616 {-1, libcall, false}}}};
/* memset algorithm table that btver1_cost refers to.  Two entries, each a
   default algorithm (libcall) plus {limit, algorithm, flag} steps closed by
   a step with limit -1.
   NOTE(review): the limits are presumably maximum block sizes in bytes and
   the two entries presumably the 32-bit and 64-bit variants -- confirm
   against the stringop_algs declaration. */
1617 static stringop_algs btver1_memset[2] = {
1618 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1619 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1620 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1621 {-1, libcall, false}}}};
/* Cost table btver1_cost.  The initializers are positional: the trailing
   comment on each value names the field it sets, so the order must match
   struct processor_costs (declared outside this view). */
1622 const struct processor_costs btver1_cost = {
1623 COSTS_N_INSNS (1), /* cost of an add instruction */
1624 COSTS_N_INSNS (2), /* cost of a lea instruction */
1625 COSTS_N_INSNS (1), /* variable shift costs */
1626 COSTS_N_INSNS (1), /* constant shift costs */
1627 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1628 COSTS_N_INSNS (4), /* HI */
1629 COSTS_N_INSNS (3), /* SI */
1630 COSTS_N_INSNS (4), /* DI */
1631 COSTS_N_INSNS (5)}, /* other */
1632 0, /* cost of multiply per each bit set */
1633 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1634 COSTS_N_INSNS (35), /* HI */
1635 COSTS_N_INSNS (51), /* SI */
1636 COSTS_N_INSNS (83), /* DI */
1637 COSTS_N_INSNS (83)}, /* other */
1638 COSTS_N_INSNS (1), /* cost of movsx */
1639 COSTS_N_INSNS (1), /* cost of movzx */
1640 8, /* "large" insn */
1641 9, /* MOVE_RATIO */
1643 /* All move costs are relative to integer->integer move times 2 and thus
1644 they are latency*2. */
1645 8, /* cost for loading QImode using movzbl */
1646 {6, 8, 6}, /* cost of loading integer registers
1647 in QImode, HImode and SImode.
1648 Relative to reg-reg move (2). */
1649 {6, 8, 6}, /* cost of storing integer registers */
1650 4, /* cost of reg,reg fld/fst */
1651 {12, 12, 28}, /* cost of loading fp registers
1652 in SFmode, DFmode and XFmode */
1653 {12, 12, 38}, /* cost of storing fp registers
1654 in SFmode, DFmode and XFmode */
1655 4, /* cost of moving MMX register */
1656 {10, 10}, /* cost of loading MMX registers
1657 in SImode and DImode */
1658 {12, 12}, /* cost of storing MMX registers
1659 in SImode and DImode */
1660 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1661 {10, 10, 12, 24, 48}, /* cost of loading SSE registers
1662 in 32,64,128,256 and 512-bit */
1663 {10, 10, 12, 24, 48}, /* cost of unaligned loads. */
1664 {10, 10, 12, 24, 48}, /* cost of storing SSE registers
1665 in 32,64,128,256 and 512-bit */
1666 {10, 10, 12, 24, 48}, /* cost of unaligned stores. */
1667 14, 14, /* SSE->integer and integer->SSE moves */
1668 10, 10, /* Gather load static, per_elt. */
1669 10, 10, /* Gather store static, per_elt. */
1670 32, /* size of l1 cache. */
1671 512, /* size of l2 cache. */
1672 64, /* size of prefetch block */
1673 100, /* number of parallel prefetches */
1674 2, /* Branch cost */
1675 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1676 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1677 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1678 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1679 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1680 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1682 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1683 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1684 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1685 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1686 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1687 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1688 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1689 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
1690 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
1691 COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */
1692 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1693 btver1_memcpy,
1694 btver1_memset,
1695 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1696 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* memcpy algorithm table that btver2_cost refers to.  Two entries, each a
   default algorithm (libcall) plus {limit, algorithm, flag} steps closed by
   a step with limit -1.
   NOTE(review): the limits are presumably maximum block sizes in bytes and
   the two entries presumably the 32-bit and 64-bit variants -- confirm
   against the stringop_algs declaration. */
1699 static stringop_algs btver2_memcpy[2] = {
1700 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1701 {-1, rep_prefix_4_byte, false}}},
1702 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1703 {-1, libcall, false}}}};
/* memset algorithm table that btver2_cost refers to.  Two entries, each a
   default algorithm (libcall) plus {limit, algorithm, flag} steps closed by
   a step with limit -1.
   NOTE(review): the limits are presumably maximum block sizes in bytes and
   the two entries presumably the 32-bit and 64-bit variants -- confirm
   against the stringop_algs declaration. */
1704 static stringop_algs btver2_memset[2] = {
1705 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1706 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1707 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1708 {-1, libcall, false}}}};
/* Cost table btver2_cost.  The initializers are positional: the trailing
   comment on each value names the field it sets, so the order must match
   struct processor_costs (declared outside this view). */
1709 const struct processor_costs btver2_cost = {
1710 COSTS_N_INSNS (1), /* cost of an add instruction */
1711 COSTS_N_INSNS (2), /* cost of a lea instruction */
1712 COSTS_N_INSNS (1), /* variable shift costs */
1713 COSTS_N_INSNS (1), /* constant shift costs */
1714 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1715 COSTS_N_INSNS (4), /* HI */
1716 COSTS_N_INSNS (3), /* SI */
1717 COSTS_N_INSNS (4), /* DI */
1718 COSTS_N_INSNS (5)}, /* other */
1719 0, /* cost of multiply per each bit set */
1720 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1721 COSTS_N_INSNS (35), /* HI */
1722 COSTS_N_INSNS (51), /* SI */
1723 COSTS_N_INSNS (83), /* DI */
1724 COSTS_N_INSNS (83)}, /* other */
1725 COSTS_N_INSNS (1), /* cost of movsx */
1726 COSTS_N_INSNS (1), /* cost of movzx */
1727 8, /* "large" insn */
1728 9, /* MOVE_RATIO */
1730 /* All move costs are relative to integer->integer move times 2 and thus
1731 they are latency*2. */
1732 8, /* cost for loading QImode using movzbl */
1733 {8, 8, 6}, /* cost of loading integer registers
1734 in QImode, HImode and SImode.
1735 Relative to reg-reg move (2). */
1736 {8, 8, 6}, /* cost of storing integer registers */
1737 4, /* cost of reg,reg fld/fst */
1738 {12, 12, 28}, /* cost of loading fp registers
1739 in SFmode, DFmode and XFmode */
1740 {12, 12, 38}, /* cost of storing fp registers
1741 in SFmode, DFmode and XFmode */
1742 4, /* cost of moving MMX register */
1743 {10, 10}, /* cost of loading MMX registers
1744 in SImode and DImode */
1745 {12, 12}, /* cost of storing MMX registers
1746 in SImode and DImode */
1747 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1748 {10, 10, 12, 24, 48}, /* cost of loading SSE registers
1749 in 32,64,128,256 and 512-bit */
1750 {10, 10, 12, 24, 48}, /* cost of unaligned loads. */
1751 {10, 10, 12, 24, 48}, /* cost of storing SSE registers
1752 in 32,64,128,256 and 512-bit */
1753 {10, 10, 12, 24, 48}, /* cost of unaligned stores. */
1754 14, 14, /* SSE->integer and integer->SSE moves */
1755 10, 10, /* Gather load static, per_elt. */
1756 10, 10, /* Gather store static, per_elt. */
1757 32, /* size of l1 cache. */
1758 2048, /* size of l2 cache. */
1759 64, /* size of prefetch block */
1760 100, /* number of parallel prefetches */
1761 2, /* Branch cost */
1762 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1763 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1764 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1765 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1766 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1767 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1769 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1770 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1771 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1772 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1773 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1774 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1775 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1776 COSTS_N_INSNS (19), /* cost of DIVSD instruction. */
1777 COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */
1778 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
1779 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1780 btver2_memcpy,
1781 btver2_memset,
1782 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1783 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* memcpy algorithm table that pentium4_cost refers to.  The first entry is
   a default algorithm (libcall) plus {limit, algorithm, flag} steps closed
   by a step with limit -1; the second entry is the DUMMY_STRINGOP_ALGS
   placeholder (libcall only).
   NOTE(review): the limits are presumably maximum block sizes in bytes and
   the second entry presumably the 64-bit variant -- confirm against the
   stringop_algs declaration. */
1786 static stringop_algs pentium4_memcpy[2] = {
1787 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1788 DUMMY_STRINGOP_ALGS};
/* memset algorithm table that pentium4_cost refers to.  The first entry is
   a default algorithm (libcall) plus {limit, algorithm, flag} steps closed
   by a step with limit -1; the second entry is the DUMMY_STRINGOP_ALGS
   placeholder (libcall only).
   NOTE(review): the limits are presumably maximum block sizes in bytes and
   the second entry presumably the 64-bit variant -- confirm against the
   stringop_algs declaration. */
1789 static stringop_algs pentium4_memset[2] = {
1790 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1791 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1792 DUMMY_STRINGOP_ALGS};
/* Cost table pentium4_cost.  The initializers are positional: the trailing
   comment on each value names the field it sets, so the order must match
   struct processor_costs (declared outside this view). */
1794 static const
1795 struct processor_costs pentium4_cost = {
1796 COSTS_N_INSNS (1), /* cost of an add instruction */
1797 COSTS_N_INSNS (3), /* cost of a lea instruction */
1798 COSTS_N_INSNS (4), /* variable shift costs */
1799 COSTS_N_INSNS (4), /* constant shift costs */
1800 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1801 COSTS_N_INSNS (15), /* HI */
1802 COSTS_N_INSNS (15), /* SI */
1803 COSTS_N_INSNS (15), /* DI */
1804 COSTS_N_INSNS (15)}, /* other */
1805 0, /* cost of multiply per each bit set */
1806 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1807 COSTS_N_INSNS (56), /* HI */
1808 COSTS_N_INSNS (56), /* SI */
1809 COSTS_N_INSNS (56), /* DI */
1810 COSTS_N_INSNS (56)}, /* other */
1811 COSTS_N_INSNS (1), /* cost of movsx */
1812 COSTS_N_INSNS (1), /* cost of movzx */
1813 16, /* "large" insn */
1814 6, /* MOVE_RATIO */
1816 /* All move costs are relative to integer->integer move times 2 and thus
1817 they are latency*2. */
1818 5, /* cost for loading QImode using movzbl */
1819 {4, 5, 4}, /* cost of loading integer registers
1820 in QImode, HImode and SImode.
1821 Relative to reg-reg move (2). */
1822 {2, 3, 2}, /* cost of storing integer registers */
1823 12, /* cost of reg,reg fld/fst */
1824 {14, 14, 14}, /* cost of loading fp registers
1825 in SFmode, DFmode and XFmode */
1826 {14, 14, 14}, /* cost of storing fp registers
1827 in SFmode, DFmode and XFmode */
1828 12, /* cost of moving MMX register */
1829 {16, 16}, /* cost of loading MMX registers
1830 in SImode and DImode */
1831 {16, 16}, /* cost of storing MMX registers
1832 in SImode and DImode */
1833 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
1834 {16, 16, 16, 32, 64}, /* cost of loading SSE registers
1835 in 32,64,128,256 and 512-bit */
1836 {32, 32, 32, 64, 128}, /* cost of unaligned loads. */
1837 {16, 16, 16, 32, 64}, /* cost of storing SSE registers
1838 in 32,64,128,256 and 512-bit */
1839 {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
1840 20, 12, /* SSE->integer and integer->SSE moves */
1841 16, 16, /* Gather load static, per_elt. */
1842 16, 16, /* Gather store static, per_elt. */
1843 8, /* size of l1 cache. */
1844 256, /* size of l2 cache. */
1845 64, /* size of prefetch block */
1846 6, /* number of parallel prefetches */
1847 2, /* Branch cost */
1848 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1849 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1850 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1851 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1852 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1853 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1855 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1856 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1857 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1858 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1859 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1860 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1861 COSTS_N_INSNS (23), /* cost of DIVSS instruction. */
1862 COSTS_N_INSNS (38), /* cost of DIVSD instruction. */
1863 COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */
1864 COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */
1865 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1866 pentium4_memcpy,
1867 pentium4_memset,
1868 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1869 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* memcpy algorithm table that nocona_cost refers to.  Two entries, each a
   default algorithm (libcall) plus {limit, algorithm, flag} steps closed by
   a step with limit -1.
   NOTE(review): the limits are presumably maximum block sizes in bytes and
   the two entries presumably the 32-bit and 64-bit variants -- confirm
   against the stringop_algs declaration. */
1872 static stringop_algs nocona_memcpy[2] = {
1873 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1874 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1875 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
/* memset algorithm table that nocona_cost refers to.  Two entries, each a
   default algorithm (libcall) plus {limit, algorithm, flag} steps closed by
   a step with limit -1.
   NOTE(review): the limits are presumably maximum block sizes in bytes and
   the two entries presumably the 32-bit and 64-bit variants -- confirm
   against the stringop_algs declaration. */
1877 static stringop_algs nocona_memset[2] = {
1878 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1879 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1880 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1881 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table nocona_cost.  The initializers are positional: the trailing
   comment on each value names the field it sets, so the order must match
   struct processor_costs (declared outside this view). */
1883 static const
1884 struct processor_costs nocona_cost = {
1885 COSTS_N_INSNS (1), /* cost of an add instruction */
1886 COSTS_N_INSNS (1), /* cost of a lea instruction */
1887 COSTS_N_INSNS (1), /* variable shift costs */
1888 COSTS_N_INSNS (1), /* constant shift costs */
1889 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1890 COSTS_N_INSNS (10), /* HI */
1891 COSTS_N_INSNS (10), /* SI */
1892 COSTS_N_INSNS (10), /* DI */
1893 COSTS_N_INSNS (10)}, /* other */
1894 0, /* cost of multiply per each bit set */
1895 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1896 COSTS_N_INSNS (66), /* HI */
1897 COSTS_N_INSNS (66), /* SI */
1898 COSTS_N_INSNS (66), /* DI */
1899 COSTS_N_INSNS (66)}, /* other */
1900 COSTS_N_INSNS (1), /* cost of movsx */
1901 COSTS_N_INSNS (1), /* cost of movzx */
1902 16, /* "large" insn */
1903 17, /* MOVE_RATIO */
1905 /* All move costs are relative to integer->integer move times 2 and thus
1906 they are latency*2. */
1907 4, /* cost for loading QImode using movzbl */
1908 {4, 4, 4}, /* cost of loading integer registers
1909 in QImode, HImode and SImode.
1910 Relative to reg-reg move (2). */
1911 {4, 4, 4}, /* cost of storing integer registers */
1912 12, /* cost of reg,reg fld/fst */
1913 {14, 14, 14}, /* cost of loading fp registers
1914 in SFmode, DFmode and XFmode */
1915 {14, 14, 14}, /* cost of storing fp registers
1916 in SFmode, DFmode and XFmode */
1917 14, /* cost of moving MMX register */
1918 {12, 12}, /* cost of loading MMX registers
1919 in SImode and DImode */
1920 {12, 12}, /* cost of storing MMX registers
1921 in SImode and DImode */
1922 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
1923 {12, 12, 12, 24, 48}, /* cost of loading SSE registers
1924 in 32,64,128,256 and 512-bit */
1925 {24, 24, 24, 48, 96}, /* cost of unaligned loads. */
1926 {12, 12, 12, 24, 48}, /* cost of storing SSE registers
1927 in 32,64,128,256 and 512-bit */
1928 {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
1929 20, 12, /* SSE->integer and integer->SSE moves */
1930 12, 12, /* Gather load static, per_elt. */
1931 12, 12, /* Gather store static, per_elt. */
1932 8, /* size of l1 cache. */
1933 1024, /* size of l2 cache. */
1934 64, /* size of prefetch block */
1935 8, /* number of parallel prefetches */
1936 1, /* Branch cost */
1937 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1938 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1939 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1940 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1941 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1942 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1944 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1945 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
1946 COSTS_N_INSNS (7), /* cost of MULSS instruction. */
1947 COSTS_N_INSNS (7), /* cost of MULSD instruction. */
1948 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
1949 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
1950 COSTS_N_INSNS (32), /* cost of DIVSS instruction. */
1951 COSTS_N_INSNS (40), /* cost of DIVSD instruction. */
1952 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
1953 COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */
1954 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1955 nocona_memcpy,
1956 nocona_memset,
1957 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1958 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* memcpy algorithm table that atom_cost refers to.  Two entries, each a
   default algorithm (libcall) plus {limit, algorithm, flag} steps closed by
   a step with limit -1.
   NOTE(review): the limits are presumably maximum block sizes in bytes and
   the two entries presumably the 32-bit and 64-bit variants -- confirm
   against the stringop_algs declaration. */
1961 static stringop_algs atom_memcpy[2] = {
1962 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1963 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1964 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset algorithm table that atom_cost refers to.  Two entries, each a
   default algorithm (libcall) plus {limit, algorithm, flag} steps closed by
   a step with limit -1.
   NOTE(review): the limits are presumably maximum block sizes in bytes and
   the two entries presumably the 32-bit and 64-bit variants -- confirm
   against the stringop_algs declaration. */
1965 static stringop_algs atom_memset[2] = {
1966 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1967 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1968 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1969 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table atom_cost.  The initializers are positional: the trailing
   comment on each value names the field it sets, so the order must match
   struct processor_costs (declared outside this view). */
1970 static const
1971 struct processor_costs atom_cost = {
1972 COSTS_N_INSNS (1), /* cost of an add instruction */
1973 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1974 COSTS_N_INSNS (1), /* variable shift costs */
1975 COSTS_N_INSNS (1), /* constant shift costs */
1976 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1977 COSTS_N_INSNS (4), /* HI */
1978 COSTS_N_INSNS (3), /* SI */
1979 COSTS_N_INSNS (4), /* DI */
1980 COSTS_N_INSNS (2)}, /* other */
1981 0, /* cost of multiply per each bit set */
1982 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1983 COSTS_N_INSNS (26), /* HI */
1984 COSTS_N_INSNS (42), /* SI */
1985 COSTS_N_INSNS (74), /* DI */
1986 COSTS_N_INSNS (74)}, /* other */
1987 COSTS_N_INSNS (1), /* cost of movsx */
1988 COSTS_N_INSNS (1), /* cost of movzx */
1989 8, /* "large" insn */
1990 17, /* MOVE_RATIO */
1992 /* All move costs are relative to integer->integer move times 2 and thus
1993 they are latency*2. */
1994 6, /* cost for loading QImode using movzbl */
1995 {6, 6, 6}, /* cost of loading integer registers
1996 in QImode, HImode and SImode.
1997 Relative to reg-reg move (2). */
1998 {6, 6, 6}, /* cost of storing integer registers */
1999 4, /* cost of reg,reg fld/fst */
2000 {6, 6, 18}, /* cost of loading fp registers
2001 in SFmode, DFmode and XFmode */
2002 {14, 14, 24}, /* cost of storing fp registers
2003 in SFmode, DFmode and XFmode */
2004 2, /* cost of moving MMX register */
2005 {8, 8}, /* cost of loading MMX registers
2006 in SImode and DImode */
2007 {10, 10}, /* cost of storing MMX registers
2008 in SImode and DImode */
2009 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2010 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
2011 in 32,64,128,256 and 512-bit */
2012 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
2013 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
2014 in 32,64,128,256 and 512-bit */
2015 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
2016 8, 6, /* SSE->integer and integer->SSE moves */
2017 8, 8, /* Gather load static, per_elt. */
2018 8, 8, /* Gather store static, per_elt. */
2019 32, /* size of l1 cache. */
2020 256, /* size of l2 cache. */
2021 64, /* size of prefetch block */
2022 6, /* number of parallel prefetches */
2023 3, /* Branch cost */
2024 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2025 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2026 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2027 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2028 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2029 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2031 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2032 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
2033 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2034 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2035 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2036 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2037 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
2038 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
2039 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
2040 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
2041 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2042 atom_memcpy,
2043 atom_memset,
2044 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2045 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* memcpy algorithm table that slm_cost refers to.  Two entries, each a
   default algorithm (libcall) plus {limit, algorithm, flag} steps closed by
   a step with limit -1.
   NOTE(review): the limits are presumably maximum block sizes in bytes and
   the two entries presumably the 32-bit and 64-bit variants -- confirm
   against the stringop_algs declaration. */
2048 static stringop_algs slm_memcpy[2] = {
2049 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2050 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2051 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset algorithm table that slm_cost refers to.  Two entries, each a
   default algorithm (libcall) plus {limit, algorithm, flag} steps closed by
   a step with limit -1.
   NOTE(review): the limits are presumably maximum block sizes in bytes and
   the two entries presumably the 32-bit and 64-bit variants -- confirm
   against the stringop_algs declaration. */
2052 static stringop_algs slm_memset[2] = {
2053 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2054 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2055 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2056 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table slm_cost.  The initializers are positional: the trailing
   comment on each value names the field it sets, so the order must match
   struct processor_costs (declared outside this view). */
2057 static const
2058 struct processor_costs slm_cost = {
2059 COSTS_N_INSNS (1), /* cost of an add instruction */
2060 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2061 COSTS_N_INSNS (1), /* variable shift costs */
2062 COSTS_N_INSNS (1), /* constant shift costs */
2063 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2064 COSTS_N_INSNS (3), /* HI */
2065 COSTS_N_INSNS (3), /* SI */
2066 COSTS_N_INSNS (4), /* DI */
2067 COSTS_N_INSNS (2)}, /* other */
2068 0, /* cost of multiply per each bit set */
2069 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2070 COSTS_N_INSNS (26), /* HI */
2071 COSTS_N_INSNS (42), /* SI */
2072 COSTS_N_INSNS (74), /* DI */
2073 COSTS_N_INSNS (74)}, /* other */
2074 COSTS_N_INSNS (1), /* cost of movsx */
2075 COSTS_N_INSNS (1), /* cost of movzx */
2076 8, /* "large" insn */
2077 17, /* MOVE_RATIO */
2079 /* All move costs are relative to integer->integer move times 2 and thus
2080 they are latency*2. */
2081 8, /* cost for loading QImode using movzbl */
2082 {8, 8, 8}, /* cost of loading integer registers
2083 in QImode, HImode and SImode.
2084 Relative to reg-reg move (2). */
2085 {6, 6, 6}, /* cost of storing integer registers */
2086 2, /* cost of reg,reg fld/fst */
2087 {8, 8, 18}, /* cost of loading fp registers
2088 in SFmode, DFmode and XFmode */
2089 {6, 6, 18}, /* cost of storing fp registers
2090 in SFmode, DFmode and XFmode */
2091 2, /* cost of moving MMX register */
2092 {8, 8}, /* cost of loading MMX registers
2093 in SImode and DImode */
2094 {6, 6}, /* cost of storing MMX registers
2095 in SImode and DImode */
2096 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2097 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
2098 in 32,64,128,256 and 512-bit */
2099 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
2100 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
2101 in 32,64,128,256 and 512-bit */
2102 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
2103 8, 6, /* SSE->integer and integer->SSE moves */
2104 8, 8, /* Gather load static, per_elt. */
2105 8, 8, /* Gather store static, per_elt. */
2106 32, /* size of l1 cache. */
2107 256, /* size of l2 cache. */
2108 64, /* size of prefetch block */
2109 6, /* number of parallel prefetches */
2110 3, /* Branch cost */
2111 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2112 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2113 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2114 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2115 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2116 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2118 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2119 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2120 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2121 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2122 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2123 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2124 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
2125 COSTS_N_INSNS (69), /* cost of DIVSD instruction. */
2126 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
2127 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
2128 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2129 slm_memcpy,
2130 slm_memset,
2131 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2132 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* memcpy algorithm table, presumably for intel_cost (its reference is not
   visible in this part of the file).  Two entries, each a default algorithm
   (libcall) plus {limit, algorithm, flag} steps closed by a step with
   limit -1.
   NOTE(review): the limits are presumably maximum block sizes in bytes and
   the two entries presumably the 32-bit and 64-bit variants -- confirm
   against the stringop_algs declaration. */
2135 static stringop_algs intel_memcpy[2] = {
2136 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2137 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2138 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset algorithm table, presumably for intel_cost (its reference is not
   visible in this part of the file).  Two entries, each a default algorithm
   (libcall) plus {limit, algorithm, flag} steps closed by a step with
   limit -1.
   NOTE(review): the limits are presumably maximum block sizes in bytes and
   the two entries presumably the 32-bit and 64-bit variants -- confirm
   against the stringop_algs declaration. */
2139 static stringop_algs intel_memset[2] = {
2140 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2141 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2142 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2143 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table intel_cost.  This is a positional initializer of
   struct processor_costs, so the order of the entries must follow the
   member order of that struct (declared elsewhere); the trailing comment on
   each entry names the member it fills.  Instruction costs are written with
   COSTS_N_INSNS; the move costs further down are plain numbers on the
   "latency*2" scale described in the comment that precedes them.  */
2144 static const
2145 struct processor_costs intel_cost = {
2146 COSTS_N_INSNS (1), /* cost of an add instruction */
2147 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2148 COSTS_N_INSNS (1), /* variable shift costs */
2149 COSTS_N_INSNS (1), /* constant shift costs */
2150 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2151 COSTS_N_INSNS (3), /* HI */
2152 COSTS_N_INSNS (3), /* SI */
2153 COSTS_N_INSNS (4), /* DI */
2154 COSTS_N_INSNS (2)}, /* other */
2155 0, /* cost of multiply per each bit set */
2156 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2157 COSTS_N_INSNS (26), /* HI */
2158 COSTS_N_INSNS (42), /* SI */
2159 COSTS_N_INSNS (74), /* DI */
2160 COSTS_N_INSNS (74)}, /* other */
2161 COSTS_N_INSNS (1), /* cost of movsx */
2162 COSTS_N_INSNS (1), /* cost of movzx */
2163 8, /* "large" insn */
2164 17, /* MOVE_RATIO */
2166 /* All move costs are relative to integer->integer move times 2 and thus
2167 they are latency*2. */
2168 6, /* cost for loading QImode using movzbl */
2169 {4, 4, 4}, /* cost of loading integer registers
2170 in QImode, HImode and SImode.
2171 Relative to reg-reg move (2). */
2172 {6, 6, 6}, /* cost of storing integer registers */
2173 2, /* cost of reg,reg fld/fst */
2174 {6, 6, 8}, /* cost of loading fp registers
2175 in SFmode, DFmode and XFmode */
2176 {6, 6, 10}, /* cost of storing fp registers
2177 in SFmode, DFmode and XFmode */
2178 2, /* cost of moving MMX register */
2179 {6, 6}, /* cost of loading MMX registers
2180 in SImode and DImode */
2181 {6, 6}, /* cost of storing MMX registers
2182 in SImode and DImode */
2183 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
2184 {6, 6, 6, 6, 6}, /* cost of loading SSE registers
2185 in 32,64,128,256 and 512-bit */
2186 {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
2187 {6, 6, 6, 6, 6}, /* cost of storing SSE registers
2188 in 32,64,128,256 and 512-bit */
2189 {10, 10, 10, 10, 10}, /* cost of unaligned stores. */
2190 4, 4, /* SSE->integer and integer->SSE moves */
2191 6, 6, /* Gather load static, per_elt. */
2192 6, 6, /* Gather store static, per_elt. */
2193 32, /* size of l1 cache. */
2194 256, /* size of l2 cache. */
2195 64, /* size of prefetch block */
2196 6, /* number of parallel prefetches */
2197 3, /* Branch cost */
2198 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2199 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2200 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2201 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2202 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2203 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
/* NOTE(review): the other cost tables in this part of the file use
   COSTS_N_INSNS (1) for the cheap SSE instruction; confirm that 8 is
   intended here.  */
2205 COSTS_N_INSNS (8), /* cost of cheap SSE instruction. */
2206 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
2207 COSTS_N_INSNS (8), /* cost of MULSS instruction. */
2208 COSTS_N_INSNS (8), /* cost of MULSD instruction. */
2209 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2210 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2211 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */
2212 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
2213 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */
2214 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */
2215 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2216 intel_memcpy,
2217 intel_memset,
2218 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2219 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2222 /* Generic should produce code tuned for Core-i7 (and newer chips)
2223 and btver1 (and newer chips). */
/* memcpy expansion table that generic_cost refers to.  Both entries name
   libcall first, then use loop up to a bound of 32, a rep-prefix algorithm
   (4-byte in entry 0, 8-byte in entry 1) up to 8192, and libcall for the
   terminating bound of -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for 64-bit
   code, and the bounds are block sizes in bytes; confirm against the
   declaration of stringop_algs.  */
2225 static stringop_algs generic_memcpy[2] = {
2226 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2227 {-1, libcall, false}}},
2228 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2229 {-1, libcall, false}}}};
/* memset expansion table that generic_cost refers to.  Its contents are the
   same as the generic memcpy table: loop up to a bound of 32, a rep-prefix
   algorithm (4-byte in entry 0, 8-byte in entry 1) up to 8192, and libcall
   for the terminating bound of -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for 64-bit
   code; confirm against the declaration of stringop_algs.  */
2230 static stringop_algs generic_memset[2] = {
2231 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2232 {-1, libcall, false}}},
2233 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2234 {-1, libcall, false}}}};
/* Cost table generic_cost.  This is a positional initializer of
   struct processor_costs, so the order of the entries must follow the
   member order of that struct (declared elsewhere); the trailing comment on
   each entry names the member it fills.  Instruction costs are written with
   COSTS_N_INSNS; the move costs further down are plain numbers on the
   "latency*2" scale described in the comment that precedes them.  */
2235 static const
2236 struct processor_costs generic_cost = {
2237 COSTS_N_INSNS (1), /* cost of an add instruction */
2238 /* Setting cost to 2 makes our current implementation of synth_mult result in
2239 use of unnecessary temporary registers causing regression on several
2240 SPECfp benchmarks. */
2241 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2242 COSTS_N_INSNS (1), /* variable shift costs */
2243 COSTS_N_INSNS (1), /* constant shift costs */
2244 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2245 COSTS_N_INSNS (4), /* HI */
2246 COSTS_N_INSNS (3), /* SI */
2247 COSTS_N_INSNS (4), /* DI */
2248 COSTS_N_INSNS (4)}, /* other */
2249 0, /* cost of multiply per each bit set */
2250 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
2251 COSTS_N_INSNS (22), /* HI */
2252 COSTS_N_INSNS (30), /* SI */
2253 COSTS_N_INSNS (74), /* DI */
2254 COSTS_N_INSNS (74)}, /* other */
2255 COSTS_N_INSNS (1), /* cost of movsx */
2256 COSTS_N_INSNS (1), /* cost of movzx */
2257 8, /* "large" insn */
2258 17, /* MOVE_RATIO */
2260 /* All move costs are relative to integer->integer move times 2 and thus
2261 they are latency*2. */
2262 6, /* cost for loading QImode using movzbl */
2263 {6, 6, 6}, /* cost of loading integer registers
2264 in QImode, HImode and SImode.
2265 Relative to reg-reg move (2). */
2266 {6, 6, 6}, /* cost of storing integer registers */
2267 4, /* cost of reg,reg fld/fst */
2268 {6, 6, 12}, /* cost of loading fp registers
2269 in SFmode, DFmode and XFmode */
2270 {6, 6, 12}, /* cost of storing fp registers
2271 in SFmode, DFmode and XFmode */
2272 2, /* cost of moving MMX register */
2273 {6, 6}, /* cost of loading MMX registers
2274 in SImode and DImode */
2275 {6, 6}, /* cost of storing MMX registers
2276 in SImode and DImode */
2277 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
2278 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
2279 in 32,64,128,256 and 512-bit */
2280 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
2281 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
2282 in 32,64,128,256 and 512-bit */
2283 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
2284 6, 6, /* SSE->integer and integer->SSE moves */
2285 18, 6, /* Gather load static, per_elt. */
2286 18, 6, /* Gather store static, per_elt. */
2287 32, /* size of l1 cache. */
2288 512, /* size of l2 cache. */
2289 64, /* size of prefetch block */
2290 6, /* number of parallel prefetches */
2291 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
2292 value is increased to the perhaps more appropriate value of 5. */
2293 3, /* Branch cost */
2294 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2295 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
2296 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
2297 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2298 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2299 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
2301 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2302 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2303 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2304 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2305 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2306 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2307 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2308 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
2309 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
2310 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
2311 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
2312 generic_memcpy,
2313 generic_memset,
2314 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
2315 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
2318 /* core_cost should produce code tuned for the Core family of CPUs. */
/* memcpy expansion table that core_cost refers to.  It holds two entries;
   each names libcall first and then lists {bound, algorithm, flag} triples
   terminated by a bound of -1.  Unlike the intel and generic tables, the
   flag is true on every step except the terminating libcall one.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for 64-bit
   code, and the flag concerns alignment handling; confirm against the
   declaration of stringop_algs.  */
2319 static stringop_algs core_memcpy[2] = {
2320 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2321 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2322 {-1, libcall, false}}}};
/* memset expansion table that core_cost refers to.  It holds two entries;
   each names libcall first and then lists {bound, algorithm, flag} triples
   terminated by a bound of -1.  Entry 0 starts with loop_1_byte for the
   smallest bound (6); the flag is true on every step except the terminating
   libcall one.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for 64-bit
   code, and the flag concerns alignment handling; confirm against the
   declaration of stringop_algs.  */
2323 static stringop_algs core_memset[2] = {
2324 {libcall, {{6, loop_1_byte, true},
2325 {24, loop, true},
2326 {8192, rep_prefix_4_byte, true},
2327 {-1, libcall, false}}},
2328 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2329 {-1, libcall, false}}}};
/* Cost table core_cost.  This is a positional initializer of
   struct processor_costs, so the order of the entries must follow the
   member order of that struct (declared elsewhere); the trailing comment on
   each entry names the member it fills.  Instruction costs are written with
   COSTS_N_INSNS; the move costs further down are plain numbers on the
   "latency*2" scale described in the comment that precedes them.  */
2331 static const
2332 struct processor_costs core_cost = {
2333 COSTS_N_INSNS (1), /* cost of an add instruction */
2334 /* On all chips taken into consideration lea is 2 cycles and more. With
2335 this cost however our current implementation of synth_mult results in
2336 use of unnecessary temporary registers causing regression on several
2337 SPECfp benchmarks. */
2338 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2339 COSTS_N_INSNS (1), /* variable shift costs */
2340 COSTS_N_INSNS (1), /* constant shift costs */
2341 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2342 COSTS_N_INSNS (4), /* HI */
2343 COSTS_N_INSNS (3), /* SI */
2344 /* Here we tune for Sandybridge or newer. */
2345 COSTS_N_INSNS (3), /* DI */
2346 COSTS_N_INSNS (3)}, /* other */
2347 0, /* cost of multiply per each bit set */
2348 /* Expanding div/mod currently doesn't consider parallelism. So the cost
2349 model is not realistic. We compensate by increasing the latencies a bit. */
2350 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
2351 COSTS_N_INSNS (11), /* HI */
2352 COSTS_N_INSNS (14), /* SI */
2353 COSTS_N_INSNS (81), /* DI */
2354 COSTS_N_INSNS (81)}, /* other */
2355 COSTS_N_INSNS (1), /* cost of movsx */
2356 COSTS_N_INSNS (1), /* cost of movzx */
2357 8, /* "large" insn */
2358 17, /* MOVE_RATIO */
2360 /* All move costs are relative to integer->integer move times 2 and thus
2361 they are latency*2. */
2362 6, /* cost for loading QImode using movzbl */
2363 {4, 4, 4}, /* cost of loading integer registers
2364 in QImode, HImode and SImode.
2365 Relative to reg-reg move (2). */
2366 {6, 6, 6}, /* cost of storing integer registers */
2367 2, /* cost of reg,reg fld/fst */
2368 {6, 6, 8}, /* cost of loading fp registers
2369 in SFmode, DFmode and XFmode */
2370 {6, 6, 10}, /* cost of storing fp registers
2371 in SFmode, DFmode and XFmode */
2372 2, /* cost of moving MMX register */
2373 {6, 6}, /* cost of loading MMX registers
2374 in SImode and DImode */
2375 {6, 6}, /* cost of storing MMX registers
2376 in SImode and DImode */
2377 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2378 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
2379 in 32,64,128,256 and 512-bit */
2380 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
2381 {6, 6, 6, 6, 12}, /* cost of storing SSE registers
2382 in 32,64,128,256 and 512-bit */
2383 {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
2384 2, 2, /* SSE->integer and integer->SSE moves */
2385 /* VGATHERDPD is 7 uops, rec. throughput 5, while VGATHERDPD is 9 uops,
2386 rec. throughput 6.
2387 So 5 uops statically and one uop per load.
NOTE(review): VGATHERDPD is named twice above with different figures;
presumably the two refer to different vector widths or gather variants.
Confirm which ones were meant.  */
2388 10, 6, /* Gather load static, per_elt. */
2389 10, 6, /* Gather store static, per_elt. */
2390 64, /* size of l1 cache. */
2391 512, /* size of l2 cache. */
2392 64, /* size of prefetch block */
2393 6, /* number of parallel prefetches */
2394 /* FIXME perhaps more appropriate value is 5. */
2395 3, /* Branch cost */
2396 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2397 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
2398 /* 10-24 */
2399 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
2400 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2401 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2402 COSTS_N_INSNS (23), /* cost of FSQRT instruction. */
2404 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2405 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2406 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2407 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2408 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2409 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2410 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
2411 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */
2412 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */
2413 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */
2414 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2415 core_memcpy,
2416 core_memset,
2417 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2418 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */