RISC-V: Improve length attributes for atomic insn sequences
[official-gcc.git] / gcc / config / i386 / x86-tune-costs.h
blob2bfaee554d53601f2b289d75d772752becbc9e3f
1 /* Costs of operations of individual x86 CPUs.
2 Copyright (C) 1988-2024 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 Under Section 7 of GPL version 3, you are granted additional
17 permissions described in the GCC Runtime Library Exception, version
18 3.1, as published by the Free Software Foundation.
20 You should have received a copy of the GNU General Public License and
21 a copy of the GCC Runtime Library Exception along with this program;
22 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 <http://www.gnu.org/licenses/>. */
24 /* Processor costs (relative to an add) */
25 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
/* Size-based cost of N bytes of code. With the assumption stated above
   (COSTS_N_INSNS (N) is (N)*4 and an add is 2 bytes), COSTS_N_BYTES (2)
   equals COSTS_N_INSNS (1). */
26 #define COSTS_N_BYTES(N) ((N) * 2)
/* Placeholder stringop_algs initializer in which every algorithm slot is
   libcall. The per-CPU tables in this file use it as their second array
   element.
   NOTE(review): the -1 bound presumably means "any size" -- confirm
   against the stringop_algs declaration. */
28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
/* memcpy algorithm table referenced by ix86_size_cost. Both array
   elements name rep_prefix_1_byte as the only algorithm.
   NOTE(review): the two elements are presumably the 32-bit and 64-bit
   variants -- confirm against the users of the table. */
30 static stringop_algs ix86_size_memcpy[2] = {
31 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
32 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
/* memset algorithm table referenced by ix86_size_cost. Both array
   elements name rep_prefix_1_byte as the only algorithm. */
33 static stringop_algs ix86_size_memset[2] = {
34 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
35 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
/* Cost table used when tuning for size. The initializer is positional:
   the trailing comment on each value names the field it fills, so the
   order of the entries must not be changed. Instruction costs in this
   table are expressed with COSTS_N_BYTES rather than COSTS_N_INSNS.
   Unlike the per-CPU tables in this file, this one is not static. */
37 const
38 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
40 /* Start of register allocator costs. integer->integer move cost is 2. */
41 2, /* cost for loading QImode using movzbl */
42 {2, 2, 2}, /* cost of loading integer registers
43 in QImode, HImode and SImode.
44 Relative to reg-reg move (2). */
45 {2, 2, 2}, /* cost of storing integer registers */
46 2, /* cost of reg,reg fld/fst */
47 {2, 2, 2}, /* cost of loading fp registers
48 in SFmode, DFmode and XFmode */
49 {2, 2, 2}, /* cost of storing fp registers
50 in SFmode, DFmode and XFmode */
51 3, /* cost of moving MMX register */
52 {3, 3}, /* cost of loading MMX registers
53 in SImode and DImode */
54 {3, 3}, /* cost of storing MMX registers
55 in SImode and DImode */
56 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
57 {3, 3, 3, 3, 3}, /* cost of loading SSE registers
58 in 32,64,128,256 and 512-bit */
59 {3, 3, 3, 3, 3}, /* cost of storing SSE registers
60 in 32,64,128,256 and 512-bit */
61 3, 3, /* SSE->integer and integer->SSE moves */
62 3, 3, /* mask->integer and integer->mask moves */
63 {2, 2, 2}, /* cost of loading mask register
64 in QImode, HImode, SImode. */
65 {2, 2, 2}, /* cost of storing mask register
66 in QImode, HImode, SImode. */
67 2, /* cost of moving mask register. */
68 /* End of register allocator costs. */
71 COSTS_N_BYTES (2), /* cost of an add instruction */
72 COSTS_N_BYTES (3), /* cost of a lea instruction */
73 COSTS_N_BYTES (2), /* variable shift costs */
74 COSTS_N_BYTES (3), /* constant shift costs */
75 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
76 COSTS_N_BYTES (3), /* HI */
77 COSTS_N_BYTES (3), /* SI */
78 COSTS_N_BYTES (3), /* DI */
79 COSTS_N_BYTES (5)}, /* other */
80 0, /* cost of multiply per each bit set */
81 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
82 COSTS_N_BYTES (3), /* HI */
83 COSTS_N_BYTES (3), /* SI */
84 COSTS_N_BYTES (3), /* DI */
85 COSTS_N_BYTES (5)}, /* other */
86 COSTS_N_BYTES (3), /* cost of movsx */
87 COSTS_N_BYTES (3), /* cost of movzx */
88 0, /* "large" insn */
89 2, /* MOVE_RATIO */
90 2, /* CLEAR_RATIO */
/* The integer and SSE load/store costs are listed a second time below,
   outside the register allocator section. */
91 {2, 2, 2}, /* cost of loading integer registers
92 in QImode, HImode and SImode.
93 Relative to reg-reg move (2). */
94 {2, 2, 2}, /* cost of storing integer registers */
95 {3, 3, 3, 3, 3}, /* cost of loading SSE register
96 in 32bit, 64bit, 128bit, 256bit and 512bit */
97 {3, 3, 3, 3, 3}, /* cost of storing SSE register
98 in 32bit, 64bit, 128bit, 256bit and 512bit */
99 {3, 3, 3, 3, 3}, /* cost of unaligned SSE load
100 in 128bit, 256bit and 512bit.
   NOTE(review): five values are given but only three sizes are named;
   confirm the entry order against struct processor_costs. */
101 {3, 3, 3, 3, 3}, /* cost of unaligned SSE store
102 in 128bit, 256bit and 512bit.
   NOTE(review): five values, three sizes named -- confirm as for the
   unaligned load entry. */
103 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
104 3, /* cost of moving SSE register to integer. */
105 5, 0, /* Gather load static, per_elt. */
106 5, 0, /* Gather store static, per_elt. */
107 0, /* size of l1 cache */
108 0, /* size of l2 cache */
109 0, /* size of prefetch block */
110 0, /* number of parallel prefetches */
111 2, /* Branch cost */
112 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
113 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
114 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
115 COSTS_N_BYTES (2), /* cost of FABS instruction. */
116 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
117 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
119 COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */
120 COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */
121 COSTS_N_BYTES (2), /* cost of MULSS instruction. */
122 COSTS_N_BYTES (2), /* cost of MULSD instruction. */
123 COSTS_N_BYTES (2), /* cost of FMA SS instruction. */
124 COSTS_N_BYTES (2), /* cost of FMA SD instruction. */
125 COSTS_N_BYTES (2), /* cost of DIVSS instruction. */
126 COSTS_N_BYTES (2), /* cost of DIVSD instruction. */
127 COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */
128 COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */
129 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
130 ix86_size_memcpy,
131 ix86_size_memset,
132 COSTS_N_BYTES (1), /* cond_taken_branch_cost. */
133 COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */
/* No alignment request is given for loops, jumps, labels or functions. */
134 NULL, /* Loop alignment. */
135 NULL, /* Jump alignment. */
136 NULL, /* Label alignment. */
137 NULL, /* Func alignment. */
138 4, /* Small unroll limit. */
139 2, /* Small unroll factor. */
142 /* Processor costs (relative to an add) */
/* memcpy algorithm table referenced by i386_cost. The first element
   names rep_prefix_1_byte only; the second is the DUMMY_STRINGOP_ALGS
   placeholder. */
143 static stringop_algs i386_memcpy[2] = {
144 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
145 DUMMY_STRINGOP_ALGS};
/* memset algorithm table referenced by i386_cost. The first element
   names rep_prefix_1_byte only; the second is the DUMMY_STRINGOP_ALGS
   placeholder. */
146 static stringop_algs i386_memset[2] = {
147 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
148 DUMMY_STRINGOP_ALGS};
/* Cost table for the 386. The initializer is positional: the trailing
   comment on each value names the field it fills, so the order of the
   entries must not be changed. Instruction costs are expressed with
   COSTS_N_INSNS. */
150 static const
151 struct processor_costs i386_cost = { /* 386 specific costs */
153 /* Start of register allocator costs. integer->integer move cost is 2. */
154 4, /* cost for loading QImode using movzbl */
155 {2, 4, 2}, /* cost of loading integer registers
156 in QImode, HImode and SImode.
157 Relative to reg-reg move (2). */
158 {2, 4, 2}, /* cost of storing integer registers */
159 2, /* cost of reg,reg fld/fst */
160 {8, 8, 8}, /* cost of loading fp registers
161 in SFmode, DFmode and XFmode */
162 {8, 8, 8}, /* cost of storing fp registers
163 in SFmode, DFmode and XFmode */
164 2, /* cost of moving MMX register */
165 {4, 8}, /* cost of loading MMX registers
166 in SImode and DImode */
167 {4, 8}, /* cost of storing MMX registers
168 in SImode and DImode */
169 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
170 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
171 in 32,64,128,256 and 512-bit */
172 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
173 in 32,64,128,256 and 512-bit */
174 3, 3, /* SSE->integer and integer->SSE moves */
175 3, 3, /* mask->integer and integer->mask moves */
176 {2, 4, 2}, /* cost of loading mask register
177 in QImode, HImode, SImode. */
178 {2, 4, 2}, /* cost of storing mask register
179 in QImode, HImode, SImode. */
180 2, /* cost of moving mask register. */
181 /* End of register allocator costs. */
184 COSTS_N_INSNS (1), /* cost of an add instruction */
185 COSTS_N_INSNS (1), /* cost of a lea instruction */
186 COSTS_N_INSNS (3), /* variable shift costs */
187 COSTS_N_INSNS (2), /* constant shift costs */
188 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
189 COSTS_N_INSNS (6), /* HI */
190 COSTS_N_INSNS (6), /* SI */
191 COSTS_N_INSNS (6), /* DI */
192 COSTS_N_INSNS (6)}, /* other */
193 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
194 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
195 COSTS_N_INSNS (23), /* HI */
196 COSTS_N_INSNS (23), /* SI */
197 COSTS_N_INSNS (23), /* DI */
198 COSTS_N_INSNS (23)}, /* other */
199 COSTS_N_INSNS (3), /* cost of movsx */
200 COSTS_N_INSNS (2), /* cost of movzx */
201 15, /* "large" insn */
202 3, /* MOVE_RATIO */
203 3, /* CLEAR_RATIO */
/* The integer and SSE load/store costs are listed a second time below,
   outside the register allocator section. */
204 {2, 4, 2}, /* cost of loading integer registers
205 in QImode, HImode and SImode.
206 Relative to reg-reg move (2). */
207 {2, 4, 2}, /* cost of storing integer registers */
208 {4, 8, 16, 32, 64}, /* cost of loading SSE register
209 in 32bit, 64bit, 128bit, 256bit and 512bit */
210 {4, 8, 16, 32, 64}, /* cost of storing SSE register
211 in 32bit, 64bit, 128bit, 256bit and 512bit */
212 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
213 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
214 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
215 3, /* cost of moving SSE register to integer. */
216 4, 4, /* Gather load static, per_elt. */
217 4, 4, /* Gather store static, per_elt. */
218 0, /* size of l1 cache */
219 0, /* size of l2 cache */
220 0, /* size of prefetch block */
221 0, /* number of parallel prefetches */
222 1, /* Branch cost */
223 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
224 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
225 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
226 COSTS_N_INSNS (22), /* cost of FABS instruction. */
227 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
228 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
/* The scalar SSE costs below repeat the x87 values given above
   (add 23, mul 27, div 88, sqrt 122). */
230 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
231 COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */
232 COSTS_N_INSNS (27), /* cost of MULSS instruction. */
233 COSTS_N_INSNS (27), /* cost of MULSD instruction. */
234 COSTS_N_INSNS (27), /* cost of FMA SS instruction. */
235 COSTS_N_INSNS (27), /* cost of FMA SD instruction. */
236 COSTS_N_INSNS (88), /* cost of DIVSS instruction. */
237 COSTS_N_INSNS (88), /* cost of DIVSD instruction. */
238 COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */
239 COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */
240 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
241 i386_memcpy,
242 i386_memset,
243 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
244 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* Alignments are given as strings; the label alignment is left NULL. */
245 "4", /* Loop alignment. */
246 "4", /* Jump alignment. */
247 NULL, /* Label alignment. */
248 "4", /* Func alignment. */
249 4, /* Small unroll limit. */
250 2, /* Small unroll factor. */
/* memcpy algorithm table referenced by i486_cost. The first element
   names rep_prefix_4_byte only; the second is the DUMMY_STRINGOP_ALGS
   placeholder. */
253 static stringop_algs i486_memcpy[2] = {
254 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
255 DUMMY_STRINGOP_ALGS};
/* memset algorithm table referenced by i486_cost. The first element
   names rep_prefix_4_byte only; the second is the DUMMY_STRINGOP_ALGS
   placeholder. */
256 static stringop_algs i486_memset[2] = {
257 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
258 DUMMY_STRINGOP_ALGS};
/* Cost table for the 486. The initializer is positional: the trailing
   comment on each value names the field it fills, so the order of the
   entries must not be changed. Instruction costs are expressed with
   COSTS_N_INSNS. */
260 static const
261 struct processor_costs i486_cost = { /* 486 specific costs */
263 /* Start of register allocator costs. integer->integer move cost is 2. */
264 4, /* cost for loading QImode using movzbl */
265 {2, 4, 2}, /* cost of loading integer registers
266 in QImode, HImode and SImode.
267 Relative to reg-reg move (2). */
268 {2, 4, 2}, /* cost of storing integer registers */
269 2, /* cost of reg,reg fld/fst */
270 {8, 8, 8}, /* cost of loading fp registers
271 in SFmode, DFmode and XFmode */
272 {8, 8, 8}, /* cost of storing fp registers
273 in SFmode, DFmode and XFmode */
274 2, /* cost of moving MMX register */
275 {4, 8}, /* cost of loading MMX registers
276 in SImode and DImode */
277 {4, 8}, /* cost of storing MMX registers
278 in SImode and DImode */
279 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
280 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
281 in 32,64,128,256 and 512-bit */
282 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
283 in 32,64,128,256 and 512-bit */
284 3, 3, /* SSE->integer and integer->SSE moves */
285 3, 3, /* mask->integer and integer->mask moves */
286 {2, 4, 2}, /* cost of loading mask register
287 in QImode, HImode, SImode. */
288 {2, 4, 2}, /* cost of storing mask register
289 in QImode, HImode, SImode. */
290 2, /* cost of moving mask register. */
291 /* End of register allocator costs. */
294 COSTS_N_INSNS (1), /* cost of an add instruction */
295 COSTS_N_INSNS (1), /* cost of a lea instruction */
296 COSTS_N_INSNS (3), /* variable shift costs */
297 COSTS_N_INSNS (2), /* constant shift costs */
298 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
299 COSTS_N_INSNS (12), /* HI */
300 COSTS_N_INSNS (12), /* SI */
301 COSTS_N_INSNS (12), /* DI */
302 COSTS_N_INSNS (12)}, /* other */
303 1, /* cost of multiply per each bit set */
304 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
305 COSTS_N_INSNS (40), /* HI */
306 COSTS_N_INSNS (40), /* SI */
307 COSTS_N_INSNS (40), /* DI */
308 COSTS_N_INSNS (40)}, /* other */
309 COSTS_N_INSNS (3), /* cost of movsx */
310 COSTS_N_INSNS (2), /* cost of movzx */
311 15, /* "large" insn */
312 3, /* MOVE_RATIO */
313 3, /* CLEAR_RATIO */
/* The integer and SSE load/store costs are listed a second time below,
   outside the register allocator section. */
314 {2, 4, 2}, /* cost of loading integer registers
315 in QImode, HImode and SImode.
316 Relative to reg-reg move (2). */
317 {2, 4, 2}, /* cost of storing integer registers */
318 {4, 8, 16, 32, 64}, /* cost of loading SSE register
319 in 32bit, 64bit, 128bit, 256bit and 512bit */
320 {4, 8, 16, 32, 64}, /* cost of storing SSE register
321 in 32bit, 64bit, 128bit, 256bit and 512bit */
322 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
323 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
324 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
325 3, /* cost of moving SSE register to integer. */
326 4, 4, /* Gather load static, per_elt. */
327 4, 4, /* Gather store static, per_elt. */
328 4, /* size of l1 cache. 486 has 8kB cache
329 shared for code and data, so 4kB is
330 not really precise. */
331 4, /* size of l2 cache */
332 0, /* size of prefetch block */
333 0, /* number of parallel prefetches */
334 1, /* Branch cost */
335 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
336 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
337 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
338 COSTS_N_INSNS (3), /* cost of FABS instruction. */
339 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
340 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
342 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
343 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
344 COSTS_N_INSNS (16), /* cost of MULSS instruction. */
345 COSTS_N_INSNS (16), /* cost of MULSD instruction. */
346 COSTS_N_INSNS (16), /* cost of FMA SS instruction. */
347 COSTS_N_INSNS (16), /* cost of FMA SD instruction. */
348 COSTS_N_INSNS (73), /* cost of DIVSS instruction. */
349 COSTS_N_INSNS (74), /* cost of DIVSD instruction. */
350 COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */
351 COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */
352 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
353 i486_memcpy,
354 i486_memset,
355 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
356 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* Alignments are given as strings. */
357 "16", /* Loop alignment. */
358 "16", /* Jump alignment. */
359 "0:0:8", /* Label alignment. */
360 "16", /* Func alignment. */
361 4, /* Small unroll limit. */
362 2, /* Small unroll factor. */
/* memcpy algorithm table referenced by pentium_cost and lakemont_cost.
   The first element lists rep_prefix_4_byte with bound 256, then libcall
   with bound -1; the second is the DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): the bounds presumably mean "up to 256 bytes" and "any
   larger size" -- confirm against the stringop_algs declaration. */
365 static stringop_algs pentium_memcpy[2] = {
366 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
367 DUMMY_STRINGOP_ALGS};
/* memset algorithm table referenced by pentium_cost and lakemont_cost.
   The first element lists rep_prefix_4_byte with bound -1 as its only
   size entry; the second is the DUMMY_STRINGOP_ALGS placeholder. */
368 static stringop_algs pentium_memset[2] = {
369 {libcall, {{-1, rep_prefix_4_byte, false}}},
370 DUMMY_STRINGOP_ALGS};
/* Cost table named pentium_cost. The initializer is positional: the
   trailing comment on each value names the field it fills, so the order
   of the entries must not be changed. Instruction costs are expressed
   with COSTS_N_INSNS. */
372 static const
373 struct processor_costs pentium_cost = {
375 /* Start of register allocator costs. integer->integer move cost is 2. */
376 6, /* cost for loading QImode using movzbl */
377 {2, 4, 2}, /* cost of loading integer registers
378 in QImode, HImode and SImode.
379 Relative to reg-reg move (2). */
380 {2, 4, 2}, /* cost of storing integer registers */
381 2, /* cost of reg,reg fld/fst */
382 {2, 2, 6}, /* cost of loading fp registers
383 in SFmode, DFmode and XFmode */
384 {4, 4, 6}, /* cost of storing fp registers
385 in SFmode, DFmode and XFmode */
386 8, /* cost of moving MMX register */
387 {8, 8}, /* cost of loading MMX registers
388 in SImode and DImode */
389 {8, 8}, /* cost of storing MMX registers
390 in SImode and DImode */
391 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
392 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
393 in 32,64,128,256 and 512-bit */
394 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
395 in 32,64,128,256 and 512-bit */
396 3, 3, /* SSE->integer and integer->SSE moves */
397 3, 3, /* mask->integer and integer->mask moves */
398 {2, 4, 2}, /* cost of loading mask register
399 in QImode, HImode, SImode. */
400 {2, 4, 2}, /* cost of storing mask register
401 in QImode, HImode, SImode. */
402 2, /* cost of moving mask register. */
403 /* End of register allocator costs. */
406 COSTS_N_INSNS (1), /* cost of an add instruction */
407 COSTS_N_INSNS (1), /* cost of a lea instruction */
408 COSTS_N_INSNS (4), /* variable shift costs */
409 COSTS_N_INSNS (1), /* constant shift costs */
410 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
411 COSTS_N_INSNS (11), /* HI */
412 COSTS_N_INSNS (11), /* SI */
413 COSTS_N_INSNS (11), /* DI */
414 COSTS_N_INSNS (11)}, /* other */
415 0, /* cost of multiply per each bit set */
416 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
417 COSTS_N_INSNS (25), /* HI */
418 COSTS_N_INSNS (25), /* SI */
419 COSTS_N_INSNS (25), /* DI */
420 COSTS_N_INSNS (25)}, /* other */
421 COSTS_N_INSNS (3), /* cost of movsx */
422 COSTS_N_INSNS (2), /* cost of movzx */
423 8, /* "large" insn */
424 6, /* MOVE_RATIO */
425 6, /* CLEAR_RATIO */
/* The integer and SSE load/store costs are listed a second time below,
   outside the register allocator section. */
426 {2, 4, 2}, /* cost of loading integer registers
427 in QImode, HImode and SImode.
428 Relative to reg-reg move (2). */
429 {2, 4, 2}, /* cost of storing integer registers */
430 {4, 8, 16, 32, 64}, /* cost of loading SSE register
431 in 32bit, 64bit, 128bit, 256bit and 512bit */
432 {4, 8, 16, 32, 64}, /* cost of storing SSE register
433 in 32bit, 64bit, 128bit, 256bit and 512bit */
434 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
435 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
436 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
437 3, /* cost of moving SSE register to integer. */
438 4, 4, /* Gather load static, per_elt. */
439 4, 4, /* Gather store static, per_elt. */
440 8, /* size of l1 cache. */
441 8, /* size of l2 cache */
442 0, /* size of prefetch block */
443 0, /* number of parallel prefetches */
444 2, /* Branch cost */
445 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
446 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
447 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
448 COSTS_N_INSNS (1), /* cost of FABS instruction. */
449 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
450 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
452 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
453 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
454 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
455 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
456 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
457 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
458 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
459 COSTS_N_INSNS (39), /* cost of DIVSD instruction. */
460 COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */
461 COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */
462 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
463 pentium_memcpy,
464 pentium_memset,
465 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
466 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* Alignments are given as strings. */
467 "16:8:8", /* Loop alignment. */
468 "16:8:8", /* Jump alignment. */
469 "0:0:8", /* Label alignment. */
470 "16", /* Func alignment. */
471 4, /* Small unroll limit. */
472 2, /* Small unroll factor. */
/* Cost table named lakemont_cost. The initializer is positional: the
   trailing comment on each value names the field it fills, so the order
   of the entries must not be changed. Instruction costs are expressed
   with COSTS_N_INSNS. */
475 static const
476 struct processor_costs lakemont_cost = {
478 /* Start of register allocator costs. integer->integer move cost is 2. */
479 6, /* cost for loading QImode using movzbl */
480 {2, 4, 2}, /* cost of loading integer registers
481 in QImode, HImode and SImode.
482 Relative to reg-reg move (2). */
483 {2, 4, 2}, /* cost of storing integer registers */
484 2, /* cost of reg,reg fld/fst */
485 {2, 2, 6}, /* cost of loading fp registers
486 in SFmode, DFmode and XFmode */
487 {4, 4, 6}, /* cost of storing fp registers
488 in SFmode, DFmode and XFmode */
489 8, /* cost of moving MMX register */
490 {8, 8}, /* cost of loading MMX registers
491 in SImode and DImode */
492 {8, 8}, /* cost of storing MMX registers
493 in SImode and DImode */
494 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
495 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
496 in 32,64,128,256 and 512-bit */
497 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
498 in 32,64,128,256 and 512-bit */
499 3, 3, /* SSE->integer and integer->SSE moves */
500 3, 3, /* mask->integer and integer->mask moves */
501 {2, 4, 2}, /* cost of loading mask register
502 in QImode, HImode, SImode. */
503 {2, 4, 2}, /* cost of storing mask register
504 in QImode, HImode, SImode. */
505 2, /* cost of moving mask register. */
506 /* End of register allocator costs. */
509 COSTS_N_INSNS (1), /* cost of an add instruction */
/* lea is costed one unit above an add (COSTS_N_INSNS (1) + 1). */
510 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
511 COSTS_N_INSNS (1), /* variable shift costs */
512 COSTS_N_INSNS (1), /* constant shift costs */
513 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
514 COSTS_N_INSNS (11), /* HI */
515 COSTS_N_INSNS (11), /* SI */
516 COSTS_N_INSNS (11), /* DI */
517 COSTS_N_INSNS (11)}, /* other */
518 0, /* cost of multiply per each bit set */
519 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
520 COSTS_N_INSNS (25), /* HI */
521 COSTS_N_INSNS (25), /* SI */
522 COSTS_N_INSNS (25), /* DI */
523 COSTS_N_INSNS (25)}, /* other */
524 COSTS_N_INSNS (3), /* cost of movsx */
525 COSTS_N_INSNS (2), /* cost of movzx */
526 8, /* "large" insn */
527 17, /* MOVE_RATIO */
528 6, /* CLEAR_RATIO */
/* The integer and SSE load/store costs are listed a second time below,
   outside the register allocator section. */
529 {2, 4, 2}, /* cost of loading integer registers
530 in QImode, HImode and SImode.
531 Relative to reg-reg move (2). */
532 {2, 4, 2}, /* cost of storing integer registers */
533 {4, 8, 16, 32, 64}, /* cost of loading SSE register
534 in 32bit, 64bit, 128bit, 256bit and 512bit */
535 {4, 8, 16, 32, 64}, /* cost of storing SSE register
536 in 32bit, 64bit, 128bit, 256bit and 512bit */
537 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
538 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
539 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
540 3, /* cost of moving SSE register to integer. */
541 4, 4, /* Gather load static, per_elt. */
542 4, 4, /* Gather store static, per_elt. */
543 8, /* size of l1 cache. */
544 8, /* size of l2 cache */
545 0, /* size of prefetch block */
546 0, /* number of parallel prefetches */
547 2, /* Branch cost */
548 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
549 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
550 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
551 COSTS_N_INSNS (1), /* cost of FABS instruction. */
552 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
553 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
555 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
556 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
557 COSTS_N_INSNS (5), /* cost of MULSS instruction. */
558 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
559 COSTS_N_INSNS (10), /* cost of FMA SS instruction. */
560 COSTS_N_INSNS (10), /* cost of FMA SD instruction. */
561 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
562 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
563 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
564 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
565 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
/* This table reuses the pentium_memcpy/pentium_memset algorithm tables
   rather than defining its own. */
566 pentium_memcpy,
567 pentium_memset,
568 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
569 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* Alignments are given as strings. */
570 "16:8:8", /* Loop alignment. */
571 "16:8:8", /* Jump alignment. */
572 "0:0:8", /* Label alignment. */
573 "16", /* Func alignment. */
574 4, /* Small unroll limit. */
575 2, /* Small unroll factor. */
578 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
579 (we ensure the alignment). For small blocks an inline loop is still a
580 noticeable win; for bigger blocks either rep movsl or rep movsb is the
581 way to go. Rep movsb apparently has a more expensive startup time in the CPU,
582 but after 4K the difference is down in the noise. */
/* memcpy algorithm table referenced by pentiumpro_cost. The first
   element lists, in order: loop (bound 128), unrolled_loop (bound 1024),
   rep_prefix_4_byte (bound 8192) and rep_prefix_1_byte (bound -1); the
   second element is the DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): bounds are presumably maximum block sizes in bytes, with
   -1 meaning "everything larger" -- confirm against stringop_algs. */
583 static stringop_algs pentiumpro_memcpy[2] = {
584 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
585 {8192, rep_prefix_4_byte, false},
586 {-1, rep_prefix_1_byte, false}}},
587 DUMMY_STRINGOP_ALGS};
/* memset algorithm table referenced by pentiumpro_cost. The first
   element lists, in order: unrolled_loop (bound 1024), rep_prefix_4_byte
   (bound 8192) and libcall (bound -1); the second element is the
   DUMMY_STRINGOP_ALGS placeholder. */
588 static stringop_algs pentiumpro_memset[2] = {
589 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
590 {8192, rep_prefix_4_byte, false},
591 {-1, libcall, false}}},
592 DUMMY_STRINGOP_ALGS};
/* Cost table named pentiumpro_cost. The initializer is positional: the
   trailing comment on each value names the field it fills, so the order
   of the entries must not be changed. Instruction costs are expressed
   with COSTS_N_INSNS. */
593 static const
594 struct processor_costs pentiumpro_cost = {
596 /* Start of register allocator costs. integer->integer move cost is 2. */
597 2, /* cost for loading QImode using movzbl */
598 {4, 4, 4}, /* cost of loading integer registers
599 in QImode, HImode and SImode.
600 Relative to reg-reg move (2). */
601 {2, 2, 2}, /* cost of storing integer registers */
602 2, /* cost of reg,reg fld/fst */
603 {2, 2, 6}, /* cost of loading fp registers
604 in SFmode, DFmode and XFmode */
605 {4, 4, 6}, /* cost of storing fp registers
606 in SFmode, DFmode and XFmode */
607 2, /* cost of moving MMX register */
608 {2, 2}, /* cost of loading MMX registers
609 in SImode and DImode */
610 {2, 2}, /* cost of storing MMX registers
611 in SImode and DImode */
612 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
613 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
614 in 32,64,128,256 and 512-bit */
615 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
616 in 32,64,128,256 and 512-bit */
617 3, 3, /* SSE->integer and integer->SSE moves */
618 3, 3, /* mask->integer and integer->mask moves */
619 {4, 4, 4}, /* cost of loading mask register
620 in QImode, HImode, SImode. */
621 {2, 2, 2}, /* cost of storing mask register
622 in QImode, HImode, SImode. */
623 2, /* cost of moving mask register. */
624 /* End of register allocator costs. */
627 COSTS_N_INSNS (1), /* cost of an add instruction */
628 COSTS_N_INSNS (1), /* cost of a lea instruction */
629 COSTS_N_INSNS (1), /* variable shift costs */
630 COSTS_N_INSNS (1), /* constant shift costs */
631 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
632 COSTS_N_INSNS (4), /* HI */
633 COSTS_N_INSNS (4), /* SI */
634 COSTS_N_INSNS (4), /* DI */
635 COSTS_N_INSNS (4)}, /* other */
636 0, /* cost of multiply per each bit set */
637 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
638 COSTS_N_INSNS (17), /* HI */
639 COSTS_N_INSNS (17), /* SI */
640 COSTS_N_INSNS (17), /* DI */
641 COSTS_N_INSNS (17)}, /* other */
642 COSTS_N_INSNS (1), /* cost of movsx */
643 COSTS_N_INSNS (1), /* cost of movzx */
644 8, /* "large" insn */
645 6, /* MOVE_RATIO */
646 6, /* CLEAR_RATIO */
/* The integer and SSE load/store costs are listed a second time below,
   outside the register allocator section. */
647 {4, 4, 4}, /* cost of loading integer registers
648 in QImode, HImode and SImode.
649 Relative to reg-reg move (2). */
650 {2, 2, 2}, /* cost of storing integer registers */
651 {4, 8, 16, 32, 64}, /* cost of loading SSE register
652 in 32bit, 64bit, 128bit, 256bit and 512bit */
653 {4, 8, 16, 32, 64}, /* cost of storing SSE register
654 in 32bit, 64bit, 128bit, 256bit and 512bit */
655 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
656 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
657 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
658 3, /* cost of moving SSE register to integer. */
659 4, 4, /* Gather load static, per_elt. */
660 4, 4, /* Gather store static, per_elt. */
661 8, /* size of l1 cache. */
662 256, /* size of l2 cache */
663 32, /* size of prefetch block */
664 6, /* number of parallel prefetches */
665 2, /* Branch cost */
666 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
667 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
668 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
669 COSTS_N_INSNS (2), /* cost of FABS instruction. */
670 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
671 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
673 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
674 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
675 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
676 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
677 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
678 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
679 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
680 COSTS_N_INSNS (18), /* cost of DIVSD instruction. */
681 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
682 COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */
683 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
684 pentiumpro_memcpy,
685 pentiumpro_memset,
686 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
687 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* Alignments are given as strings. */
688 "16", /* Loop alignment. */
689 "16:11:8", /* Jump alignment. */
690 "0:0:8", /* Label alignment. */
691 "16", /* Func alignment. */
692 4, /* Small unroll limit. */
693 2, /* Small unroll factor. */
/* memcpy algorithm table referenced by geode_cost. The first element
   lists rep_prefix_4_byte with bound 256, then libcall with bound -1;
   the second is the DUMMY_STRINGOP_ALGS placeholder. */
696 static stringop_algs geode_memcpy[2] = {
697 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
698 DUMMY_STRINGOP_ALGS};
/* memset algorithm table referenced by geode_cost. Its contents are
   identical to geode_memcpy: rep_prefix_4_byte with bound 256, then
   libcall with bound -1, followed by the DUMMY_STRINGOP_ALGS
   placeholder. */
699 static stringop_algs geode_memset[2] = {
700 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
701 DUMMY_STRINGOP_ALGS};
702 static const
703 struct processor_costs geode_cost = {
705 /* Start of register allocator costs. integer->integer move cost is 2. */
706 2, /* cost for loading QImode using movzbl */
707 {2, 2, 2}, /* cost of loading integer registers
708 in QImode, HImode and SImode.
709 Relative to reg-reg move (2). */
710 {2, 2, 2}, /* cost of storing integer registers */
711 2, /* cost of reg,reg fld/fst */
712 {2, 2, 2}, /* cost of loading fp registers
713 in SFmode, DFmode and XFmode */
714 {4, 6, 6}, /* cost of storing fp registers
715 in SFmode, DFmode and XFmode */
716 2, /* cost of moving MMX register */
717 {2, 2}, /* cost of loading MMX registers
718 in SImode and DImode */
719 {2, 2}, /* cost of storing MMX registers
720 in SImode and DImode */
721 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
722 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
723 in 32,64,128,256 and 512-bit */
724 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
725 in 32,64,128,256 and 512-bit */
726 6, 6, /* SSE->integer and integer->SSE moves */
727 6, 6, /* mask->integer and integer->mask moves */
728 {2, 2, 2}, /* cost of loading mask register
729 in QImode, HImode, SImode. */
730 {2, 2, 2}, /* cost if storing mask register
731 in QImode, HImode, SImode. */
732 2, /* cost of moving mask register. */
733 /* End of register allocator costs. */
736 COSTS_N_INSNS (1), /* cost of an add instruction */
737 COSTS_N_INSNS (1), /* cost of a lea instruction */
738 COSTS_N_INSNS (2), /* variable shift costs */
739 COSTS_N_INSNS (1), /* constant shift costs */
740 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
741 COSTS_N_INSNS (4), /* HI */
742 COSTS_N_INSNS (7), /* SI */
743 COSTS_N_INSNS (7), /* DI */
744 COSTS_N_INSNS (7)}, /* other */
745 0, /* cost of multiply per each bit set */
746 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
747 COSTS_N_INSNS (23), /* HI */
748 COSTS_N_INSNS (39), /* SI */
749 COSTS_N_INSNS (39), /* DI */
750 COSTS_N_INSNS (39)}, /* other */
751 COSTS_N_INSNS (1), /* cost of movsx */
752 COSTS_N_INSNS (1), /* cost of movzx */
753 8, /* "large" insn */
754 4, /* MOVE_RATIO */
755 4, /* CLEAR_RATIO */
756 {2, 2, 2}, /* cost of loading integer registers
757 in QImode, HImode and SImode.
758 Relative to reg-reg move (2). */
759 {2, 2, 2}, /* cost of storing integer registers */
760 {2, 2, 8, 16, 32}, /* cost of loading SSE register
761 in 32bit, 64bit, 128bit, 256bit and 512bit */
762 {2, 2, 8, 16, 32}, /* cost of storing SSE register
763 in 32bit, 64bit, 128bit, 256bit and 512bit */
764 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
765 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
766 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
767 6, /* cost of moving SSE register to integer. */
768 2, 2, /* Gather load static, per_elt. */
769 2, 2, /* Gather store static, per_elt. */
770 64, /* size of l1 cache. */
771 128, /* size of l2 cache. */
772 32, /* size of prefetch block */
773 1, /* number of parallel prefetches */
774 1, /* Branch cost */
775 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
776 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
777 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
778 COSTS_N_INSNS (1), /* cost of FABS instruction. */
779 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
780 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
782 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
783 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
784 COSTS_N_INSNS (11), /* cost of MULSS instruction. */
785 COSTS_N_INSNS (11), /* cost of MULSD instruction. */
786 COSTS_N_INSNS (17), /* cost of FMA SS instruction. */
787 COSTS_N_INSNS (17), /* cost of FMA SD instruction. */
788 COSTS_N_INSNS (47), /* cost of DIVSS instruction. */
789 COSTS_N_INSNS (47), /* cost of DIVSD instruction. */
790 COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */
791 COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */
792 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
793 geode_memcpy,
794 geode_memset,
795 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
796 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
797 NULL, /* Loop alignment. */
798 NULL, /* Jump alignment. */
799 NULL, /* Label alignment. */
800 NULL, /* Func alignment. */
801 4, /* Small unroll limit. */
802 2, /* Small unroll factor. */
/* String-operation algorithm table named for memcpy on K6; it is referenced
   from the k6_cost initializer.  Element 0 lists rep_prefix_4_byte with the
   value 256 followed by a -1/libcall entry; element 1 is the
   DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): each inner triple is presumably {max block size, algorithm,
   noalign flag} with -1 meaning "no upper bound", and the two elements are
   presumably the 32-bit and 64-bit variants -- confirm against the
   stringop_algs declaration.  */
805 static stringop_algs k6_memcpy[2] = {
806 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
807 DUMMY_STRINGOP_ALGS};
/* String-operation algorithm table named for memset on K6; it is referenced
   from the k6_cost initializer.  Its contents are identical to k6_memcpy:
   rep_prefix_4_byte with the value 256, then a -1/libcall entry, and
   DUMMY_STRINGOP_ALGS as the second element.
   NOTE(review): the triple layout is presumably {max block size, algorithm,
   noalign flag} -- confirm against the stringop_algs declaration.  */
808 static stringop_algs k6_memset[2] = {
809 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
810 DUMMY_STRINGOP_ALGS};
/* Tuning cost table named for the K6 (k6_cost).
   This is a positional initializer of struct processor_costs: every value is
   bound to a field purely by its position, and the struct is declared outside
   this chunk, so entries must not be reordered, added or removed here.
   Entries between the "Start/End of register allocator costs" markers are
   move/load/store costs relative to an integer reg-reg move of 2; most later
   entries are wrapped in COSTS_N_INSNS.
   NOTE(review): no closing "};" is visible for this initializer, and the
   embedded line numbers skip after the opening line, after the "End of
   register allocator costs" marker and after the last entry -- brace-only
   lines were presumably lost in extraction; confirm against upstream.  */
811 static const
812 struct processor_costs k6_cost = {
814 /* Start of register allocator costs. integer->integer move cost is 2. */
815 3, /* cost for loading QImode using movzbl */
816 {4, 5, 4}, /* cost of loading integer registers
817 in QImode, HImode and SImode.
818 Relative to reg-reg move (2). */
819 {2, 3, 2}, /* cost of storing integer registers */
820 4, /* cost of reg,reg fld/fst */
821 {6, 6, 6}, /* cost of loading fp registers
822 in SFmode, DFmode and XFmode */
823 {4, 4, 4}, /* cost of storing fp registers
824 in SFmode, DFmode and XFmode */
825 2, /* cost of moving MMX register */
826 {2, 2}, /* cost of loading MMX registers
827 in SImode and DImode */
828 {2, 2}, /* cost of storing MMX registers
829 in SImode and DImode */
830 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
831 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
832 in 32,64,128,256 and 512-bit */
833 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
834 in 32,64,128,256 and 512-bit */
835 6, 6, /* SSE->integer and integer->SSE moves */
836 6, 6, /* mask->integer and integer->mask moves */
837 {4, 5, 4}, /* cost of loading mask register
838 in QImode, HImode, SImode. */
839 {2, 3, 2}, /* cost of storing mask register
840 in QImode, HImode, SImode. */
841 2, /* cost of moving mask register. */
842 /* End of register allocator costs. */
845 COSTS_N_INSNS (1), /* cost of an add instruction */
846 COSTS_N_INSNS (2), /* cost of a lea instruction */
847 COSTS_N_INSNS (1), /* variable shift costs */
848 COSTS_N_INSNS (1), /* constant shift costs */
849 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
850 COSTS_N_INSNS (3), /* HI */
851 COSTS_N_INSNS (3), /* SI */
852 COSTS_N_INSNS (3), /* DI */
853 COSTS_N_INSNS (3)}, /* other */
854 0, /* cost of multiply per each bit set */
855 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
856 COSTS_N_INSNS (18), /* HI */
857 COSTS_N_INSNS (18), /* SI */
858 COSTS_N_INSNS (18), /* DI */
859 COSTS_N_INSNS (18)}, /* other */
860 COSTS_N_INSNS (2), /* cost of movsx */
861 COSTS_N_INSNS (2), /* cost of movzx */
862 8, /* "large" insn */
863 4, /* MOVE_RATIO */
864 4, /* CLEAR_RATIO */
865 {4, 5, 4}, /* cost of loading integer registers
866 in QImode, HImode and SImode.
867 Relative to reg-reg move (2). */
868 {2, 3, 2}, /* cost of storing integer registers */
869 {2, 2, 8, 16, 32}, /* cost of loading SSE register
870 in 32bit, 64bit, 128bit, 256bit and 512bit */
871 {2, 2, 8, 16, 32}, /* cost of storing SSE register
872 in 32bit, 64bit, 128bit, 256bit and 512bit */
873 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
874 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
875 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
876 6, /* cost of moving SSE register to integer. */
877 2, 2, /* Gather load static, per_elt. */
878 2, 2, /* Gather store static, per_elt. */
879 32, /* size of l1 cache. */
880 32, /* size of l2 cache. Some models
881 have integrated l2 cache, but
882 optimizing for k6 is not important
883 enough to worry about that. */
884 32, /* size of prefetch block */
885 1, /* number of parallel prefetches */
886 1, /* Branch cost */
887 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
888 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
889 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
890 COSTS_N_INSNS (2), /* cost of FABS instruction. */
891 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
892 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
894 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
895 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
896 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
897 COSTS_N_INSNS (2), /* cost of MULSD instruction. */
898 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
899 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
900 COSTS_N_INSNS (56), /* cost of DIVSS instruction. */
901 COSTS_N_INSNS (56), /* cost of DIVSD instruction. */
902 COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */
903 COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */
904 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
905 k6_memcpy, /* string-operation table for memcpy. */
906 k6_memset, /* string-operation table for memset. */
907 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
908 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
909 "32:8:8", /* Loop alignment. */
910 "32:8:8", /* Jump alignment. */
911 "0:0:8", /* Label alignment. */
912 "32", /* Func alignment. */
913 4, /* Small unroll limit. */
914 2, /* Small unroll factor. */
917 /* For some reason, Athlon deals better with REP prefix (relative to loops)
918 compared to K8. Alignment becomes important after 8 bytes for memcpy and
919 128 bytes for memset. */
/* String-operation algorithm table named for memcpy on Athlon; it is
   referenced from the athlon_cost initializer.  Element 0 lists
   rep_prefix_4_byte with the value 2048 followed by a -1/libcall entry;
   element 1 is the DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): each inner triple is presumably {max block size, algorithm,
   noalign flag} -- confirm against the stringop_algs declaration.  */
920 static stringop_algs athlon_memcpy[2] = {
921 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
922 DUMMY_STRINGOP_ALGS};
/* String-operation algorithm table named for memset on Athlon; it is
   referenced from the athlon_cost initializer.  Its contents are identical
   to athlon_memcpy: rep_prefix_4_byte with the value 2048, then a -1/libcall
   entry, and DUMMY_STRINGOP_ALGS as the second element.
   NOTE(review): the triple layout is presumably {max block size, algorithm,
   noalign flag} -- confirm against the stringop_algs declaration.  */
923 static stringop_algs athlon_memset[2] = {
924 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
925 DUMMY_STRINGOP_ALGS};
/* Tuning cost table named for the Athlon (athlon_cost).
   This is a positional initializer of struct processor_costs: every value is
   bound to a field purely by its position, and the struct is declared outside
   this chunk, so entries must not be reordered, added or removed here.
   Entries between the "Start/End of register allocator costs" markers are
   move/load/store costs relative to an integer reg-reg move of 2; most later
   entries are wrapped in COSTS_N_INSNS.
   NOTE(review): no closing "};" is visible for this initializer, and the
   embedded line numbers skip after the opening line, after the "End of
   register allocator costs" marker and after the last entry -- brace-only
   lines were presumably lost in extraction; confirm against upstream.  */
926 static const
927 struct processor_costs athlon_cost = {
929 /* Start of register allocator costs. integer->integer move cost is 2. */
930 4, /* cost for loading QImode using movzbl */
931 {3, 4, 3}, /* cost of loading integer registers
932 in QImode, HImode and SImode.
933 Relative to reg-reg move (2). */
934 {3, 4, 3}, /* cost of storing integer registers */
935 4, /* cost of reg,reg fld/fst */
936 {4, 4, 12}, /* cost of loading fp registers
937 in SFmode, DFmode and XFmode */
938 {6, 6, 8}, /* cost of storing fp registers
939 in SFmode, DFmode and XFmode */
940 2, /* cost of moving MMX register */
941 {4, 4}, /* cost of loading MMX registers
942 in SImode and DImode */
943 {4, 4}, /* cost of storing MMX registers
944 in SImode and DImode */
945 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
946 {4, 4, 12, 12, 24}, /* cost of loading SSE registers
947 in 32,64,128,256 and 512-bit */
948 {4, 4, 10, 10, 20}, /* cost of storing SSE registers
949 in 32,64,128,256 and 512-bit */
950 5, 5, /* SSE->integer and integer->SSE moves */
951 5, 5, /* mask->integer and integer->mask moves */
952 {3, 4, 3}, /* cost of loading mask register
953 in QImode, HImode, SImode. */
954 {3, 4, 3}, /* cost of storing mask register
955 in QImode, HImode, SImode. */
956 2, /* cost of moving mask register. */
957 /* End of register allocator costs. */
960 COSTS_N_INSNS (1), /* cost of an add instruction */
961 COSTS_N_INSNS (2), /* cost of a lea instruction */
962 COSTS_N_INSNS (1), /* variable shift costs */
963 COSTS_N_INSNS (1), /* constant shift costs */
964 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
965 COSTS_N_INSNS (5), /* HI */
966 COSTS_N_INSNS (5), /* SI */
967 COSTS_N_INSNS (5), /* DI */
968 COSTS_N_INSNS (5)}, /* other */
969 0, /* cost of multiply per each bit set */
970 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
971 COSTS_N_INSNS (26), /* HI */
972 COSTS_N_INSNS (42), /* SI */
973 COSTS_N_INSNS (74), /* DI */
974 COSTS_N_INSNS (74)}, /* other */
975 COSTS_N_INSNS (1), /* cost of movsx */
976 COSTS_N_INSNS (1), /* cost of movzx */
977 8, /* "large" insn */
978 9, /* MOVE_RATIO */
979 6, /* CLEAR_RATIO */
980 {3, 4, 3}, /* cost of loading integer registers
981 in QImode, HImode and SImode.
982 Relative to reg-reg move (2). */
983 {3, 4, 3}, /* cost of storing integer registers */
984 {4, 4, 12, 12, 24}, /* cost of loading SSE register
985 in 32bit, 64bit, 128bit, 256bit and 512bit */
986 {4, 4, 10, 10, 20}, /* cost of storing SSE register
987 in 32bit, 64bit, 128bit, 256bit and 512bit */
988 {4, 4, 12, 12, 24}, /* cost of unaligned loads. */
989 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
990 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
991 5, /* cost of moving SSE register to integer. */
992 4, 4, /* Gather load static, per_elt. */
993 4, 4, /* Gather store static, per_elt. */
994 64, /* size of l1 cache. */
995 256, /* size of l2 cache. */
996 64, /* size of prefetch block */
997 6, /* number of parallel prefetches */
998 5, /* Branch cost */
999 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1000 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1001 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1002 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1003 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1004 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1006 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1007 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1008 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1009 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1010 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1011 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
1012 /* 11-16 (presumably the DIVSS latency range; the upper bound is used). */
1013 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1014 COSTS_N_INSNS (24), /* cost of DIVSD instruction. */
1015 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1016 COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */
1017 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1018 athlon_memcpy, /* string-operation table for memcpy. */
1019 athlon_memset, /* string-operation table for memset. */
1020 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1021 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1022 "16:8:8", /* Loop alignment. */
1023 "16:8:8", /* Jump alignment. */
1024 "0:0:8", /* Label alignment. */
1025 "16", /* Func alignment. */
1026 4, /* Small unroll limit. */
1027 2, /* Small unroll factor. */
1030 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
1031 small blocks it is better to use a loop. For large blocks, libcall can
1032 do non-temporal accesses and beat inline considerably. */
/* String-operation algorithm table named for memcpy on K8; it is referenced
   from the k8_cost initializer.  Element 0 steps through loop (6),
   unrolled_loop (14) and then rep_prefix_4_byte for the -1 entry; element 1
   steps through loop (16), rep_prefix_8_byte (8192) and then libcall.
   NOTE(review): element 0 ends in rep_prefix_4_byte rather than libcall,
   unlike element 1 and unlike the "large blocks" remark above -- presumably
   intentional; confirm.  The triple layout is presumably {max block size,
   algorithm, noalign flag} and the elements presumably 32-bit/64-bit --
   confirm against the stringop_algs declaration.  */
1033 static stringop_algs k8_memcpy[2] = {
1034 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1035 {-1, rep_prefix_4_byte, false}}},
1036 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1037 {-1, libcall, false}}}};
/* String-operation algorithm table named for memset on K8; it is referenced
   from the k8_cost initializer.  Element 0 steps through loop (8),
   unrolled_loop (24), rep_prefix_4_byte (2048) and then libcall; element 1
   steps through unrolled_loop (48), rep_prefix_8_byte (8192) and then
   libcall.
   NOTE(review): the triple layout is presumably {max block size, algorithm,
   noalign flag} and the elements presumably 32-bit/64-bit -- confirm against
   the stringop_algs declaration.  */
1038 static stringop_algs k8_memset[2] = {
1039 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1040 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1041 {libcall, {{48, unrolled_loop, false},
1042 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Tuning cost table named for the K8 (k8_cost).
   This is a positional initializer of struct processor_costs: every value is
   bound to a field purely by its position, and the struct is declared outside
   this chunk, so entries must not be reordered, added or removed here.
   Entries between the "Start/End of register allocator costs" markers are
   move/load/store costs relative to an integer reg-reg move of 2; most later
   entries are wrapped in COSTS_N_INSNS.
   NOTE(review): no closing "};" is visible for this initializer, and the
   embedded line numbers skip after the opening line, after the "End of
   register allocator costs" marker and after the last entry -- brace-only
   lines were presumably lost in extraction; confirm against upstream.  */
1043 static const
1044 struct processor_costs k8_cost = {
1046 /* Start of register allocator costs. integer->integer move cost is 2. */
1047 4, /* cost for loading QImode using movzbl */
1048 {3, 4, 3}, /* cost of loading integer registers
1049 in QImode, HImode and SImode.
1050 Relative to reg-reg move (2). */
1051 {3, 4, 3}, /* cost of storing integer registers */
1052 4, /* cost of reg,reg fld/fst */
1053 {4, 4, 12}, /* cost of loading fp registers
1054 in SFmode, DFmode and XFmode */
1055 {6, 6, 8}, /* cost of storing fp registers
1056 in SFmode, DFmode and XFmode */
1057 2, /* cost of moving MMX register */
1058 {3, 3}, /* cost of loading MMX registers
1059 in SImode and DImode */
1060 {4, 4}, /* cost of storing MMX registers
1061 in SImode and DImode */
1062 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1063 {4, 3, 12, 12, 24}, /* cost of loading SSE registers
1064 in 32,64,128,256 and 512-bit */
1065 {4, 4, 10, 10, 20}, /* cost of storing SSE registers
1066 in 32,64,128,256 and 512-bit */
1067 5, 5, /* SSE->integer and integer->SSE moves */
1068 5, 5, /* mask->integer and integer->mask moves */
1069 {3, 4, 3}, /* cost of loading mask register
1070 in QImode, HImode, SImode. */
1071 {3, 4, 3}, /* cost of storing mask register
1072 in QImode, HImode, SImode. */
1073 2, /* cost of moving mask register. */
1074 /* End of register allocator costs. */
1077 COSTS_N_INSNS (1), /* cost of an add instruction */
1078 COSTS_N_INSNS (2), /* cost of a lea instruction */
1079 COSTS_N_INSNS (1), /* variable shift costs */
1080 COSTS_N_INSNS (1), /* constant shift costs */
1081 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1082 COSTS_N_INSNS (4), /* HI */
1083 COSTS_N_INSNS (3), /* SI */
1084 COSTS_N_INSNS (4), /* DI */
1085 COSTS_N_INSNS (5)}, /* other */
1086 0, /* cost of multiply per each bit set */
1087 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1088 COSTS_N_INSNS (26), /* HI */
1089 COSTS_N_INSNS (42), /* SI */
1090 COSTS_N_INSNS (74), /* DI */
1091 COSTS_N_INSNS (74)}, /* other */
1092 COSTS_N_INSNS (1), /* cost of movsx */
1093 COSTS_N_INSNS (1), /* cost of movzx */
1094 8, /* "large" insn */
1095 9, /* MOVE_RATIO */
1096 6, /* CLEAR_RATIO */
1097 {3, 4, 3}, /* cost of loading integer registers
1098 in QImode, HImode and SImode.
1099 Relative to reg-reg move (2). */
1100 {3, 4, 3}, /* cost of storing integer registers */
1101 {4, 3, 12, 12, 24}, /* cost of loading SSE register
1102 in 32bit, 64bit, 128bit, 256bit and 512bit */
1103 {4, 4, 10, 10, 20}, /* cost of storing SSE register
1104 in 32bit, 64bit, 128bit, 256bit and 512bit */
1105 {4, 3, 12, 12, 24}, /* cost of unaligned loads. */
1106 {4, 4, 10, 10, 20}, /* cost of unaligned stores. */
1107 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1108 5, /* cost of moving SSE register to integer. */
1109 4, 4, /* Gather load static, per_elt. */
1110 4, 4, /* Gather store static, per_elt. */
1111 64, /* size of l1 cache. */
1112 512, /* size of l2 cache. */
1113 64, /* size of prefetch block */
1114 /* New AMD processors never drop prefetches; if they cannot be performed
1115 immediately, they are queued. We set number of simultaneous prefetches
1116 to a large constant to reflect this (it probably is not a good idea not
1117 to limit number of prefetches at all, as their execution also takes some
1118 time). */
1119 100, /* number of parallel prefetches */
1120 3, /* Branch cost */
1121 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1122 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1123 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1124 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1125 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1126 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1128 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1129 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1130 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1131 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1132 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1133 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
1134 /* 11-16 (presumably the DIVSS latency range; the upper bound is used). */
1135 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1136 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
1137 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1138 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
1139 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1140 k8_memcpy, /* string-operation table for memcpy. */
1141 k8_memset, /* string-operation table for memset. */
1142 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1143 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1144 "16:8:8", /* Loop alignment. */
1145 "16:8:8", /* Jump alignment. */
1146 "0:0:8", /* Label alignment. */
1147 "16", /* Func alignment. */
1148 4, /* Small unroll limit. */
1149 2, /* Small unroll factor. */
1152 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
1153 very small blocks it is better to use a loop. For large blocks, libcall can
1154 do non-temporal accesses and beat inline considerably. */
/* String-operation algorithm table named for memcpy on AMDFAM10; it is
   referenced from the amdfam10_cost initializer.  Element 0 steps through
   loop (6), unrolled_loop (14) and then rep_prefix_4_byte for the -1 entry;
   element 1 steps through loop (16), rep_prefix_8_byte (8192) and then
   libcall.
   NOTE(review): element 0 ends in rep_prefix_4_byte rather than libcall,
   unlike element 1 -- presumably intentional; confirm.  The triple layout is
   presumably {max block size, algorithm, noalign flag} and the elements
   presumably 32-bit/64-bit -- confirm against the stringop_algs
   declaration.  */
1155 static stringop_algs amdfam10_memcpy[2] = {
1156 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1157 {-1, rep_prefix_4_byte, false}}},
1158 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1159 {-1, libcall, false}}}};
/* String-operation algorithm table named for memset on AMDFAM10; it is
   referenced from the amdfam10_cost initializer.  Element 0 steps through
   loop (8), unrolled_loop (24), rep_prefix_4_byte (2048) and then libcall;
   element 1 steps through unrolled_loop (48), rep_prefix_8_byte (8192) and
   then libcall.
   NOTE(review): the triple layout is presumably {max block size, algorithm,
   noalign flag} and the elements presumably 32-bit/64-bit -- confirm against
   the stringop_algs declaration.  */
1160 static stringop_algs amdfam10_memset[2] = {
1161 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1162 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1163 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1164 {-1, libcall, false}}}};
/* Tuning cost table named for AMDFAM10 (amdfam10_cost).
   This is a positional initializer of struct processor_costs: every value is
   bound to a field purely by its position, and the struct is declared outside
   this chunk, so entries must not be reordered, added or removed here.
   Entries between the "Start/End of register allocator costs" markers are
   move/load/store costs relative to an integer reg-reg move of 2; most later
   entries are wrapped in COSTS_N_INSNS.  Unlike the k6/athlon/k8 tables in
   this file, this object is declared without "static const".
   NOTE(review): no closing "};" is visible for this initializer, and the
   embedded line numbers skip after the opening line, around the "End of
   register allocator costs" marker and after the last entry -- brace-only
   lines were presumably lost in extraction; confirm against upstream.  */
1165 struct processor_costs amdfam10_cost = {
1167 /* Start of register allocator costs. integer->integer move cost is 2. */
1168 4, /* cost for loading QImode using movzbl */
1169 {3, 4, 3}, /* cost of loading integer registers
1170 in QImode, HImode and SImode.
1171 Relative to reg-reg move (2). */
1172 {3, 4, 3}, /* cost of storing integer registers */
1173 4, /* cost of reg,reg fld/fst */
1174 {4, 4, 12}, /* cost of loading fp registers
1175 in SFmode, DFmode and XFmode */
1176 {6, 6, 8}, /* cost of storing fp registers
1177 in SFmode, DFmode and XFmode */
1178 2, /* cost of moving MMX register */
1179 {3, 3}, /* cost of loading MMX registers
1180 in SImode and DImode */
1181 {4, 4}, /* cost of storing MMX registers
1182 in SImode and DImode */
1183 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1184 {4, 4, 3, 6, 12}, /* cost of loading SSE registers
1185 in 32,64,128,256 and 512-bit */
1186 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
1187 in 32,64,128,256 and 512-bit */
1188 3, 3, /* SSE->integer and integer->SSE moves */
1189 3, 3, /* mask->integer and integer->mask moves */
1190 {3, 4, 3}, /* cost of loading mask register
1191 in QImode, HImode, SImode. */
1192 {3, 4, 3}, /* cost of storing mask register
1193 in QImode, HImode, SImode. */
1194 2, /* cost of moving mask register. */
1196 /* On K8:
1197 MOVD reg64, xmmreg Double FSTORE 4
1198 MOVD reg32, xmmreg Double FSTORE 4
1199 On AMDFAM10:
1200 MOVD reg64, xmmreg Double FADD 3
1201 1/1 1/1
1202 MOVD reg32, xmmreg Double FADD 3
1203 1/1 1/1 */
1204 /* End of register allocator costs. */
1207 COSTS_N_INSNS (1), /* cost of an add instruction */
1208 COSTS_N_INSNS (2), /* cost of a lea instruction */
1209 COSTS_N_INSNS (1), /* variable shift costs */
1210 COSTS_N_INSNS (1), /* constant shift costs */
1211 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1212 COSTS_N_INSNS (4), /* HI */
1213 COSTS_N_INSNS (3), /* SI */
1214 COSTS_N_INSNS (4), /* DI */
1215 COSTS_N_INSNS (5)}, /* other */
1216 0, /* cost of multiply per each bit set */
1217 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1218 COSTS_N_INSNS (35), /* HI */
1219 COSTS_N_INSNS (51), /* SI */
1220 COSTS_N_INSNS (83), /* DI */
1221 COSTS_N_INSNS (83)}, /* other */
1222 COSTS_N_INSNS (1), /* cost of movsx */
1223 COSTS_N_INSNS (1), /* cost of movzx */
1224 8, /* "large" insn */
1225 9, /* MOVE_RATIO */
1226 6, /* CLEAR_RATIO */
1227 {3, 4, 3}, /* cost of loading integer registers
1228 in QImode, HImode and SImode.
1229 Relative to reg-reg move (2). */
1230 {3, 4, 3}, /* cost of storing integer registers */
1231 {4, 4, 3, 6, 12}, /* cost of loading SSE register
1232 in 32bit, 64bit, 128bit, 256bit and 512bit */
1233 {4, 4, 5, 10, 20}, /* cost of storing SSE register
1234 in 32bit, 64bit, 128bit, 256bit and 512bit */
1235 {4, 4, 3, 7, 12}, /* cost of unaligned loads. NOTE(review): the 256-bit entry is 7 here but 6 for aligned loads above -- confirm intended. */
1236 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
1237 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1238 3, /* cost of moving SSE register to integer. */
1239 4, 4, /* Gather load static, per_elt. */
1240 4, 4, /* Gather store static, per_elt. */
1241 64, /* size of l1 cache. */
1242 512, /* size of l2 cache. */
1243 64, /* size of prefetch block */
1244 /* New AMD processors never drop prefetches; if they cannot be performed
1245 immediately, they are queued. We set number of simultaneous prefetches
1246 to a large constant to reflect this (it probably is not a good idea not
1247 to limit number of prefetches at all, as their execution also takes some
1248 time). */
1249 100, /* number of parallel prefetches */
1250 2, /* Branch cost */
1251 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1252 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1253 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1254 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1255 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1256 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1258 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1259 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1260 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1261 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1262 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1263 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
1264 /* 11-16 (presumably the DIVSS latency range; the upper bound is used). */
1265 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1266 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
1267 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1268 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
1269 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1270 amdfam10_memcpy, /* string-operation table for memcpy. */
1271 amdfam10_memset, /* string-operation table for memset. */
1272 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1273 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1274 "32:25:8", /* Loop alignment. */
1275 "32:8:8", /* Jump alignment. */
1276 "0:0:8", /* Label alignment. */
1277 "32", /* Func alignment. */
1278 4, /* Small unroll limit. */
1279 2, /* Small unroll factor. */
1282 /* BDVER has an optimized REP instruction for medium-sized blocks, but for
1283 very small blocks it is better to use a loop. For large blocks, libcall
1284 can do non-temporal accesses and beat inline considerably. */
/* String-operation algorithm table named for memcpy on BDVER; it is
   referenced from the bdver_cost initializer.  Element 0 steps through
   loop (6), unrolled_loop (14) and then rep_prefix_4_byte for the -1 entry;
   element 1 steps through loop (16), rep_prefix_8_byte (8192) and then
   libcall.
   NOTE(review): element 0 ends in rep_prefix_4_byte rather than libcall,
   unlike element 1 -- presumably intentional; confirm.  The triple layout is
   presumably {max block size, algorithm, noalign flag} and the elements
   presumably 32-bit/64-bit -- confirm against the stringop_algs
   declaration.  */
1285 static stringop_algs bdver_memcpy[2] = {
1286 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1287 {-1, rep_prefix_4_byte, false}}},
1288 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1289 {-1, libcall, false}}}};
/* String-operation algorithm table named for memset on BDVER; it is
   referenced from the bdver_cost initializer.  Element 0 steps through
   loop (8), unrolled_loop (24), rep_prefix_4_byte (2048) and then libcall;
   element 1 steps through unrolled_loop (48), rep_prefix_8_byte (8192) and
   then libcall.
   NOTE(review): the triple layout is presumably {max block size, algorithm,
   noalign flag} and the elements presumably 32-bit/64-bit -- confirm against
   the stringop_algs declaration.  */
1290 static stringop_algs bdver_memset[2] = {
1291 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1292 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1293 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1294 {-1, libcall, false}}}};
/* Tuning cost table named for BDVER (bdver_cost).
   This is a positional initializer of struct processor_costs: every value is
   bound to a field purely by its position, and the struct is declared outside
   this chunk, so entries must not be reordered, added or removed here.
   Entries between the "Start/End of register allocator costs" markers are
   move/load/store costs relative to an integer reg-reg move of 2; most later
   entries are wrapped in COSTS_N_INSNS.  The object is declared "const" but
   not "static".
   NOTE(review): no closing "};" is visible for this initializer, and the
   embedded line numbers skip after the opening line, after the "End of
   register allocator costs" marker and after the last entry -- brace-only
   lines were presumably lost in extraction; confirm against upstream.  */
1296 const struct processor_costs bdver_cost = {
1298 /* Start of register allocator costs. integer->integer move cost is 2. */
1299 8, /* cost for loading QImode using movzbl */
1300 {8, 8, 8}, /* cost of loading integer registers
1301 in QImode, HImode and SImode.
1302 Relative to reg-reg move (2). */
1303 {8, 8, 8}, /* cost of storing integer registers */
1304 4, /* cost of reg,reg fld/fst */
1305 {12, 12, 28}, /* cost of loading fp registers
1306 in SFmode, DFmode and XFmode */
1307 {10, 10, 18}, /* cost of storing fp registers
1308 in SFmode, DFmode and XFmode */
1309 4, /* cost of moving MMX register */
1310 {12, 12}, /* cost of loading MMX registers
1311 in SImode and DImode */
1312 {10, 10}, /* cost of storing MMX registers
1313 in SImode and DImode */
1314 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1315 {12, 12, 10, 40, 60}, /* cost of loading SSE registers
1316 in 32,64,128,256 and 512-bit */
1317 {10, 10, 10, 40, 60}, /* cost of storing SSE registers
1318 in 32,64,128,256 and 512-bit */
1319 16, 20, /* SSE->integer and integer->SSE moves */
1320 16, 20, /* mask->integer and integer->mask moves */
1321 {8, 8, 8}, /* cost of loading mask register
1322 in QImode, HImode, SImode. */
1323 {8, 8, 8}, /* cost of storing mask register
1324 in QImode, HImode, SImode. */
1325 2, /* cost of moving mask register. */
1326 /* End of register allocator costs. */
1329 COSTS_N_INSNS (1), /* cost of an add instruction */
1330 COSTS_N_INSNS (1), /* cost of a lea instruction */
1331 COSTS_N_INSNS (1), /* variable shift costs */
1332 COSTS_N_INSNS (1), /* constant shift costs */
1333 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1334 COSTS_N_INSNS (4), /* HI */
1335 COSTS_N_INSNS (4), /* SI */
1336 COSTS_N_INSNS (6), /* DI */
1337 COSTS_N_INSNS (6)}, /* other */
1338 0, /* cost of multiply per each bit set */
1339 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1340 COSTS_N_INSNS (35), /* HI */
1341 COSTS_N_INSNS (51), /* SI */
1342 COSTS_N_INSNS (83), /* DI */
1343 COSTS_N_INSNS (83)}, /* other */
1344 COSTS_N_INSNS (1), /* cost of movsx */
1345 COSTS_N_INSNS (1), /* cost of movzx */
1346 8, /* "large" insn */
1347 9, /* MOVE_RATIO */
1348 6, /* CLEAR_RATIO */
1349 {8, 8, 8}, /* cost of loading integer registers
1350 in QImode, HImode and SImode.
1351 Relative to reg-reg move (2). */
1352 {8, 8, 8}, /* cost of storing integer registers */
1353 {12, 12, 10, 40, 60}, /* cost of loading SSE register
1354 in 32bit, 64bit, 128bit, 256bit and 512bit */
1355 {10, 10, 10, 40, 60}, /* cost of storing SSE register
1356 in 32bit, 64bit, 128bit, 256bit and 512bit */
1357 {12, 12, 10, 40, 60}, /* cost of unaligned loads. */
1358 {10, 10, 10, 40, 60}, /* cost of unaligned stores. */
1359 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1360 16, /* cost of moving SSE register to integer. */
1361 12, 12, /* Gather load static, per_elt. */
1362 10, 10, /* Gather store static, per_elt. */
1363 16, /* size of l1 cache. */
1364 2048, /* size of l2 cache. */
1365 64, /* size of prefetch block */
1366 /* New AMD processors never drop prefetches; if they cannot be performed
1367 immediately, they are queued. We set number of simultaneous prefetches
1368 to a large constant to reflect this (it probably is not a good idea not
1369 to limit number of prefetches at all, as their execution also takes some
1370 time). */
1371 100, /* number of parallel prefetches */
1372 2, /* Branch cost */
1373 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1374 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1375 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1376 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1377 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1378 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1380 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1381 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1382 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1383 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1384 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1385 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1386 /* 9-24 (presumably the DIVSS latency range; the upper bound is used). */
1387 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1388 /* 9-27 (presumably the DIVSD latency range; the upper bound is used). */
1389 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1390 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1391 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1392 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1393 bdver_memcpy, /* string-operation table for memcpy. */
1394 bdver_memset, /* string-operation table for memset. */
1395 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1396 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1397 "16:11:8", /* Loop alignment. */
1398 "16:8:8", /* Jump alignment. */
1399 "0:0:8", /* Label alignment. */
1400 "11", /* Func alignment. NOTE(review): 11 is not a power of two, unlike the "16"/"32" used by the other tables here -- confirm intended. */
1401 4, /* Small unroll limit. */
1402 2, /* Small unroll factor. */
1406 /* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1407 very small blocks it is better to use a loop. For large blocks, libcall
1408 can do non-temporal accesses and beat inline considerably. */
/* String-operation algorithm table named for memcpy on ZNVER1.  The 32-bit
   element steps through loop (6), unrolled_loop (14) and then libcall; the
   64-bit element steps through loop (16), rep_prefix_8_byte (128) and then
   libcall.
   NOTE(review): the 32-bit element uses no REP-prefixed algorithm at all,
   despite the remark above -- presumably intentional; confirm.  The triple
   layout is presumably {max block size, algorithm, noalign flag} -- confirm
   against the stringop_algs declaration.  */
1409 static stringop_algs znver1_memcpy[2] = {
1410 /* 32-bit tuning. */
1411 {libcall, {{6, loop, false},
1412 {14, unrolled_loop, false},
1413 {-1, libcall, false}}},
1414 /* 64-bit tuning. */
1415 {libcall, {{16, loop, false},
1416 {128, rep_prefix_8_byte, false},
1417 {-1, libcall, false}}}};
/* String-operation algorithm table named for memset on ZNVER1.  The 32-bit
   element steps through loop (8), unrolled_loop (24), rep_prefix_4_byte
   (128) and then libcall; the 64-bit element steps through unrolled_loop
   (48), rep_prefix_8_byte (128) and then libcall.
   NOTE(review): the triple layout is presumably {max block size, algorithm,
   noalign flag} -- confirm against the stringop_algs declaration.  */
1418 static stringop_algs znver1_memset[2] = {
1419 /* 32-bit tuning. */
1420 {libcall, {{8, loop, false},
1421 {24, unrolled_loop, false},
1422 {128, rep_prefix_4_byte, false},
1423 {-1, libcall, false}}},
1424 /* 64-bit tuning. */
1425 {libcall, {{48, unrolled_loop, false},
1426 {128, rep_prefix_8_byte, false},
1427 {-1, libcall, false}}}};
1428 struct processor_costs znver1_cost = {
1430 /* Start of register allocator costs. integer->integer move cost is 2. */
1432 /* reg-reg moves are done by renaming and thus they are even cheaper than
1433 1 cycle. Because reg-reg move cost is 2 and the following tables correspond
1434 to doubles of latencies, we do not model this correctly. It does not
1435 seem to make practical difference to bump prices up even more. */
1436 6, /* cost for loading QImode using
1437 movzbl. */
1438 {6, 6, 6}, /* cost of loading integer registers
1439 in QImode, HImode and SImode.
1440 Relative to reg-reg move (2). */
1441 {8, 8, 8}, /* cost of storing integer
1442 registers. */
1443 2, /* cost of reg,reg fld/fst. */
1444 {6, 6, 16}, /* cost of loading fp registers
1445 in SFmode, DFmode and XFmode. */
1446 {8, 8, 16}, /* cost of storing fp registers
1447 in SFmode, DFmode and XFmode. */
1448 2, /* cost of moving MMX register. */
1449 {6, 6}, /* cost of loading MMX registers
1450 in SImode and DImode. */
1451 {8, 8}, /* cost of storing MMX registers
1452 in SImode and DImode. */
1453 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
1454 {6, 6, 6, 12, 24}, /* cost of loading SSE registers
1455 in 32,64,128,256 and 512-bit. */
1456 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
1457 in 32,64,128,256 and 512-bit. */
1458 6, 6, /* SSE->integer and integer->SSE moves. */
1459 8, 8, /* mask->integer and integer->mask moves */
1460 {6, 6, 6}, /* cost of loading mask register
1461 in QImode, HImode, SImode. */
1462 {8, 8, 8}, /* cost if storing mask register
1463 in QImode, HImode, SImode. */
1464 2, /* cost of moving mask register. */
1465 /* End of register allocator costs. */
1468 COSTS_N_INSNS (1), /* cost of an add instruction. */
1469 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1470 COSTS_N_INSNS (1), /* variable shift costs. */
1471 COSTS_N_INSNS (1), /* constant shift costs. */
1472 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1473 COSTS_N_INSNS (3), /* HI. */
1474 COSTS_N_INSNS (3), /* SI. */
1475 COSTS_N_INSNS (3), /* DI. */
1476 COSTS_N_INSNS (3)}, /* other. */
1477 0, /* cost of multiply per each bit
1478 set. */
1479 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1480 bound. */
1481 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1482 COSTS_N_INSNS (22), /* HI. */
1483 COSTS_N_INSNS (30), /* SI. */
1484 COSTS_N_INSNS (45), /* DI. */
1485 COSTS_N_INSNS (45)}, /* other. */
1486 COSTS_N_INSNS (1), /* cost of movsx. */
1487 COSTS_N_INSNS (1), /* cost of movzx. */
1488 8, /* "large" insn. */
1489 9, /* MOVE_RATIO. */
1490 6, /* CLEAR_RATIO */
1491 {6, 6, 6}, /* cost of loading integer registers
1492 in QImode, HImode and SImode.
1493 Relative to reg-reg move (2). */
1494 {8, 8, 8}, /* cost of storing integer
1495 registers. */
1496 {6, 6, 6, 12, 24}, /* cost of loading SSE register
1497 in 32bit, 64bit, 128bit, 256bit and 512bit */
1498 {8, 8, 8, 16, 32}, /* cost of storing SSE register
1499 in 32bit, 64bit, 128bit, 256bit and 512bit */
1500 {6, 6, 6, 12, 24}, /* cost of unaligned loads. */
1501 {8, 8, 8, 16, 32}, /* cost of unaligned stores. */
1502 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
1503 6, /* cost of moving SSE register to integer. */
1504 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPS is 35 uops,
1505 throughput 12. Approx 9 uops do not depend on vector size and every load
1506 is 7 uops. */
1507 18, 8, /* Gather load static, per_elt. */
1508 18, 10, /* Gather store static, per_elt. */
1509 32, /* size of l1 cache. */
1510 512, /* size of l2 cache. */
1511 64, /* size of prefetch block. */
1512 /* New AMD processors never drop prefetches; if they cannot be performed
1513 immediately, they are queued. We set number of simultaneous prefetches
1514 to a large constant to reflect this (it probably is not a good idea not
1515 to limit number of prefetches at all, as their execution also takes some
1516 time). */
1517 100, /* number of parallel prefetches. */
1518 3, /* Branch cost. */
1519 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1520 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1521 /* Latency of fdiv is 8-15. */
1522 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1523 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1524 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1525 /* Latency of fsqrt is 4-10. */
1526 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1528 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1529 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1530 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1531 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1532 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1533 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1534 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1535 /* 9-13 */
1536 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1537 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1538 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1539 /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
1540 and it can execute 2 integer additions and 2 multiplications thus
1541 reassociation may make sense up to width of 6. SPEC2k6 benchmarks suggest
1542 that 4 works better than 6 probably due to register pressure.
1544 Integer vector operations are taken by FP unit and execute 3 vector
1545 plus/minus operations per cycle but only one multiply. This is adjusted
1546 in ix86_reassociation_width. */
1547 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1548 znver1_memcpy,
1549 znver1_memset,
1550 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1551 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1552 "16", /* Loop alignment. */
1553 "16", /* Jump alignment. */
1554 "0:0:8", /* Label alignment. */
1555 "16", /* Func alignment. */
1556 4, /* Small unroll limit. */
1557 2, /* Small unroll factor. */
1560 /* ZNVER2 has optimized REP instruction for medium sized blocks, but for
1561 very small blocks it is better to use loop. For large blocks, libcall
1562 can do non-temporal accesses and beat inline considerably. */
/* memcpy expansion strategy for znver2: element 0 is the 32-bit tuning,
   element 1 the 64-bit tuning.
   NOTE(review): each {N, alg, flag} triple presumably selects ALG for blocks
   of up to N bytes, with -1 acting as the catch-all entry -- confirm against
   the stringop_algs definition.  */
1563 static stringop_algs znver2_memcpy[2] = {
1564 /* 32-bit tuning. */
1565 {libcall, {{6, loop, false},
1566 {14, unrolled_loop, false},
1567 {-1, libcall, false}}},
1568 /* 64-bit tuning. */
1569 {libcall, {{16, loop, false},
1570 {64, rep_prefix_4_byte, false},
1571 {-1, libcall, false}}}};
/* memset expansion strategy for znver2: element 0 is the 32-bit tuning,
   element 1 the 64-bit tuning.
   NOTE(review): each {N, alg, flag} triple presumably selects ALG for blocks
   of up to N bytes, with -1 acting as the catch-all entry -- confirm against
   the stringop_algs definition.  */
1572 static stringop_algs znver2_memset[2] = {
1573 /* 32-bit tuning. */
1574 {libcall, {{8, loop, false},
1575 {24, unrolled_loop, false},
1576 {128, rep_prefix_4_byte, false},
1577 {-1, libcall, false}}},
1578 /* 64-bit tuning. */
1579 {libcall, {{24, rep_prefix_4_byte, false},
1580 {128, rep_prefix_8_byte, false},
1581 {-1, libcall, false}}}};
/* Cost table used when tuning for znver2.  The initializer is positional:
   the comment trailing each entry names the cost it supplies.  The first
   group of entries feeds the register allocator; the instruction costs that
   follow are expressed with COSTS_N_INSNS.  */
1583 struct processor_costs znver2_cost = {
1585 /* Start of register allocator costs. integer->integer move cost is 2. */
1587 /* reg-reg moves are done by renaming and thus they are even cheaper than
1588 1 cycle. Because reg-reg move cost is 2 and following tables correspond
1589 to doubles of latencies, we do not model this correctly. It does not
1590 seem to make practical difference to bump prices up even more. */
1591 6, /* cost for loading QImode using
1592 movzbl. */
1593 {6, 6, 6}, /* cost of loading integer registers
1594 in QImode, HImode and SImode.
1595 Relative to reg-reg move (2). */
1596 {8, 8, 8}, /* cost of storing integer
1597 registers. */
1598 2, /* cost of reg,reg fld/fst. */
1599 {6, 6, 16}, /* cost of loading fp registers
1600 in SFmode, DFmode and XFmode. */
1601 {8, 8, 16}, /* cost of storing fp registers
1602 in SFmode, DFmode and XFmode. */
1603 2, /* cost of moving MMX register. */
1604 {6, 6}, /* cost of loading MMX registers
1605 in SImode and DImode. */
1606 {8, 8}, /* cost of storing MMX registers
1607 in SImode and DImode. */
1608 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1609 register. */
1610 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1611 in 32,64,128,256 and 512-bit. */
1612 {8, 8, 8, 8, 16}, /* cost of storing SSE registers
1613 in 32,64,128,256 and 512-bit. */
1614 6, 6, /* SSE->integer and integer->SSE
1615 moves. */
1616 8, 8, /* mask->integer and integer->mask moves */
1617 {6, 6, 6}, /* cost of loading mask register
1618 in QImode, HImode, SImode. */
1619 {8, 8, 8}, /* cost of storing mask register
1620 in QImode, HImode, SImode. */
1621 2, /* cost of moving mask register. */
1622 /* End of register allocator costs. */
1625 COSTS_N_INSNS (1), /* cost of an add instruction. */
1626 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1627 COSTS_N_INSNS (1), /* variable shift costs. */
1628 COSTS_N_INSNS (1), /* constant shift costs. */
1629 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1630 COSTS_N_INSNS (3), /* HI. */
1631 COSTS_N_INSNS (3), /* SI. */
1632 COSTS_N_INSNS (3), /* DI. */
1633 COSTS_N_INSNS (3)}, /* other. */
1634 0, /* cost of multiply per each bit
1635 set. */
1636 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1637 bound. */
1638 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1639 COSTS_N_INSNS (22), /* HI. */
1640 COSTS_N_INSNS (30), /* SI. */
1641 COSTS_N_INSNS (45), /* DI. */
1642 COSTS_N_INSNS (45)}, /* other. */
1643 COSTS_N_INSNS (1), /* cost of movsx. */
1644 COSTS_N_INSNS (1), /* cost of movzx. */
1645 8, /* "large" insn. */
1646 9, /* MOVE_RATIO. */
1647 6, /* CLEAR_RATIO */
1648 {6, 6, 6}, /* cost of loading integer registers
1649 in QImode, HImode and SImode.
1650 Relative to reg-reg move (2). */
1651 {8, 8, 8}, /* cost of storing integer
1652 registers. */
1653 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1654 in 32bit, 64bit, 128bit, 256bit and 512bit */
1655 {8, 8, 8, 8, 16}, /* cost of storing SSE register
1656 in 32bit, 64bit, 128bit, 256bit and 512bit */
1657 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
1658 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1659 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1660 register. */
1661 6, /* cost of moving SSE register to integer. */
1662 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPS is 35 uops,
1663 throughput 12. Approx 9 uops do not depend on vector size and every load
1664 is 7 uops. */
1665 18, 8, /* Gather load static, per_elt. */
1666 18, 10, /* Gather store static, per_elt. */
1667 32, /* size of l1 cache. */
1668 512, /* size of l2 cache. */
1669 64, /* size of prefetch block. */
1670 /* New AMD processors never drop prefetches; if they cannot be performed
1671 immediately, they are queued. We set number of simultaneous prefetches
1672 to a large constant to reflect this (it probably is not a good idea not
1673 to limit number of prefetches at all, as their execution also takes some
1674 time). */
1675 100, /* number of parallel prefetches. */
1676 3, /* Branch cost. */
1677 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1678 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1679 /* Latency of fdiv is 8-15. */
1680 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1681 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1682 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1683 /* Latency of fsqrt is 4-10. */
1684 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1686 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1687 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1688 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1689 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
1690 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1691 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1692 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1693 /* 9-13. */
1694 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1695 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1696 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1697 /* Zen can execute 4 integer operations per cycle. FP operations
1698 take 3 cycles and it can execute 2 integer additions and 2
1699 multiplications thus reassociation may make sense up to width of 6.
1700 SPEC2k6 benchmarks suggest
1701 that 4 works better than 6 probably due to register pressure.
1703 Integer vector operations are taken by FP unit and execute 3 vector
1704 plus/minus operations per cycle but only one multiply. This is adjusted
1705 in ix86_reassociation_width. */
1706 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1707 znver2_memcpy,
1708 znver2_memset,
1709 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1710 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1711 "16", /* Loop alignment. */
1712 "16", /* Jump alignment. */
1713 "0:0:8", /* Label alignment. */
1714 "16", /* Func alignment. */
1715 4, /* Small unroll limit. */
1716 2, /* Small unroll factor. */
/* Cost table used when tuning for znver3.  The initializer is positional:
   the comment trailing each entry names the cost it supplies.  The first
   group of entries feeds the register allocator; the instruction costs that
   follow are expressed with COSTS_N_INSNS.  The string-operation strategies
   are the znver2_memcpy/znver2_memset tables.  */
1719 struct processor_costs znver3_cost = {
1721 /* Start of register allocator costs. integer->integer move cost is 2. */
1723 /* reg-reg moves are done by renaming and thus they are even cheaper than
1724 1 cycle. Because reg-reg move cost is 2 and following tables correspond
1725 to doubles of latencies, we do not model this correctly. It does not
1726 seem to make practical difference to bump prices up even more. */
1727 6, /* cost for loading QImode using
1728 movzbl. */
1729 {6, 6, 6}, /* cost of loading integer registers
1730 in QImode, HImode and SImode.
1731 Relative to reg-reg move (2). */
1732 {8, 8, 8}, /* cost of storing integer
1733 registers. */
1734 2, /* cost of reg,reg fld/fst. */
1735 {6, 6, 16}, /* cost of loading fp registers
1736 in SFmode, DFmode and XFmode. */
1737 {8, 8, 16}, /* cost of storing fp registers
1738 in SFmode, DFmode and XFmode. */
1739 2, /* cost of moving MMX register. */
1740 {6, 6}, /* cost of loading MMX registers
1741 in SImode and DImode. */
1742 {8, 8}, /* cost of storing MMX registers
1743 in SImode and DImode. */
1744 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1745 register. */
1746 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1747 in 32,64,128,256 and 512-bit. */
1748 {8, 8, 8, 8, 16}, /* cost of storing SSE registers
1749 in 32,64,128,256 and 512-bit. */
1750 6, 6, /* SSE->integer and integer->SSE
1751 moves. */
1752 8, 8, /* mask->integer and integer->mask moves */
1753 {6, 6, 6}, /* cost of loading mask register
1754 in QImode, HImode, SImode. */
1755 {8, 8, 8}, /* cost of storing mask register
1756 in QImode, HImode, SImode. */
1757 2, /* cost of moving mask register. */
1758 /* End of register allocator costs. */
1761 COSTS_N_INSNS (1), /* cost of an add instruction. */
1762 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1763 COSTS_N_INSNS (1), /* variable shift costs. */
1764 COSTS_N_INSNS (1), /* constant shift costs. */
1765 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1766 COSTS_N_INSNS (3), /* HI. */
1767 COSTS_N_INSNS (3), /* SI. */
1768 COSTS_N_INSNS (3), /* DI. */
1769 COSTS_N_INSNS (3)}, /* other. */
1770 0, /* cost of multiply per each bit
1771 set. */
1772 {COSTS_N_INSNS (9), /* cost of a divide/mod for QI. */
1773 COSTS_N_INSNS (10), /* HI. */
1774 COSTS_N_INSNS (12), /* SI. */
1775 COSTS_N_INSNS (17), /* DI. */
1776 COSTS_N_INSNS (17)}, /* other. */
1777 COSTS_N_INSNS (1), /* cost of movsx. */
1778 COSTS_N_INSNS (1), /* cost of movzx. */
1779 8, /* "large" insn. */
1780 9, /* MOVE_RATIO. */
1781 6, /* CLEAR_RATIO */
1782 {6, 6, 6}, /* cost of loading integer registers
1783 in QImode, HImode and SImode.
1784 Relative to reg-reg move (2). */
1785 {8, 8, 8}, /* cost of storing integer
1786 registers. */
1787 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
1788 in 32bit, 64bit, 128bit, 256bit and 512bit */
1789 {8, 8, 8, 8, 16}, /* cost of storing SSE register
1790 in 32bit, 64bit, 128bit, 256bit and 512bit */
1791 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
1792 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1793 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1794 register. */
1795 6, /* cost of moving SSE register to integer. */
1796 /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
1797 throughput 9. Approx 7 uops do not depend on vector size and every load
1798 is 4 uops. */
1799 14, 8, /* Gather load static, per_elt. */
1800 14, 10, /* Gather store static, per_elt. */
1801 32, /* size of l1 cache. */
1802 512, /* size of l2 cache. */
1803 64, /* size of prefetch block. */
1804 /* New AMD processors never drop prefetches; if they cannot be performed
1805 immediately, they are queued. We set number of simultaneous prefetches
1806 to a large constant to reflect this (it probably is not a good idea not
1807 to limit number of prefetches at all, as their execution also takes some
1808 time). */
1809 100, /* number of parallel prefetches. */
1810 3, /* Branch cost. */
1811 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1812 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1813 /* Latency of fdiv is 8-15. */
1814 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1815 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1816 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1817 /* Latency of fsqrt is 4-10. */
1818 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1820 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1821 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1822 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1823 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
1824 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1825 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1826 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1827 /* 9-13. */
1828 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1829 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1830 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1831 /* Zen can execute 4 integer operations per cycle. FP operations
1832 take 3 cycles and it can execute 2 integer additions and 2
1833 multiplications thus reassociation may make sense up to width of 6.
1834 SPEC2k6 benchmarks suggest
1835 that 4 works better than 6 probably due to register pressure.
1837 Integer vector operations are taken by FP unit and execute 3 vector
1838 plus/minus operations per cycle but only one multiply. This is adjusted
1839 in ix86_reassociation_width. */
1840 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1841 znver2_memcpy,
1842 znver2_memset,
1843 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1844 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1845 "16", /* Loop alignment. */
1846 "16", /* Jump alignment. */
1847 "0:0:8", /* Label alignment. */
1848 "16", /* Func alignment. */
1849 4, /* Small unroll limit. */
1850 2, /* Small unroll factor. */
1853 /* Cost table used when tuning for znver4.  It has the same layout as the
   znver3_cost table but no longer replicates its values: the fp-register and
   SSE load/store costs, the integer->SSE move cost, the divide costs, the
   second XMM/YMM/ZMM move triple, the gather costs, the L2 size, and the
   FADD/FMUL, FSQRT, FMA, DIVSS and SQRTSS/SQRTSD costs all differ.  The
   string-operation strategies are the znver2_memcpy/znver2_memset tables.  */
1854 struct processor_costs znver4_cost = {
1856 /* Start of register allocator costs. integer->integer move cost is 2. */
1858 /* reg-reg moves are done by renaming and thus they are even cheaper than
1859 1 cycle. Because reg-reg move cost is 2 and following tables correspond
1860 to doubles of latencies, we do not model this correctly. It does not
1861 seem to make practical difference to bump prices up even more. */
1862 6, /* cost for loading QImode using
1863 movzbl. */
1864 {6, 6, 6}, /* cost of loading integer registers
1865 in QImode, HImode and SImode.
1866 Relative to reg-reg move (2). */
1867 {8, 8, 8}, /* cost of storing integer
1868 registers. */
1869 2, /* cost of reg,reg fld/fst. */
1870 {14, 14, 17}, /* cost of loading fp registers
1871 in SFmode, DFmode and XFmode. */
1872 {12, 12, 16}, /* cost of storing fp registers
1873 in SFmode, DFmode and XFmode. */
1874 2, /* cost of moving MMX register. */
1875 {6, 6}, /* cost of loading MMX registers
1876 in SImode and DImode. */
1877 {8, 8}, /* cost of storing MMX registers
1878 in SImode and DImode. */
1879 2, 2, 3, /* cost of moving XMM,YMM,ZMM
1880 register. */
1881 {6, 6, 10, 10, 12}, /* cost of loading SSE registers
1882 in 32,64,128,256 and 512-bit. */
1883 {8, 8, 8, 12, 12}, /* cost of storing SSE registers
1884 in 32,64,128,256 and 512-bit. */
1885 6, 8, /* SSE->integer and integer->SSE
1886 moves. */
1887 8, 8, /* mask->integer and integer->mask moves */
1888 {6, 6, 6}, /* cost of loading mask register
1889 in QImode, HImode, SImode. */
1890 {8, 8, 8}, /* cost of storing mask register
1891 in QImode, HImode, SImode. */
1892 2, /* cost of moving mask register. */
1893 /* End of register allocator costs. */
1896 COSTS_N_INSNS (1), /* cost of an add instruction. */
1897 /* TODO: Lea with 3 components has cost 2. */
1898 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1899 COSTS_N_INSNS (1), /* variable shift costs. */
1900 COSTS_N_INSNS (1), /* constant shift costs. */
1901 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1902 COSTS_N_INSNS (3), /* HI. */
1903 COSTS_N_INSNS (3), /* SI. */
1904 COSTS_N_INSNS (3), /* DI. */
1905 COSTS_N_INSNS (3)}, /* other. */
1906 0, /* cost of multiply per each bit
1907 set. */
1908 {COSTS_N_INSNS (12), /* cost of a divide/mod for QI. */
1909 COSTS_N_INSNS (13), /* HI. */
1910 COSTS_N_INSNS (13), /* SI. */
1911 COSTS_N_INSNS (18), /* DI. */
1912 COSTS_N_INSNS (18)}, /* other. */
1913 COSTS_N_INSNS (1), /* cost of movsx. */
1914 COSTS_N_INSNS (1), /* cost of movzx. */
1915 8, /* "large" insn. */
1916 9, /* MOVE_RATIO. */
1917 6, /* CLEAR_RATIO */
1918 {6, 6, 6}, /* cost of loading integer registers
1919 in QImode, HImode and SImode.
1920 Relative to reg-reg move (2). */
1921 {8, 8, 8}, /* cost of storing integer
1922 registers. */
1923 {6, 6, 10, 10, 12}, /* cost of loading SSE registers
1924 in 32bit, 64bit, 128bit, 256bit and 512bit */
1925 {8, 8, 8, 12, 12}, /* cost of storing SSE register
1926 in 32bit, 64bit, 128bit, 256bit and 512bit */
1927 {6, 6, 10, 10, 12}, /* cost of unaligned loads. */
1928 {8, 8, 8, 12, 12}, /* cost of unaligned stores. */
1929 2, 2, 2, /* cost of moving XMM,YMM,ZMM
1930 register.  NOTE(review): the register allocator section of this
   table lists 2, 2, 3 for the same moves -- confirm the difference is
   intended. */
1931 6, /* cost of moving SSE register to integer. */
1932 /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
1933 throughput 5. Approx 7 uops do not depend on vector size and every load
1934 is 5 uops. */
1935 14, 10, /* Gather load static, per_elt. */
1936 14, 20, /* Gather store static, per_elt. */
1937 32, /* size of l1 cache. */
1938 1024, /* size of l2 cache. */
1939 64, /* size of prefetch block. */
1940 /* New AMD processors never drop prefetches; if they cannot be performed
1941 immediately, they are queued. We set number of simultaneous prefetches
1942 to a large constant to reflect this (it probably is not a good idea not
1943 to limit number of prefetches at all, as their execution also takes some
1944 time). */
1945 100, /* number of parallel prefetches. */
1946 3, /* Branch cost. */
1947 COSTS_N_INSNS (7), /* cost of FADD and FSUB insns. */
1948 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1949 /* Latency of fdiv is 8-15. */
1950 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1951 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1952 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1953 /* NOTE(review): this comment used to give the fsqrt latency as 4-10, which
   does not match the cost of 25 below -- confirm the intended range. */
1954 COSTS_N_INSNS (25), /* cost of FSQRT instruction. */
1956 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1957 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1958 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1959 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
1960 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
1961 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
1962 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1963 /* 9-13. */
1964 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1965 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1966 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
1967 /* Zen can execute 4 integer operations per cycle. FP operations
1968 take 3 cycles and it can execute 2 integer additions and 2
1969 multiplications thus reassociation may make sense up to width of 6.
1970 SPEC2k6 benchmarks suggest
1971 that 4 works better than 6 probably due to register pressure.
1973 Integer vector operations are taken by FP unit and execute 3 vector
1974 plus/minus operations per cycle but only one multiply. This is adjusted
1975 in ix86_reassociation_width. */
1976 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1977 znver2_memcpy,
1978 znver2_memset,
1979 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1980 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1981 "16", /* Loop alignment. */
1982 "16", /* Jump alignment. */
1983 "0:0:8", /* Label alignment. */
1984 "16", /* Func alignment. */
1985 4, /* Small unroll limit. */
1986 2, /* Small unroll factor. */
1989 /* Cost table used when tuning for znver5.  It mostly replicates the
   znver4_cost table; the integer divide, DIVSS, SQRTSS and SQRTSD costs
   differ.  The string-operation strategies are the
   znver2_memcpy/znver2_memset tables.  */
1990 struct processor_costs znver5_cost = {
1992 /* Start of register allocator costs. integer->integer move cost is 2. */
1994 /* reg-reg moves are done by renaming and thus they are even cheaper than
1995 1 cycle. Because reg-reg move cost is 2 and following tables correspond
1996 to doubles of latencies, we do not model this correctly. It does not
1997 seem to make practical difference to bump prices up even more. */
1998 6, /* cost for loading QImode using
1999 movzbl. */
2000 {6, 6, 6}, /* cost of loading integer registers
2001 in QImode, HImode and SImode.
2002 Relative to reg-reg move (2). */
2003 {8, 8, 8}, /* cost of storing integer
2004 registers. */
2005 2, /* cost of reg,reg fld/fst. */
2006 {14, 14, 17}, /* cost of loading fp registers
2007 in SFmode, DFmode and XFmode. */
2008 {12, 12, 16}, /* cost of storing fp registers
2009 in SFmode, DFmode and XFmode. */
2010 2, /* cost of moving MMX register. */
2011 {6, 6}, /* cost of loading MMX registers
2012 in SImode and DImode. */
2013 {8, 8}, /* cost of storing MMX registers
2014 in SImode and DImode. */
2015 2, 2, 3, /* cost of moving XMM,YMM,ZMM
2016 register. */
2017 {6, 6, 10, 10, 12}, /* cost of loading SSE registers
2018 in 32,64,128,256 and 512-bit. */
2019 {8, 8, 8, 12, 12}, /* cost of storing SSE registers
2020 in 32,64,128,256 and 512-bit. */
2021 6, 8, /* SSE->integer and integer->SSE
2022 moves. */
2023 8, 8, /* mask->integer and integer->mask moves */
2024 {6, 6, 6}, /* cost of loading mask register
2025 in QImode, HImode, SImode. */
2026 {8, 8, 8}, /* cost of storing mask register
2027 in QImode, HImode, SImode. */
2028 2, /* cost of moving mask register. */
2029 /* End of register allocator costs. */
2032 COSTS_N_INSNS (1), /* cost of an add instruction. */
2033 /* TODO: Lea with 3 components has cost 2. */
2034 COSTS_N_INSNS (1), /* cost of a lea instruction. */
2035 COSTS_N_INSNS (1), /* variable shift costs. */
2036 COSTS_N_INSNS (1), /* constant shift costs. */
2037 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
2038 COSTS_N_INSNS (3), /* HI. */
2039 COSTS_N_INSNS (3), /* SI. */
2040 COSTS_N_INSNS (3), /* DI. */
2041 COSTS_N_INSNS (3)}, /* other. */
2042 0, /* cost of multiply per each bit
2043 set. */
2044 {COSTS_N_INSNS (10), /* cost of a divide/mod for QI. */
2045 COSTS_N_INSNS (11), /* HI. */
2046 COSTS_N_INSNS (13), /* SI. */
2047 COSTS_N_INSNS (16), /* DI. */
2048 COSTS_N_INSNS (16)}, /* other. */
2049 COSTS_N_INSNS (1), /* cost of movsx. */
2050 COSTS_N_INSNS (1), /* cost of movzx. */
2051 8, /* "large" insn. */
2052 9, /* MOVE_RATIO. */
2053 6, /* CLEAR_RATIO */
2054 {6, 6, 6}, /* cost of loading integer registers
2055 in QImode, HImode and SImode.
2056 Relative to reg-reg move (2). */
2057 {8, 8, 8}, /* cost of storing integer
2058 registers. */
2059 {6, 6, 10, 10, 12}, /* cost of loading SSE registers
2060 in 32bit, 64bit, 128bit, 256bit and 512bit */
2061 {8, 8, 8, 12, 12}, /* cost of storing SSE register
2062 in 32bit, 64bit, 128bit, 256bit and 512bit */
2063 {6, 6, 10, 10, 12}, /* cost of unaligned loads. */
2064 {8, 8, 8, 12, 12}, /* cost of unaligned stores. */
2065 2, 2, 2, /* cost of moving XMM,YMM,ZMM
2066 register.  NOTE(review): the register allocator section of this
   table lists 2, 2, 3 for the same moves -- confirm the difference is
   intended. */
2067 6, /* cost of moving SSE register to integer. */
2068 /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
2069 throughput 5. Approx 7 uops do not depend on vector size and every load
2070 is 5 uops. */
2071 14, 10, /* Gather load static, per_elt. */
2072 14, 20, /* Gather store static, per_elt. */
2073 32, /* size of l1 cache. */
2074 1024, /* size of l2 cache. */
2075 64, /* size of prefetch block. */
2076 /* New AMD processors never drop prefetches; if they cannot be performed
2077 immediately, they are queued. We set number of simultaneous prefetches
2078 to a large constant to reflect this (it probably is not a good idea not
2079 to limit number of prefetches at all, as their execution also takes some
2080 time). */
2081 100, /* number of parallel prefetches. */
2082 3, /* Branch cost. */
2083 COSTS_N_INSNS (7), /* cost of FADD and FSUB insns. */
2084 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
2085 /* Latency of fdiv is 8-15. */
2086 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
2087 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2088 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2089 /* NOTE(review): this comment used to give the fsqrt latency as 4-10, which
   does not match the cost of 25 below -- confirm the intended range. */
2090 COSTS_N_INSNS (25), /* cost of FSQRT instruction. */
2092 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2093 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2094 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
2095 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
2096 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
2097 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
2098 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
2099 /* 9-13. */
2100 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
2101 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
2102 COSTS_N_INSNS (20), /* cost of SQRTSD instruction. */
2103 /* Zen can execute 4 integer operations per cycle. FP operations
2104 take 3 cycles and it can execute 2 integer additions and 2
2105 multiplications thus reassociation may make sense up to width of 6.
2106 SPEC2k6 benchmarks suggest
2107 that 4 works better than 6 probably due to register pressure.
2109 Integer vector operations are taken by FP unit and execute 3 vector
2110 plus/minus operations per cycle but only one multiply. This is adjusted
2111 in ix86_reassociation_width. */
2112 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
2113 znver2_memcpy,
2114 znver2_memset,
2115 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
2116 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
2117 "16", /* Loop alignment. */
2118 "16", /* Jump alignment. */
2119 "0:0:8", /* Label alignment. */
2120 "16", /* Func alignment. */
2121 4, /* Small unroll limit. */
2122 2, /* Small unroll factor. */
2125 /* skylake_cost should produce code tuned for Skylake family of CPUs. */
/* memcpy expansion strategy for Skylake.  Both array elements carry the
   same entries.
   NOTE(review): elements 0 and 1 are presumably the 32-bit and 64-bit
   tunings, and each {N, alg, flag} triple presumably selects ALG for blocks
   of up to N bytes with -1 as the catch-all -- confirm against the
   stringop_algs definition.  */
2126 static stringop_algs skylake_memcpy[2] = {
2127 {libcall,
2128 {{256, rep_prefix_1_byte, true},
2129 {256, loop, false},
2130 {-1, libcall, false}}},
2131 {libcall,
2132 {{256, rep_prefix_1_byte, true},
2133 {256, loop, false},
2134 {-1, libcall, false}}}};
/* memset expansion strategy for Skylake.  Both array elements carry the
   same entries.
   NOTE(review): elements 0 and 1 are presumably the 32-bit and 64-bit
   tunings, and each {N, alg, flag} triple presumably selects ALG for blocks
   of up to N bytes with -1 as the catch-all -- confirm against the
   stringop_algs definition.  */
2136 static stringop_algs skylake_memset[2] = {
2137 {libcall,
2138 {{256, rep_prefix_1_byte, true},
2139 {256, loop, false},
2140 {-1, libcall, false}}},
2141 {libcall,
2142 {{256, rep_prefix_1_byte, true},
2143 {256, loop, false},
2144 {-1, libcall, false}}}};
/* Cost table used when tuning for Skylake.  The initializer is positional:
   the comment trailing each entry names the cost it supplies.  The first
   group of entries feeds the register allocator; the instruction costs that
   follow are expressed with COSTS_N_INSNS.  */
2146 static const
2147 struct processor_costs skylake_cost = {
2149 /* Start of register allocator costs. integer->integer move cost is 2. */
2150 6, /* cost for loading QImode using movzbl */
2151 {4, 4, 4}, /* cost of loading integer registers
2152 in QImode, HImode and SImode.
2153 Relative to reg-reg move (2). */
2154 {6, 6, 6}, /* cost of storing integer registers */
2155 2, /* cost of reg,reg fld/fst */
2156 {6, 6, 8}, /* cost of loading fp registers
2157 in SFmode, DFmode and XFmode */
2158 {6, 6, 10}, /* cost of storing fp registers
2159 in SFmode, DFmode and XFmode */
2160 2, /* cost of moving MMX register */
2161 {6, 6}, /* cost of loading MMX registers
2162 in SImode and DImode */
2163 {6, 6}, /* cost of storing MMX registers
2164 in SImode and DImode */
2165 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2166 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
2167 in 32,64,128,256 and 512-bit */
2168 {8, 8, 8, 12, 24}, /* cost of storing SSE registers
2169 in 32,64,128,256 and 512-bit */
2170 6, 6, /* SSE->integer and integer->SSE moves */
2171 6, 6, /* mask->integer and integer->mask moves */
2172 {8, 8, 8}, /* cost of loading mask register
2173 in QImode, HImode, SImode. */
2174 {6, 6, 6}, /* cost of storing mask register
2175 in QImode, HImode, SImode. */
2176 3, /* cost of moving mask register. */
2177 /* End of register allocator costs. */
2180 COSTS_N_INSNS (1), /* cost of an add instruction */
2181 COSTS_N_INSNS (1)+1, /* cost of a lea instruction (one unit above the add cost) */
2182 COSTS_N_INSNS (1), /* variable shift costs */
2183 COSTS_N_INSNS (1), /* constant shift costs */
2184 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2185 COSTS_N_INSNS (3), /* HI */
2186 COSTS_N_INSNS (3), /* SI */
2187 COSTS_N_INSNS (3), /* DI */
2188 COSTS_N_INSNS (3)}, /* other */
2189 0, /* cost of multiply per each bit set */
2190 /* Expanding div/mod currently doesn't consider parallelism. So the cost
2191 model is not realistic. We compensate by increasing the latencies a bit. */
2192 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
2193 COSTS_N_INSNS (11), /* HI */
2194 COSTS_N_INSNS (14), /* SI */
2195 COSTS_N_INSNS (76), /* DI */
2196 COSTS_N_INSNS (76)}, /* other */
2197 COSTS_N_INSNS (1), /* cost of movsx */
2198 COSTS_N_INSNS (0), /* cost of movzx */
2199 8, /* "large" insn */
2200 17, /* MOVE_RATIO */
2201 17, /* CLEAR_RATIO */
2202 {6, 6, 6}, /* cost of loading integer registers
2203 in QImode, HImode and SImode.
2204 Relative to reg-reg move (2). */
2205 {8, 8, 8}, /* cost of storing integer registers */
2206 {8, 8, 8, 8, 16}, /* cost of loading SSE register
2207 in 32bit, 64bit, 128bit, 256bit and 512bit */
2208 {8, 8, 8, 8, 16}, /* cost of storing SSE register
2209 in 32bit, 64bit, 128bit, 256bit and 512bit */
2210 {8, 8, 8, 8, 16}, /* cost of unaligned loads. */
2211 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
2212 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2213 6, /* cost of moving SSE register to integer. */
2214 20, 8, /* Gather load static, per_elt. */
2215 22, 10, /* Gather store static, per_elt. */
2216 64, /* size of l1 cache. */
2217 512, /* size of l2 cache. */
2218 64, /* size of prefetch block */
2219 6, /* number of parallel prefetches */
2220 3, /* Branch cost */
2221 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2222 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
2223 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2224 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2225 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2226 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
2228 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2229 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
2230 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2231 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
2232 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
2233 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
2234 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
2235 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
2236 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
2237 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
2238 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2239 skylake_memcpy,
2240 skylake_memset,
2241 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2242 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2243 "16:11:8", /* Loop alignment. */
2244 "16:11:8", /* Jump alignment. */
2245 "0:0:8", /* Label alignment. */
2246 "16", /* Func alignment. */
2247 4, /* Small unroll limit. */
2248 2, /* Small unroll factor. */
2251 /* icelake_cost should produce code tuned for Icelake family of CPUs.
2252 NB: rep_prefix_1_byte is used only for known size. */
/* String-operation strategy table that icelake_cost uses as its memcpy entry.
   Both elements of the array are identical.
   NOTE(review): each {N, alg, flag} entry appears to choose ALG for blocks of
   up to N bytes, with -1 as the catch-all; confirm against the stringop_algs
   declaration. */
2254 static stringop_algs icelake_memcpy[2] = {
2255 {libcall,
2256 {{256, rep_prefix_1_byte, true},
2257 {256, loop, false},
2258 {-1, libcall, false}}},
2259 {libcall,
2260 {{256, rep_prefix_1_byte, true},
2261 {256, loop, false},
2262 {-1, libcall, false}}}};
/* String-operation strategy table that icelake_cost uses as its memset entry.
   Both elements of the array are identical, and the contents match
   icelake_memcpy.
   NOTE(review): each {N, alg, flag} entry appears to choose ALG for blocks of
   up to N bytes, with -1 as the catch-all; confirm against the stringop_algs
   declaration. */
2264 static stringop_algs icelake_memset[2] = {
2265 {libcall,
2266 {{256, rep_prefix_1_byte, true},
2267 {256, loop, false},
2268 {-1, libcall, false}}},
2269 {libcall,
2270 {{256, rep_prefix_1_byte, true},
2271 {256, loop, false},
2272 {-1, libcall, false}}}};
/* Operation costs for the Icelake tuning target. The initializers are
   positional: each value is identified only by its trailing comment, so the
   order must stay in step with the field order of struct processor_costs. */
2274 static const
2275 struct processor_costs icelake_cost = {
2277 /* Start of register allocator costs. integer->integer move cost is 2. */
2278 6, /* cost for loading QImode using movzbl */
2279 {4, 4, 4}, /* cost of loading integer registers
2280 in QImode, HImode and SImode.
2281 Relative to reg-reg move (2). */
2282 {6, 6, 6}, /* cost of storing integer registers */
2283 2, /* cost of reg,reg fld/fst */
2284 {6, 6, 8}, /* cost of loading fp registers
2285 in SFmode, DFmode and XFmode */
2286 {6, 6, 10}, /* cost of storing fp registers
2287 in SFmode, DFmode and XFmode */
2288 2, /* cost of moving MMX register */
2289 {6, 6}, /* cost of loading MMX registers
2290 in SImode and DImode */
2291 {6, 6}, /* cost of storing MMX registers
2292 in SImode and DImode */
2293 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2294 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
2295 in 32,64,128,256 and 512-bit */
2296 {8, 8, 8, 12, 24}, /* cost of storing SSE registers
2297 in 32,64,128,256 and 512-bit */
2298 6, 6, /* SSE->integer and integer->SSE moves */
2299 6, 6, /* mask->integer and integer->mask moves */
2300 {8, 8, 8}, /* cost of loading mask register
2301 in QImode, HImode, SImode. */
2302 {6, 6, 6}, /* cost of storing mask register
2303 in QImode, HImode, SImode. */
2304 3, /* cost of moving mask register. */
2305 /* End of register allocator costs. */
2308 COSTS_N_INSNS (1), /* cost of an add instruction */
2309 COSTS_N_INSNS (1)+1, /* cost of a lea instruction */
2310 COSTS_N_INSNS (1), /* variable shift costs */
2311 COSTS_N_INSNS (1), /* constant shift costs */
2312 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2313 COSTS_N_INSNS (3), /* HI */
2314 COSTS_N_INSNS (3), /* SI */
2315 COSTS_N_INSNS (3), /* DI */
2316 COSTS_N_INSNS (3)}, /* other */
2317 0, /* cost of multiply per each bit set */
2318 /* Expanding div/mod currently doesn't consider parallelism. So the cost
2319 model is not realistic. We compensate by increasing the latencies a bit. */
2320 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
2321 COSTS_N_INSNS (11), /* HI */
2322 COSTS_N_INSNS (14), /* SI */
2323 COSTS_N_INSNS (76), /* DI */
2324 COSTS_N_INSNS (76)}, /* other */
2325 COSTS_N_INSNS (1), /* cost of movsx */
2326 COSTS_N_INSNS (0), /* cost of movzx */
2327 8, /* "large" insn */
2328 17, /* MOVE_RATIO */
2329 17, /* CLEAR_RATIO */
2330 {6, 6, 6}, /* cost of loading integer registers
2331 in QImode, HImode and SImode.
2332 Relative to reg-reg move (2). */
2333 {8, 8, 8}, /* cost of storing integer registers */
2334 {8, 8, 8, 8, 16}, /* cost of loading SSE register
2335 in 32bit, 64bit, 128bit, 256bit and 512bit */
2336 {8, 8, 8, 8, 16}, /* cost of storing SSE register
2337 in 32bit, 64bit, 128bit, 256bit and 512bit */
2338 {8, 8, 8, 8, 16}, /* cost of unaligned loads. */
2339 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
2340 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2341 6, /* cost of moving SSE register to integer. */
2342 20, 8, /* Gather load static, per_elt. */
2343 22, 10, /* Gather store static, per_elt. */
2344 64, /* size of l1 cache. */
2345 512, /* size of l2 cache. */
2346 64, /* size of prefetch block */
2347 6, /* number of parallel prefetches */
2348 3, /* Branch cost */
2349 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2350 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
2351 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2352 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2353 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2354 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
2356 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2357 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
2358 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2359 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
2360 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
2361 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
2362 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
2363 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
2364 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
2365 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
2366 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2367 icelake_memcpy,
2368 icelake_memset,
2369 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2370 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2371 "16:11:8", /* Loop alignment. */
2372 "16:11:8", /* Jump alignment. */
2373 "0:0:8", /* Label alignment. */
2374 "16", /* Func alignment. */
2375 4, /* Small unroll limit. */
2376 2, /* Small unroll factor. */
2379 /* alderlake_cost should produce code tuned for alderlake family of CPUs. */
/* String-operation strategy table that alderlake_cost uses as its memcpy
   entry. Both elements of the array are identical.
   NOTE(review): each {N, alg, flag} entry appears to choose ALG for blocks of
   up to N bytes, with -1 as the catch-all; confirm against the stringop_algs
   declaration. */
2380 static stringop_algs alderlake_memcpy[2] = {
2381 {libcall,
2382 {{256, rep_prefix_1_byte, true},
2383 {256, loop, false},
2384 {-1, libcall, false}}},
2385 {libcall,
2386 {{256, rep_prefix_1_byte, true},
2387 {256, loop, false},
2388 {-1, libcall, false}}}};
/* String-operation strategy table that alderlake_cost uses as its memset
   entry. Both elements of the array are identical, and the contents match
   alderlake_memcpy.
   NOTE(review): each {N, alg, flag} entry appears to choose ALG for blocks of
   up to N bytes, with -1 as the catch-all; confirm against the stringop_algs
   declaration. */
2389 static stringop_algs alderlake_memset[2] = {
2390 {libcall,
2391 {{256, rep_prefix_1_byte, true},
2392 {256, loop, false},
2393 {-1, libcall, false}}},
2394 {libcall,
2395 {{256, rep_prefix_1_byte, true},
2396 {256, loop, false},
2397 {-1, libcall, false}}}};
/* Operation costs for the alderlake tuning target. The initializers are
   positional: each value is identified only by its trailing comment, so the
   order must stay in step with the field order of struct processor_costs. */
2398 static const
2399 struct processor_costs alderlake_cost = {
2401 /* Start of register allocator costs. integer->integer move cost is 2. */
2402 6, /* cost for loading QImode using movzbl */
2403 {6, 6, 6}, /* cost of loading integer registers
2404 in QImode, HImode and SImode.
2405 Relative to reg-reg move (2). */
2406 {6, 6, 6}, /* cost of storing integer registers */
2407 4, /* cost of reg,reg fld/fst */
2408 {6, 6, 12}, /* cost of loading fp registers
2409 in SFmode, DFmode and XFmode */
2410 {6, 6, 12}, /* cost of storing fp registers
2411 in SFmode, DFmode and XFmode */
2412 2, /* cost of moving MMX register */
2413 {6, 6}, /* cost of loading MMX registers
2414 in SImode and DImode */
2415 {6, 6}, /* cost of storing MMX registers
2416 in SImode and DImode */
2417 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
2418 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
2419 in 32,64,128,256 and 512-bit */
2420 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
2421 in 32,64,128,256 and 512-bit */
2422 6, 6, /* SSE->integer and integer->SSE moves */
2423 6, 6, /* mask->integer and integer->mask moves */
2424 {6, 6, 6}, /* cost of loading mask register
2425 in QImode, HImode, SImode. */
2426 {6, 6, 6}, /* cost of storing mask register
2427 in QImode, HImode, SImode. */
2428 2, /* cost of moving mask register. */
2429 /* End of register allocator costs. */
2432 COSTS_N_INSNS (1), /* cost of an add instruction */
2433 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2434 COSTS_N_INSNS (1), /* variable shift costs */
2435 COSTS_N_INSNS (1), /* constant shift costs */
2436 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2437 COSTS_N_INSNS (3), /* HI */
2438 COSTS_N_INSNS (3), /* SI */
2439 COSTS_N_INSNS (3), /* DI */
2440 COSTS_N_INSNS (4)}, /* other */
2441 0, /* cost of multiply per each bit set */
2442 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
2443 COSTS_N_INSNS (22), /* HI */
2444 COSTS_N_INSNS (30), /* SI */
2445 COSTS_N_INSNS (74), /* DI */
2446 COSTS_N_INSNS (74)}, /* other */
2447 COSTS_N_INSNS (1), /* cost of movsx */
2448 COSTS_N_INSNS (1), /* cost of movzx */
2449 8, /* "large" insn */
2450 17, /* MOVE_RATIO */
2451 17, /* CLEAR_RATIO */
2452 {6, 6, 6}, /* cost of loading integer registers
2453 in QImode, HImode and SImode.
2454 Relative to reg-reg move (2). */
2455 {8, 8, 8}, /* cost of storing integer registers */
2456 {8, 8, 8, 10, 15}, /* cost of loading SSE register
2457 in 32bit, 64bit, 128bit, 256bit and 512bit */
2458 {8, 8, 8, 10, 15}, /* cost of storing SSE register
2459 in 32bit, 64bit, 128bit, 256bit and 512bit */
2460 {8, 8, 8, 10, 15}, /* cost of unaligned loads. */
2461 {8, 8, 8, 10, 15}, /* cost of unaligned stores. */
2462 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
2463 6, /* cost of moving SSE register to integer. */
2464 18, 6, /* Gather load static, per_elt. */
2465 18, 6, /* Gather store static, per_elt. */
2466 32, /* size of l1 cache. */
2467 512, /* size of l2 cache. */
2468 64, /* size of prefetch block */
2469 6, /* number of parallel prefetches */
2470 3, /* Branch cost */
2471 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2472 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
2473 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
2474 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2475 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2476 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
2478 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2479 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2480 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2481 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2482 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2483 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2484 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2485 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
2486 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
2487 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
2488 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
2489 alderlake_memcpy,
2490 alderlake_memset,
2491 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
2492 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
2493 "16:11:8", /* Loop alignment. */
2494 "16:11:8", /* Jump alignment. */
2495 "0:0:8", /* Label alignment. */
2496 "16", /* Func alignment. */
2497 4, /* Small unroll limit. */
2498 2, /* Small unroll factor. */
2501 /* BTVER1 has an optimized REP instruction for medium sized blocks, but for
2502 very small blocks it is better to use a loop. For large blocks, libcall can
2503 do non-temporal accesses and beat inline considerably. */
/* String-operation strategy table that btver1_cost uses as its memcpy entry.
   NOTE(review): each {N, alg, flag} entry appears to choose ALG for blocks of
   up to N bytes, with -1 as the catch-all, and the two array elements are
   presumably the 32-bit and 64-bit variants; confirm against the
   stringop_algs declaration. */
2504 static stringop_algs btver1_memcpy[2] = {
2505 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
2506 {-1, rep_prefix_4_byte, false}}},
2507 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
2508 {-1, libcall, false}}}};
/* String-operation strategy table that btver1_cost uses as its memset entry.
   NOTE(review): each {N, alg, flag} entry appears to choose ALG for blocks of
   up to N bytes, with -1 as the catch-all, and the two array elements are
   presumably the 32-bit and 64-bit variants; confirm against the
   stringop_algs declaration. */
2509 static stringop_algs btver1_memset[2] = {
2510 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
2511 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2512 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
2513 {-1, libcall, false}}}};
/* Operation costs for the btver1 tuning target. The initializers are
   positional: each value is identified only by its trailing comment, so the
   order must stay in step with the field order of struct processor_costs. */
2514 const struct processor_costs btver1_cost = {
2516 /* Start of register allocator costs. integer->integer move cost is 2. */
2517 8, /* cost for loading QImode using movzbl */
2518 {6, 8, 6}, /* cost of loading integer registers
2519 in QImode, HImode and SImode.
2520 Relative to reg-reg move (2). */
2521 {6, 8, 6}, /* cost of storing integer registers */
2522 4, /* cost of reg,reg fld/fst */
2523 {12, 12, 28}, /* cost of loading fp registers
2524 in SFmode, DFmode and XFmode */
2525 {12, 12, 38}, /* cost of storing fp registers
2526 in SFmode, DFmode and XFmode */
2527 4, /* cost of moving MMX register */
2528 {10, 10}, /* cost of loading MMX registers
2529 in SImode and DImode */
2530 {12, 12}, /* cost of storing MMX registers
2531 in SImode and DImode */
2532 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2533 {10, 10, 12, 48, 96}, /* cost of loading SSE registers
2534 in 32,64,128,256 and 512-bit */
2535 {10, 10, 12, 48, 96}, /* cost of storing SSE registers
2536 in 32,64,128,256 and 512-bit */
2537 14, 14, /* SSE->integer and integer->SSE moves */
2538 14, 14, /* mask->integer and integer->mask moves */
2539 {6, 8, 6}, /* cost of loading mask register
2540 in QImode, HImode, SImode. */
2541 {6, 8, 6}, /* cost of storing mask register
2542 in QImode, HImode, SImode. */
2543 2, /* cost of moving mask register. */
2544 /* End of register allocator costs. */
2547 COSTS_N_INSNS (1), /* cost of an add instruction */
2548 COSTS_N_INSNS (2), /* cost of a lea instruction */
2549 COSTS_N_INSNS (1), /* variable shift costs */
2550 COSTS_N_INSNS (1), /* constant shift costs */
2551 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2552 COSTS_N_INSNS (4), /* HI */
2553 COSTS_N_INSNS (3), /* SI */
2554 COSTS_N_INSNS (4), /* DI */
2555 COSTS_N_INSNS (5)}, /* other */
2556 0, /* cost of multiply per each bit set */
2557 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
2558 COSTS_N_INSNS (35), /* HI */
2559 COSTS_N_INSNS (51), /* SI */
2560 COSTS_N_INSNS (83), /* DI */
2561 COSTS_N_INSNS (83)}, /* other */
2562 COSTS_N_INSNS (1), /* cost of movsx */
2563 COSTS_N_INSNS (1), /* cost of movzx */
2564 8, /* "large" insn */
2565 9, /* MOVE_RATIO */
2566 6, /* CLEAR_RATIO */
2567 {6, 8, 6}, /* cost of loading integer registers
2568 in QImode, HImode and SImode.
2569 Relative to reg-reg move (2). */
2570 {6, 8, 6}, /* cost of storing integer registers */
2571 {10, 10, 12, 48, 96}, /* cost of loading SSE register
2572 in 32bit, 64bit, 128bit, 256bit and 512bit */
2573 {10, 10, 12, 48, 96}, /* cost of storing SSE register
2574 in 32bit, 64bit, 128bit, 256bit and 512bit */
2575 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */
2576 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
2577 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2578 14, /* cost of moving SSE register to integer. */
2579 10, 10, /* Gather load static, per_elt. */
2580 10, 10, /* Gather store static, per_elt. */
2581 32, /* size of l1 cache. */
2582 512, /* size of l2 cache. */
2583 64, /* size of prefetch block */
2584 100, /* number of parallel prefetches */
2585 2, /* Branch cost */
2586 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
2587 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
2588 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
2589 COSTS_N_INSNS (2), /* cost of FABS instruction. */
2590 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
2591 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
2593 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2594 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2595 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
2596 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
2597 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2598 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2599 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2600 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
2601 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
2602 COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */
2603 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2604 btver1_memcpy,
2605 btver1_memset,
2606 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
2607 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2608 "16:11:8", /* Loop alignment. */
2609 "16:8:8", /* Jump alignment. */
2610 "0:0:8", /* Label alignment. */
2611 "11", /* Func alignment. */
2612 4, /* Small unroll limit. */
2613 2, /* Small unroll factor. */
/* String-operation strategy table that btver2_cost uses as its memcpy entry.
   The contents match btver1_memcpy.
   NOTE(review): each {N, alg, flag} entry appears to choose ALG for blocks of
   up to N bytes, with -1 as the catch-all, and the two array elements are
   presumably the 32-bit and 64-bit variants; confirm against the
   stringop_algs declaration. */
2616 static stringop_algs btver2_memcpy[2] = {
2617 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
2618 {-1, rep_prefix_4_byte, false}}},
2619 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
2620 {-1, libcall, false}}}};
/* String-operation strategy table that btver2_cost uses as its memset entry.
   The contents match btver1_memset.
   NOTE(review): each {N, alg, flag} entry appears to choose ALG for blocks of
   up to N bytes, with -1 as the catch-all, and the two array elements are
   presumably the 32-bit and 64-bit variants; confirm against the
   stringop_algs declaration. */
2621 static stringop_algs btver2_memset[2] = {
2622 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
2623 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2624 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
2625 {-1, libcall, false}}}};
/* Operation costs for the btver2 tuning target. The initializers are
   positional: each value is identified only by its trailing comment, so the
   order must stay in step with the field order of struct processor_costs. */
2626 const struct processor_costs btver2_cost = {
2628 /* Start of register allocator costs. integer->integer move cost is 2. */
2629 8, /* cost for loading QImode using movzbl */
2630 {8, 8, 6}, /* cost of loading integer registers
2631 in QImode, HImode and SImode.
2632 Relative to reg-reg move (2). */
2633 {8, 8, 6}, /* cost of storing integer registers */
2634 4, /* cost of reg,reg fld/fst */
2635 {12, 12, 28}, /* cost of loading fp registers
2636 in SFmode, DFmode and XFmode */
2637 {12, 12, 38}, /* cost of storing fp registers
2638 in SFmode, DFmode and XFmode */
2639 4, /* cost of moving MMX register */
2640 {10, 10}, /* cost of loading MMX registers
2641 in SImode and DImode */
2642 {12, 12}, /* cost of storing MMX registers
2643 in SImode and DImode */
2644 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2645 {10, 10, 12, 48, 96}, /* cost of loading SSE registers
2646 in 32,64,128,256 and 512-bit */
2647 {10, 10, 12, 48, 96}, /* cost of storing SSE registers
2648 in 32,64,128,256 and 512-bit */
2649 14, 14, /* SSE->integer and integer->SSE moves */
2650 14, 14, /* mask->integer and integer->mask moves */
2651 {8, 8, 6}, /* cost of loading mask register
2652 in QImode, HImode, SImode. */
2653 {8, 8, 6}, /* cost of storing mask register
2654 in QImode, HImode, SImode. */
2655 2, /* cost of moving mask register. */
2656 /* End of register allocator costs. */
2659 COSTS_N_INSNS (1), /* cost of an add instruction */
2660 COSTS_N_INSNS (2), /* cost of a lea instruction */
2661 COSTS_N_INSNS (1), /* variable shift costs */
2662 COSTS_N_INSNS (1), /* constant shift costs */
2663 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2664 COSTS_N_INSNS (4), /* HI */
2665 COSTS_N_INSNS (3), /* SI */
2666 COSTS_N_INSNS (4), /* DI */
2667 COSTS_N_INSNS (5)}, /* other */
2668 0, /* cost of multiply per each bit set */
2669 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
2670 COSTS_N_INSNS (35), /* HI */
2671 COSTS_N_INSNS (51), /* SI */
2672 COSTS_N_INSNS (83), /* DI */
2673 COSTS_N_INSNS (83)}, /* other */
2674 COSTS_N_INSNS (1), /* cost of movsx */
2675 COSTS_N_INSNS (1), /* cost of movzx */
2676 8, /* "large" insn */
2677 9, /* MOVE_RATIO */
2678 6, /* CLEAR_RATIO */
2679 {8, 8, 6}, /* cost of loading integer registers
2680 in QImode, HImode and SImode.
2681 Relative to reg-reg move (2). */
2682 {8, 8, 6}, /* cost of storing integer registers */
2683 {10, 10, 12, 48, 96}, /* cost of loading SSE register
2684 in 32bit, 64bit, 128bit, 256bit and 512bit */
2685 {10, 10, 12, 48, 96}, /* cost of storing SSE register
2686 in 32bit, 64bit, 128bit, 256bit and 512bit */
2687 {10, 10, 12, 48, 96}, /* cost of unaligned loads. */
2688 {10, 10, 12, 48, 96}, /* cost of unaligned stores. */
2689 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2690 14, /* cost of moving SSE register to integer. */
2691 10, 10, /* Gather load static, per_elt. */
2692 10, 10, /* Gather store static, per_elt. */
2693 32, /* size of l1 cache. */
2694 2048, /* size of l2 cache. */
2695 64, /* size of prefetch block */
2696 100, /* number of parallel prefetches */
2697 2, /* Branch cost */
2698 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
2699 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
2700 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
2701 COSTS_N_INSNS (2), /* cost of FABS instruction. */
2702 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
2703 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
2705 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2706 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2707 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
2708 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
2709 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2710 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2711 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2712 COSTS_N_INSNS (19), /* cost of DIVSD instruction. */
2713 COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */
2714 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
2715 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2716 btver2_memcpy,
2717 btver2_memset,
2718 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
2719 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2720 "16:11:8", /* Loop alignment. */
2721 "16:8:8", /* Jump alignment. */
2722 "0:0:8", /* Label alignment. */
2723 "11", /* Func alignment. */
2724 4, /* Small unroll limit. */
2725 2, /* Small unroll factor. */
/* String-operation strategy table that pentium4_cost uses as its memcpy
   entry. The second element is DUMMY_STRINGOP_ALGS, which expands to an
   unconditional libcall strategy.
   NOTE(review): each {N, alg, flag} entry appears to choose ALG for blocks of
   up to N bytes, with -1 as the catch-all; confirm against the stringop_algs
   declaration. */
2728 static stringop_algs pentium4_memcpy[2] = {
2729 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2730 DUMMY_STRINGOP_ALGS};
/* String-operation strategy table that pentium4_cost uses as its memset
   entry. The second element is DUMMY_STRINGOP_ALGS, which expands to an
   unconditional libcall strategy.
   NOTE(review): each {N, alg, flag} entry appears to choose ALG for blocks of
   up to N bytes, with -1 as the catch-all; confirm against the stringop_algs
   declaration. */
2731 static stringop_algs pentium4_memset[2] = {
2732 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
2733 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2734 DUMMY_STRINGOP_ALGS};
/* Operation costs for the pentium4 tuning target. The initializers are
   positional: each value is identified only by its trailing comment, so the
   order must stay in step with the field order of struct processor_costs.
   All four alignment entries are NULL in this table. */
2736 static const
2737 struct processor_costs pentium4_cost = {
2739 /* Start of register allocator costs. integer->integer move cost is 2. */
2740 5, /* cost for loading QImode using movzbl */
2741 {4, 5, 4}, /* cost of loading integer registers
2742 in QImode, HImode and SImode.
2743 Relative to reg-reg move (2). */
2744 {2, 3, 2}, /* cost of storing integer registers */
2745 12, /* cost of reg,reg fld/fst */
2746 {14, 14, 14}, /* cost of loading fp registers
2747 in SFmode, DFmode and XFmode */
2748 {14, 14, 14}, /* cost of storing fp registers
2749 in SFmode, DFmode and XFmode */
2750 12, /* cost of moving MMX register */
2751 {16, 16}, /* cost of loading MMX registers
2752 in SImode and DImode */
2753 {16, 16}, /* cost of storing MMX registers
2754 in SImode and DImode */
2755 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
2756 {16, 16, 16, 32, 64}, /* cost of loading SSE registers
2757 in 32,64,128,256 and 512-bit */
2758 {16, 16, 16, 32, 64}, /* cost of storing SSE registers
2759 in 32,64,128,256 and 512-bit */
2760 20, 12, /* SSE->integer and integer->SSE moves */
2761 20, 12, /* mask->integer and integer->mask moves */
2762 {4, 5, 4}, /* cost of loading mask register
2763 in QImode, HImode, SImode. */
2764 {2, 3, 2}, /* cost of storing mask register
2765 in QImode, HImode, SImode. */
2766 2, /* cost of moving mask register. */
2767 /* End of register allocator costs. */
2770 COSTS_N_INSNS (1), /* cost of an add instruction */
2771 COSTS_N_INSNS (3), /* cost of a lea instruction */
2772 COSTS_N_INSNS (4), /* variable shift costs */
2773 COSTS_N_INSNS (4), /* constant shift costs */
2774 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
2775 COSTS_N_INSNS (15), /* HI */
2776 COSTS_N_INSNS (15), /* SI */
2777 COSTS_N_INSNS (15), /* DI */
2778 COSTS_N_INSNS (15)}, /* other */
2779 0, /* cost of multiply per each bit set */
2780 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
2781 COSTS_N_INSNS (56), /* HI */
2782 COSTS_N_INSNS (56), /* SI */
2783 COSTS_N_INSNS (56), /* DI */
2784 COSTS_N_INSNS (56)}, /* other */
2785 COSTS_N_INSNS (1), /* cost of movsx */
2786 COSTS_N_INSNS (1), /* cost of movzx */
2787 16, /* "large" insn */
2788 6, /* MOVE_RATIO */
2789 6, /* CLEAR_RATIO */
2790 {4, 5, 4}, /* cost of loading integer registers
2791 in QImode, HImode and SImode.
2792 Relative to reg-reg move (2). */
2793 {2, 3, 2}, /* cost of storing integer registers */
2794 {16, 16, 16, 32, 64}, /* cost of loading SSE register
2795 in 32bit, 64bit, 128bit, 256bit and 512bit */
2796 {16, 16, 16, 32, 64}, /* cost of storing SSE register
2797 in 32bit, 64bit, 128bit, 256bit and 512bit */
2798 {32, 32, 32, 64, 128}, /* cost of unaligned loads. */
2799 {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
2800 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
2801 20, /* cost of moving SSE register to integer. */
2802 16, 16, /* Gather load static, per_elt. */
2803 16, 16, /* Gather store static, per_elt. */
2804 8, /* size of l1 cache. */
2805 256, /* size of l2 cache. */
2806 64, /* size of prefetch block */
2807 6, /* number of parallel prefetches */
2808 2, /* Branch cost */
2809 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
2810 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
2811 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
2812 COSTS_N_INSNS (2), /* cost of FABS instruction. */
2813 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
2814 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
2816 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
2817 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
2818 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
2819 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
2820 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2821 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2822 COSTS_N_INSNS (23), /* cost of DIVSS instruction. */
2823 COSTS_N_INSNS (38), /* cost of DIVSD instruction. */
2824 COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */
2825 COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */
2826 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2827 pentium4_memcpy,
2828 pentium4_memset,
2829 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2830 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2831 NULL, /* Loop alignment. */
2832 NULL, /* Jump alignment. */
2833 NULL, /* Label alignment. */
2834 NULL, /* Func alignment. */
2835 4, /* Small unroll limit. */
2836 2, /* Small unroll factor. */
/* String-operation strategy table that nocona_cost uses as its memcpy entry.
   NOTE(review): each {N, alg, flag} entry appears to choose ALG for blocks of
   up to N bytes, with -1 as the catch-all, and the two array elements are
   presumably the 32-bit and 64-bit variants; confirm against the
   stringop_algs declaration. */
2839 static stringop_algs nocona_memcpy[2] = {
2840 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
2841 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
2842 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
/* String-operation strategy table that nocona_cost uses as its memset entry.
   NOTE(review): each {N, alg, flag} entry appears to choose ALG for blocks of
   up to N bytes, with -1 as the catch-all, and the two array elements are
   presumably the 32-bit and 64-bit variants; confirm against the
   stringop_algs declaration. */
2844 static stringop_algs nocona_memset[2] = {
2845 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
2846 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2847 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
2848 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Operation costs for the nocona tuning target. The initializers are
   positional: each value is identified only by its trailing comment, so the
   order must stay in step with the field order of struct processor_costs.
   All four alignment entries are NULL in this table. */
2850 static const
2851 struct processor_costs nocona_cost = {
2853 /* Start of register allocator costs. integer->integer move cost is 2. */
2854 4, /* cost for loading QImode using movzbl */
2855 {4, 4, 4}, /* cost of loading integer registers
2856 in QImode, HImode and SImode.
2857 Relative to reg-reg move (2). */
2858 {4, 4, 4}, /* cost of storing integer registers */
2859 12, /* cost of reg,reg fld/fst */
2860 {14, 14, 14}, /* cost of loading fp registers
2861 in SFmode, DFmode and XFmode */
2862 {14, 14, 14}, /* cost of storing fp registers
2863 in SFmode, DFmode and XFmode */
2864 14, /* cost of moving MMX register */
2865 {12, 12}, /* cost of loading MMX registers
2866 in SImode and DImode */
2867 {12, 12}, /* cost of storing MMX registers
2868 in SImode and DImode */
2869 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
2870 {12, 12, 12, 24, 48}, /* cost of loading SSE registers
2871 in 32,64,128,256 and 512-bit */
2872 {12, 12, 12, 24, 48}, /* cost of storing SSE registers
2873 in 32,64,128,256 and 512-bit */
2874 20, 12, /* SSE->integer and integer->SSE moves */
2875 20, 12, /* mask->integer and integer->mask moves */
2876 {4, 4, 4}, /* cost of loading mask register
2877 in QImode, HImode, SImode. */
2878 {4, 4, 4}, /* cost of storing mask register
2879 in QImode, HImode, SImode. */
2880 2, /* cost of moving mask register. */
2881 /* End of register allocator costs. */
2884 COSTS_N_INSNS (1), /* cost of an add instruction */
2885 COSTS_N_INSNS (1), /* cost of a lea instruction */
2886 COSTS_N_INSNS (1), /* variable shift costs */
2887 COSTS_N_INSNS (1), /* constant shift costs */
2888 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
2889 COSTS_N_INSNS (10), /* HI */
2890 COSTS_N_INSNS (10), /* SI */
2891 COSTS_N_INSNS (10), /* DI */
2892 COSTS_N_INSNS (10)}, /* other */
2893 0, /* cost of multiply per each bit set */
2894 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
2895 COSTS_N_INSNS (66), /* HI */
2896 COSTS_N_INSNS (66), /* SI */
2897 COSTS_N_INSNS (66), /* DI */
2898 COSTS_N_INSNS (66)}, /* other */
2899 COSTS_N_INSNS (1), /* cost of movsx */
2900 COSTS_N_INSNS (1), /* cost of movzx */
2901 16, /* "large" insn */
2902 17, /* MOVE_RATIO */
2903 6, /* CLEAR_RATIO */
2904 {4, 4, 4}, /* cost of loading integer registers
2905 in QImode, HImode and SImode.
2906 Relative to reg-reg move (2). */
2907 {4, 4, 4}, /* cost of storing integer registers */
2908 {12, 12, 12, 24, 48}, /* cost of loading SSE register
2909 in 32bit, 64bit, 128bit, 256bit and 512bit */
2910 {12, 12, 12, 24, 48}, /* cost of storing SSE register
2911 in 32bit, 64bit, 128bit, 256bit and 512bit */
2912 {24, 24, 24, 48, 96}, /* cost of unaligned loads. */
2913 {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
2914 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
2915 20, /* cost of moving SSE register to integer. */
2916 12, 12, /* Gather load static, per_elt. */
2917 12, 12, /* Gather store static, per_elt. */
2918 8, /* size of l1 cache. */
2919 1024, /* size of l2 cache. */
2920 64, /* size of prefetch block */
2921 8, /* number of parallel prefetches */
2922 1, /* Branch cost */
2923 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
2924 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2925 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
2926 COSTS_N_INSNS (3), /* cost of FABS instruction. */
2927 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
2928 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
2930 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
2931 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
2932 COSTS_N_INSNS (7), /* cost of MULSS instruction. */
2933 COSTS_N_INSNS (7), /* cost of MULSD instruction. */
2934 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
2935 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
2936 COSTS_N_INSNS (32), /* cost of DIVSS instruction. */
2937 COSTS_N_INSNS (40), /* cost of DIVSD instruction. */
2938 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
2939 COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */
2940 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2941 nocona_memcpy,
2942 nocona_memset,
2943 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2944 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2945 NULL, /* Loop alignment. */
2946 NULL, /* Jump alignment. */
2947 NULL, /* Label alignment. */
2948 NULL, /* Func alignment. */
2949 4, /* Small unroll limit. */
2950 2, /* Small unroll factor. */
/* String-operation strategy table for memcpy on the atom tuning target
   (going by the name; the table that consumes it is atom_cost).
   NOTE(review): each {N, alg, flag} entry appears to choose ALG for blocks of
   up to N bytes, with -1 as the catch-all, and the two array elements are
   presumably the 32-bit and 64-bit variants; confirm against the
   stringop_algs declaration. */
2953 static stringop_algs atom_memcpy[2] = {
2954 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2955 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2956 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* String-operation strategy table that the atom_cost initializer references
   for memset.  It has two entries; each names a leading algorithm followed
   by a list of {size, algorithm, flag} triples.
   NOTE(review): presumably entry [0] applies to 32-bit and entry [1] to
   64-bit code, and a size of -1 means "no upper bound" -- confirm against
   the stringop_algs definition.  */
2957 static stringop_algs atom_memset[2] = {
2958 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2959 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2960 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2961 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table atom_cost: a positional initializer for struct processor_costs.
   Every value is annotated with the field it fills; the string-operation
   strategies come from atom_memcpy and atom_memset.
   NOTE(review): presumably selected when tuning for Intel Atom -- confirm
   against the users of this table.  */
2962 static const
2963 struct processor_costs atom_cost = {
2965 /* Start of register allocator costs. integer->integer move cost is 2. */
2966 6, /* cost for loading QImode using movzbl */
2967 {6, 6, 6}, /* cost of loading integer registers
2968 in QImode, HImode and SImode.
2969 Relative to reg-reg move (2). */
2970 {6, 6, 6}, /* cost of storing integer registers */
2971 4, /* cost of reg,reg fld/fst */
2972 {6, 6, 18}, /* cost of loading fp registers
2973 in SFmode, DFmode and XFmode */
2974 {14, 14, 24}, /* cost of storing fp registers
2975 in SFmode, DFmode and XFmode */
2976 2, /* cost of moving MMX register */
2977 {8, 8}, /* cost of loading MMX registers
2978 in SImode and DImode */
2979 {10, 10}, /* cost of storing MMX registers
2980 in SImode and DImode */
2981 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2982 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
2983 in 32,64,128,256 and 512-bit */
2984 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
2985 in 32,64,128,256 and 512-bit */
2986 8, 6, /* SSE->integer and integer->SSE moves */
2987 8, 6, /* mask->integer and integer->mask moves */
2988 {6, 6, 6}, /* cost of loading mask register
2989 in QImode, HImode, SImode. */
2990 {6, 6, 6}, /* cost of storing mask register
2991 in QImode, HImode, SImode. */
2992 2, /* cost of moving mask register. */
2993 /* End of register allocator costs. */
2996 COSTS_N_INSNS (1), /* cost of an add instruction */
2997 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2998 COSTS_N_INSNS (1), /* variable shift costs */
2999 COSTS_N_INSNS (1), /* constant shift costs */
3000 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3001 COSTS_N_INSNS (4), /* HI */
3002 COSTS_N_INSNS (3), /* SI */
3003 COSTS_N_INSNS (4), /* DI */
3004 COSTS_N_INSNS (2)}, /* other */
3005 0, /* cost of multiply per each bit set */
3006 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
3007 COSTS_N_INSNS (26), /* HI */
3008 COSTS_N_INSNS (42), /* SI */
3009 COSTS_N_INSNS (74), /* DI */
3010 COSTS_N_INSNS (74)}, /* other */
3011 COSTS_N_INSNS (1), /* cost of movsx */
3012 COSTS_N_INSNS (1), /* cost of movzx */
3013 8, /* "large" insn */
3014 17, /* MOVE_RATIO */
3015 6, /* CLEAR_RATIO */
3016 {6, 6, 6}, /* cost of loading integer registers
3017 in QImode, HImode and SImode.
3018 Relative to reg-reg move (2). */
3019 {6, 6, 6}, /* cost of storing integer registers */
3020 {8, 8, 8, 16, 32}, /* cost of loading SSE register
3021 in 32bit, 64bit, 128bit, 256bit and 512bit */
3022 {8, 8, 8, 16, 32}, /* cost of storing SSE register
3023 in 32bit, 64bit, 128bit, 256bit and 512bit */
3024 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
3025 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
3026 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3027 8, /* cost of moving SSE register to integer. */
3028 8, 8, /* Gather load static, per_elt. */
3029 8, 8, /* Gather store static, per_elt. */
3030 32, /* size of l1 cache. */
3031 256, /* size of l2 cache. */
3032 64, /* size of prefetch block */
3033 6, /* number of parallel prefetches */
3034 3, /* Branch cost */
3035 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
3036 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
3037 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
3038 COSTS_N_INSNS (8), /* cost of FABS instruction. */
3039 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
3040 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
3042 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3043 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
3044 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
3045 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
3046 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
3047 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
3048 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
3049 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
3050 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
3051 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
3052 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
3053 atom_memcpy,
3054 atom_memset,
3055 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3056 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
3057 "16", /* Loop alignment. */
3058 "16:8:8", /* Jump alignment. */
3059 "0:0:8", /* Label alignment. */
3060 "16", /* Func alignment. */
3061 4, /* Small unroll limit. */
3062 2, /* Small unroll factor. */
/* String-operation strategy table that the slm_cost initializer references
   for memcpy.  It has two entries; each names a leading algorithm followed
   by a list of {size, algorithm, flag} triples.
   NOTE(review): presumably entry [0] applies to 32-bit and entry [1] to
   64-bit code, and a size of -1 means "no upper bound" -- confirm against
   the stringop_algs definition.  */
3065 static stringop_algs slm_memcpy[2] = {
3066 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
3067 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
3068 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* String-operation strategy table that the slm_cost initializer references
   for memset.  It has two entries; each names a leading algorithm followed
   by a list of {size, algorithm, flag} triples.
   NOTE(review): presumably entry [0] applies to 32-bit and entry [1] to
   64-bit code, and a size of -1 means "no upper bound" -- confirm against
   the stringop_algs definition.  */
3069 static stringop_algs slm_memset[2] = {
3070 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
3071 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
3072 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
3073 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table slm_cost: a positional initializer for struct processor_costs.
   Every value is annotated with the field it fills; the string-operation
   strategies come from slm_memcpy and slm_memset.
   NOTE(review): "slm" presumably stands for Intel Silvermont -- confirm
   against the users of this table.  */
3074 static const
3075 struct processor_costs slm_cost = {
3077 /* Start of register allocator costs. integer->integer move cost is 2. */
3078 8, /* cost for loading QImode using movzbl */
3079 {8, 8, 8}, /* cost of loading integer registers
3080 in QImode, HImode and SImode.
3081 Relative to reg-reg move (2). */
3082 {6, 6, 6}, /* cost of storing integer registers */
3083 2, /* cost of reg,reg fld/fst */
3084 {8, 8, 18}, /* cost of loading fp registers
3085 in SFmode, DFmode and XFmode */
3086 {6, 6, 18}, /* cost of storing fp registers
3087 in SFmode, DFmode and XFmode */
3088 2, /* cost of moving MMX register */
3089 {8, 8}, /* cost of loading MMX registers
3090 in SImode and DImode */
3091 {6, 6}, /* cost of storing MMX registers
3092 in SImode and DImode */
3093 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3094 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
3095 in 32,64,128,256 and 512-bit */
3096 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
3097 in 32,64,128,256 and 512-bit */
3098 8, 6, /* SSE->integer and integer->SSE moves */
3099 8, 6, /* mask->integer and integer->mask moves */
3100 {8, 8, 8}, /* cost of loading mask register
3101 in QImode, HImode, SImode. */
3102 {6, 6, 6}, /* cost of storing mask register
3103 in QImode, HImode, SImode. */
3104 2, /* cost of moving mask register. */
3105 /* End of register allocator costs. */
3108 COSTS_N_INSNS (1), /* cost of an add instruction */
3109 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3110 COSTS_N_INSNS (1), /* variable shift costs */
3111 COSTS_N_INSNS (1), /* constant shift costs */
3112 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3113 COSTS_N_INSNS (3), /* HI */
3114 COSTS_N_INSNS (3), /* SI */
3115 COSTS_N_INSNS (4), /* DI */
3116 COSTS_N_INSNS (2)}, /* other */
3117 0, /* cost of multiply per each bit set */
3118 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
3119 COSTS_N_INSNS (26), /* HI */
3120 COSTS_N_INSNS (42), /* SI */
3121 COSTS_N_INSNS (74), /* DI */
3122 COSTS_N_INSNS (74)}, /* other */
3123 COSTS_N_INSNS (1), /* cost of movsx */
3124 COSTS_N_INSNS (1), /* cost of movzx */
3125 8, /* "large" insn */
3126 17, /* MOVE_RATIO */
3127 6, /* CLEAR_RATIO */
3128 {8, 8, 8}, /* cost of loading integer registers
3129 in QImode, HImode and SImode.
3130 Relative to reg-reg move (2). */
3131 {6, 6, 6}, /* cost of storing integer registers */
3132 {8, 8, 8, 16, 32}, /* cost of loading SSE register
3133 in 32bit, 64bit, 128bit, 256bit and 512bit */
3134 {8, 8, 8, 16, 32}, /* cost of storing SSE register
3135 in 32bit, 64bit, 128bit, 256bit and 512bit */
3136 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
3137 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
3138 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
3139 8, /* cost of moving SSE register to integer. */
3140 8, 8, /* Gather load static, per_elt. */
3141 8, 8, /* Gather store static, per_elt. */
3142 32, /* size of l1 cache. */
3143 256, /* size of l2 cache. */
3144 64, /* size of prefetch block */
3145 6, /* number of parallel prefetches */
3146 3, /* Branch cost */
3147 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
3148 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
3149 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
3150 COSTS_N_INSNS (8), /* cost of FABS instruction. */
3151 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
3152 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
3154 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3155 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3156 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
3157 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
3158 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
3159 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
3160 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
3161 COSTS_N_INSNS (69), /* cost of DIVSD instruction. */
3162 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
3163 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
3164 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
3165 slm_memcpy,
3166 slm_memset,
3167 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3168 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
3169 "16", /* Loop alignment. */
3170 "16:8:8", /* Jump alignment. */
3171 "0:0:8", /* Label alignment. */
3172 "16", /* Func alignment. */
3173 4, /* Small unroll limit. */
3174 2, /* Small unroll factor. */
/* String-operation strategy table that the tremont_cost initializer
   references for memcpy.  It has two entries with identical contents; each
   names a leading algorithm followed by a list of {size, algorithm, flag}
   triples.
   NOTE(review): presumably entry [0] applies to 32-bit and entry [1] to
   64-bit code, and a size of -1 means "no upper bound" -- confirm against
   the stringop_algs definition.
   NOTE(review): the second triple repeats the 256 bound of the first;
   confirm under which conditions the loop entry is selected.  */
3177 static stringop_algs tremont_memcpy[2] = {
3178 {libcall,
3179 {{256, rep_prefix_1_byte, true},
3180 {256, loop, false},
3181 {-1, libcall, false}}},
3182 {libcall,
3183 {{256, rep_prefix_1_byte, true},
3184 {256, loop, false},
3185 {-1, libcall, false}}}};
/* String-operation strategy table that the tremont_cost initializer
   references for memset.  It has two entries with identical contents; each
   names a leading algorithm followed by a list of {size, algorithm, flag}
   triples.
   NOTE(review): presumably entry [0] applies to 32-bit and entry [1] to
   64-bit code, and a size of -1 means "no upper bound" -- confirm against
   the stringop_algs definition.
   NOTE(review): the second triple repeats the 256 bound of the first;
   confirm under which conditions the loop entry is selected.  */
3186 static stringop_algs tremont_memset[2] = {
3187 {libcall,
3188 {{256, rep_prefix_1_byte, true},
3189 {256, loop, false},
3190 {-1, libcall, false}}},
3191 {libcall,
3192 {{256, rep_prefix_1_byte, true},
3193 {256, loop, false},
3194 {-1, libcall, false}}}};
/* Cost table tremont_cost: a positional initializer for struct
   processor_costs.  Every value is annotated with the field it fills; the
   string-operation strategies come from tremont_memcpy and tremont_memset.
   NOTE(review): presumably selected when tuning for Intel Tremont --
   confirm against the users of this table.  */
3195 static const
3196 struct processor_costs tremont_cost = {
3198 /* Start of register allocator costs. integer->integer move cost is 2. */
3199 6, /* cost for loading QImode using movzbl */
3200 {6, 6, 6}, /* cost of loading integer registers
3201 in QImode, HImode and SImode.
3202 Relative to reg-reg move (2). */
3203 {6, 6, 6}, /* cost of storing integer registers */
3204 4, /* cost of reg,reg fld/fst */
3205 {6, 6, 12}, /* cost of loading fp registers
3206 in SFmode, DFmode and XFmode */
3207 {6, 6, 12}, /* cost of storing fp registers
3208 in SFmode, DFmode and XFmode */
3209 2, /* cost of moving MMX register */
3210 {6, 6}, /* cost of loading MMX registers
3211 in SImode and DImode */
3212 {6, 6}, /* cost of storing MMX registers
3213 in SImode and DImode */
3214 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
3215 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
3216 in 32,64,128,256 and 512-bit */
3217 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
3218 in 32,64,128,256 and 512-bit */
3219 6, 6, /* SSE->integer and integer->SSE moves */
3220 6, 6, /* mask->integer and integer->mask moves */
3221 {6, 6, 6}, /* cost of loading mask register
3222 in QImode, HImode, SImode. */
3223 {6, 6, 6}, /* cost of storing mask register
3224 in QImode, HImode, SImode. */
3225 2, /* cost of moving mask register. */
3226 /* End of register allocator costs. */
3229 COSTS_N_INSNS (1), /* cost of an add instruction */
3230 /* Setting cost to 2 makes our current implementation of synth_mult result in
3231 use of unnecessary temporary registers causing regression on several
3232 SPECfp benchmarks. */
3233 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3234 COSTS_N_INSNS (1), /* variable shift costs */
3235 COSTS_N_INSNS (1), /* constant shift costs */
3236 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3237 COSTS_N_INSNS (3), /* HI */
3238 COSTS_N_INSNS (3), /* SI */
3239 COSTS_N_INSNS (3), /* DI */
3240 COSTS_N_INSNS (4)}, /* other */
3241 0, /* cost of multiply per each bit set */
3242 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
3243 COSTS_N_INSNS (22), /* HI */
3244 COSTS_N_INSNS (30), /* SI */
3245 COSTS_N_INSNS (74), /* DI */
3246 COSTS_N_INSNS (74)}, /* other */
3247 COSTS_N_INSNS (1), /* cost of movsx */
3248 COSTS_N_INSNS (1), /* cost of movzx */
3249 8, /* "large" insn */
3250 17, /* MOVE_RATIO */
3251 17, /* CLEAR_RATIO */
3252 {6, 6, 6}, /* cost of loading integer registers
3253 in QImode, HImode and SImode.
3254 Relative to reg-reg move (2). */
3255 {6, 6, 6}, /* cost of storing integer registers */
3256 {6, 6, 6, 10, 15}, /* cost of loading SSE register
3257 in 32bit, 64bit, 128bit, 256bit and 512bit */
3258 {6, 6, 6, 10, 15}, /* cost of storing SSE register
3259 in 32bit, 64bit, 128bit, 256bit and 512bit */
3260 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
3261 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
3262 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
3263 6, /* cost of moving SSE register to integer. */
3264 18, 6, /* Gather load static, per_elt. */
3265 18, 6, /* Gather store static, per_elt. */
3266 32, /* size of l1 cache. */
3267 512, /* size of l2 cache. */
3268 64, /* size of prefetch block */
3269 6, /* number of parallel prefetches */
3270 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
3271 value is increased to the perhaps more appropriate value of 5.
NOTE(review): this remark names K8 and looks carried over from another
table -- confirm that it applies to this one. */
3272 3, /* Branch cost */
3273 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3274 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
3275 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
3276 COSTS_N_INSNS (1), /* cost of FABS instruction. */
3277 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
3278 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
3280 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3281 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3282 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
3283 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
3284 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
3285 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
3286 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
3287 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
3288 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
3289 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
3290 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
3291 tremont_memcpy,
3292 tremont_memset,
3293 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
3294 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
3295 "16:11:8", /* Loop alignment. */
3296 "16:11:8", /* Jump alignment. */
3297 "0:0:8", /* Label alignment. */
3298 "16", /* Func alignment. */
3299 4, /* Small unroll limit. */
3300 2, /* Small unroll factor. */
/* String-operation strategy table that the intel_cost initializer references
   for memcpy.  It has two entries; each names a leading algorithm followed
   by a list of {size, algorithm, flag} triples.
   NOTE(review): presumably entry [0] applies to 32-bit and entry [1] to
   64-bit code, and a size of -1 means "no upper bound" -- confirm against
   the stringop_algs definition.  */
3303 static stringop_algs intel_memcpy[2] = {
3304 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
3305 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
3306 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* String-operation strategy table that the intel_cost initializer references
   for memset.  It has two entries; each names a leading algorithm followed
   by a list of {size, algorithm, flag} triples.
   NOTE(review): presumably entry [0] applies to 32-bit and entry [1] to
   64-bit code, and a size of -1 means "no upper bound" -- confirm against
   the stringop_algs definition.  */
3307 static stringop_algs intel_memset[2] = {
3308 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
3309 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
3310 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
3311 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table intel_cost: a positional initializer for struct processor_costs.
   Every value is annotated with the field it fills; the string-operation
   strategies come from intel_memcpy and intel_memset.
   NOTE(review): presumably the table behind a generic Intel tuning --
   confirm against the users of this table.  */
3312 static const
3313 struct processor_costs intel_cost = {
3315 /* Start of register allocator costs. integer->integer move cost is 2. */
3316 6, /* cost for loading QImode using movzbl */
3317 {4, 4, 4}, /* cost of loading integer registers
3318 in QImode, HImode and SImode.
3319 Relative to reg-reg move (2). */
3320 {6, 6, 6}, /* cost of storing integer registers */
3321 2, /* cost of reg,reg fld/fst */
3322 {6, 6, 8}, /* cost of loading fp registers
3323 in SFmode, DFmode and XFmode */
3324 {6, 6, 10}, /* cost of storing fp registers
3325 in SFmode, DFmode and XFmode */
3326 2, /* cost of moving MMX register */
3327 {6, 6}, /* cost of loading MMX registers
3328 in SImode and DImode */
3329 {6, 6}, /* cost of storing MMX registers
3330 in SImode and DImode */
3331 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
3332 {6, 6, 6, 6, 6}, /* cost of loading SSE registers
3333 in 32,64,128,256 and 512-bit */
3334 {6, 6, 6, 6, 6}, /* cost of storing SSE registers
3335 in 32,64,128,256 and 512-bit */
3336 4, 4, /* SSE->integer and integer->SSE moves */
3337 4, 4, /* mask->integer and integer->mask moves */
3338 {4, 4, 4}, /* cost of loading mask register
3339 in QImode, HImode, SImode. */
3340 {6, 6, 6}, /* cost of storing mask register
3341 in QImode, HImode, SImode. */
3342 2, /* cost of moving mask register. */
3343 /* End of register allocator costs. */
3346 COSTS_N_INSNS (1), /* cost of an add instruction */
3347 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3348 COSTS_N_INSNS (1), /* variable shift costs */
3349 COSTS_N_INSNS (1), /* constant shift costs */
3350 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3351 COSTS_N_INSNS (3), /* HI */
3352 COSTS_N_INSNS (3), /* SI */
3353 COSTS_N_INSNS (4), /* DI */
3354 COSTS_N_INSNS (2)}, /* other */
3355 0, /* cost of multiply per each bit set */
3356 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
3357 COSTS_N_INSNS (26), /* HI */
3358 COSTS_N_INSNS (42), /* SI */
3359 COSTS_N_INSNS (74), /* DI */
3360 COSTS_N_INSNS (74)}, /* other */
3361 COSTS_N_INSNS (1), /* cost of movsx */
3362 COSTS_N_INSNS (1), /* cost of movzx */
3363 8, /* "large" insn */
3364 17, /* MOVE_RATIO */
3365 6, /* CLEAR_RATIO */
3366 {4, 4, 4}, /* cost of loading integer registers
3367 in QImode, HImode and SImode.
3368 Relative to reg-reg move (2). */
3369 {6, 6, 6}, /* cost of storing integer registers */
3370 {6, 6, 6, 6, 6}, /* cost of loading SSE register
3371 in 32bit, 64bit, 128bit, 256bit and 512bit */
3372 {6, 6, 6, 6, 6}, /* cost of storing SSE register
3373 in 32bit, 64bit, 128bit, 256bit and 512bit */
3374 {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
3375 {10, 10, 10, 10, 10}, /* cost of unaligned stores. */
3376 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
3377 4, /* cost of moving SSE register to integer. */
3378 6, 6, /* Gather load static, per_elt. */
3379 6, 6, /* Gather store static, per_elt. */
3380 32, /* size of l1 cache. */
3381 256, /* size of l2 cache. */
3382 64, /* size of prefetch block */
3383 6, /* number of parallel prefetches */
3384 3, /* Branch cost */
3385 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
3386 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
3387 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
3388 COSTS_N_INSNS (8), /* cost of FABS instruction. */
3389 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
3390 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
3392 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3393 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
3394 COSTS_N_INSNS (8), /* cost of MULSS instruction. */
3395 COSTS_N_INSNS (8), /* cost of MULSD instruction. */
3396 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
3397 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
3398 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */
3399 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
3400 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */
3401 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */
3402 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
3403 intel_memcpy,
3404 intel_memset,
3405 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3406 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
3407 "16", /* Loop alignment. */
3408 "16:8:8", /* Jump alignment. */
3409 "0:0:8", /* Label alignment. */
3410 "16", /* Func alignment. */
3411 4, /* Small unroll limit. */
3412 2, /* Small unroll factor. */
3415 /* lujiazui_cost should produce code tuned for ZHAOXIN lujiazui CPU. */
/* String-operation strategy table that the lujiazui_cost initializer
   references for memcpy.  It has two entries; each names a leading
   algorithm followed by a list of {size, algorithm, flag} triples.
   NOTE(review): presumably entry [0] applies to 32-bit and entry [1] to
   64-bit code, and a size of -1 means "no upper bound" -- confirm against
   the stringop_algs definition.  */
3416 static stringop_algs lujiazui_memcpy[2] = {
3417 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3418 {-1, libcall, false}}},
3419 {libcall, {{12, unrolled_loop, true}, {32, loop, false},
3420 {6144, rep_prefix_8_byte, false},
3421 {-1, libcall, false}}}};
/* String-operation strategy table that the lujiazui_cost initializer
   references for memset.  It has two entries; each names a leading
   algorithm followed by a list of {size, algorithm, flag} triples.
   NOTE(review): presumably entry [0] applies to 32-bit and entry [1] to
   64-bit code, and a size of -1 means "no upper bound" -- confirm against
   the stringop_algs definition.  */
3422 static stringop_algs lujiazui_memset[2] = {
3423 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3424 {-1, libcall, false}}},
3425 {libcall, {{12, loop, true}, {32, loop, false},
3426 {640, rep_prefix_8_byte, false},
3427 {-1, libcall, false}}}};
/* Cost table lujiazui_cost: a positional initializer for struct
   processor_costs, intended for the ZHAOXIN lujiazui CPU.  Every value is
   annotated with the field it fills; the string-operation strategies come
   from lujiazui_memcpy and lujiazui_memset.  */
3428 static const
3429 struct processor_costs lujiazui_cost = {
3431 /* Start of register allocator costs. integer->integer move cost is 2. */
3432 6, /* cost for loading QImode using movzbl. */
3433 {6, 6, 6}, /* cost of loading integer registers
3434 in QImode, HImode and SImode.
3435 Relative to reg-reg move (2). */
3436 {6, 6, 6}, /* cost of storing integer registers. */
3437 2, /* cost of reg,reg fld/fst. */
3438 {6, 6, 8}, /* cost of loading fp registers
3439 in SFmode, DFmode and XFmode. */
3440 {6, 6, 8}, /* cost of storing fp registers
3441 in SFmode, DFmode and XFmode. */
3442 2, /* cost of moving MMX register. */
3443 {6, 6}, /* cost of loading MMX registers
3444 in SImode and DImode. */
3445 {6, 6}, /* cost of storing MMX registers
3446 in SImode and DImode. */
3447 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3448 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
3449 in 32,64,128,256 and 512-bit. */
3450 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
3451 in 32,64,128,256 and 512-bit. */
3452 6, 6, /* SSE->integer and integer->SSE moves. */
3453 6, 6, /* mask->integer and integer->mask moves. */
3454 {6, 6, 6}, /* cost of loading mask register
3455 in QImode, HImode, SImode. */
3456 {6, 6, 6}, /* cost of storing mask register
3457 in QImode, HImode, SImode. */
3458 2, /* cost of moving mask register. */
3459 /* End of register allocator costs. */
3462 COSTS_N_INSNS (1), /* cost of an add instruction. */
3463 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction. */
3464 COSTS_N_INSNS (1), /* variable shift costs. */
3465 COSTS_N_INSNS (1), /* constant shift costs. */
3466 {COSTS_N_INSNS (2), /* cost of starting multiply for QI. */
3467 COSTS_N_INSNS (3), /* HI. */
3468 COSTS_N_INSNS (3), /* SI. */
3469 COSTS_N_INSNS (12), /* DI. */
3470 COSTS_N_INSNS (14)}, /* other. */
3471 0, /* cost of multiply per each bit set. */
3472 {COSTS_N_INSNS (22), /* cost of a divide/mod for QI. */
3473 COSTS_N_INSNS (24), /* HI. */
3474 COSTS_N_INSNS (24), /* SI. */
3475 COSTS_N_INSNS (150), /* DI. */
3476 COSTS_N_INSNS (152)}, /* other. */
3477 COSTS_N_INSNS (1), /* cost of movsx. */
3478 COSTS_N_INSNS (1), /* cost of movzx. */
3479 8, /* "large" insn. */
3480 17, /* MOVE_RATIO. */
3481 6, /* CLEAR_RATIO. */
3482 {6, 6, 6}, /* cost of loading integer registers
3483 in QImode, HImode and SImode.
3484 Relative to reg-reg move (2). */
3485 {6, 6, 6}, /* cost of storing integer registers. */
3486 {6, 6, 6, 10, 15}, /* cost of loading SSE register
3487 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3488 {6, 6, 6, 10, 15}, /* cost of storing SSE register
3489 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3490 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
3491 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
3492 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3493 6, /* cost of moving SSE register to integer. */
3494 18, 6, /* Gather load static, per_elt. */
3495 18, 6, /* Gather store static, per_elt. */
3496 32, /* size of l1 cache. */
3497 4096, /* size of l2 cache. */
3498 64, /* size of prefetch block. */
3499 /* The Lujiazui processor never drops prefetches, like AMD processors. */
3500 100, /* number of parallel prefetches. */
3501 3, /* Branch cost. */
3502 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3503 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
3504 COSTS_N_INSNS (22), /* cost of FDIV instruction. */
3505 COSTS_N_INSNS (1), /* cost of FABS instruction. */
3506 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
3507 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
3509 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3510 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3511 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
3512 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
3513 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
3514 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
3515 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
3516 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
3517 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
3518 COSTS_N_INSNS (60), /* cost of SQRTSD instruction. */
3519 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
3520 lujiazui_memcpy,
3521 lujiazui_memset,
3522 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
3523 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
3524 "16:11:8", /* Loop alignment. */
3525 "16:11:8", /* Jump alignment. */
3526 "0:0:8", /* Label alignment. */
3527 "16", /* Func alignment. */
3528 4, /* Small unroll limit. */
3529 2, /* Small unroll factor. */
3532 /* yongfeng_cost should produce code tuned for ZHAOXIN yongfeng CPU. */
/* String-operation strategy table that the yongfeng_cost initializer
   references for memcpy.  It has two entries; each names a leading
   algorithm followed by a list of {size, algorithm, flag} triples.
   NOTE(review): presumably entry [0] applies to 32-bit and entry [1] to
   64-bit code, and a size of -1 means "no upper bound" -- confirm against
   the stringop_algs definition.  */
3533 static stringop_algs yongfeng_memcpy[2] = {
3534 {libcall, {{6, unrolled_loop, true}, {256, unrolled_loop, false},
3535 {-1, libcall, false}}},
3536 {libcall, {{8, loop, false}, {512, unrolled_loop, false},
3537 {-1, libcall, false}}}};
/* String-operation strategy table that the yongfeng_cost initializer
   references for memset.  It has two entries; each names a leading
   algorithm followed by a list of {size, algorithm, flag} triples.
   NOTE(review): presumably entry [0] applies to 32-bit and entry [1] to
   64-bit code, and a size of -1 means "no upper bound" -- confirm against
   the stringop_algs definition.  */
3538 static stringop_algs yongfeng_memset[2] = {
3539 {libcall, {{6, loop_1_byte, false}, {128, loop, false},
3540 {-1, libcall, false}}},
3541 {libcall, {{2, rep_prefix_4_byte, false}, {64, loop, false},
3542 {1024, vector_loop, false},
3543 {-1, libcall, false}}}};
/* Cost table yongfeng_cost: a positional initializer for struct
   processor_costs, intended for the ZHAOXIN yongfeng CPU.  Every value is
   annotated with the field it fills; the string-operation strategies come
   from yongfeng_memcpy and yongfeng_memset.  */
3544 static const
3545 struct processor_costs yongfeng_cost = {
3547 /* Start of register allocator costs. integer->integer move cost is 2. */
3548 8, /* cost for loading QImode using movzbl. */
3549 {8, 8, 8}, /* cost of loading integer registers
3550 in QImode, HImode and SImode.
3551 Relative to reg-reg move (2). */
3552 {8, 8, 8}, /* cost of storing integer registers. */
3553 2, /* cost of reg,reg fld/fst. */
3554 {8, 8, 8}, /* cost of loading fp registers
3555 in SFmode, DFmode and XFmode. */
3556 {8, 8, 8}, /* cost of storing fp registers
3557 in SFmode, DFmode and XFmode. */
3558 2, /* cost of moving MMX register. */
3559 {8, 8}, /* cost of loading MMX registers
3560 in SImode and DImode. */
3561 {8, 8}, /* cost of storing MMX registers
3562 in SImode and DImode. */
3563 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3564 {8, 8, 8, 10, 15}, /* cost of loading SSE registers
3565 in 32,64,128,256 and 512-bit. */
3566 {8, 8, 8, 10, 15}, /* cost of storing SSE registers
3567 in 32,64,128,256 and 512-bit. */
3568 8, 8, /* SSE->integer and integer->SSE moves. */
3569 8, 8, /* mask->integer and integer->mask moves. */
3570 {8, 8, 8}, /* cost of loading mask register
3571 in QImode, HImode, SImode. */
3572 {8, 8, 8}, /* cost of storing mask register
3573 in QImode, HImode, SImode. */
3574 2, /* cost of moving mask register. */
3575 /* End of register allocator costs. */
3578 COSTS_N_INSNS (1), /* cost of an add instruction. */
3579 COSTS_N_INSNS (1), /* cost of a lea instruction. */
3580 COSTS_N_INSNS (1), /* variable shift costs. */
3581 COSTS_N_INSNS (1), /* constant shift costs. */
3582 {COSTS_N_INSNS (2), /* cost of starting multiply for QI. */
3583 COSTS_N_INSNS (3), /* HI. */
3584 COSTS_N_INSNS (2), /* SI. */
3585 COSTS_N_INSNS (2), /* DI. */
3586 COSTS_N_INSNS (3)}, /* other. */
3587 0, /* cost of multiply per each bit set. */
3588 {COSTS_N_INSNS (8), /* cost of a divide/mod for QI. */
3589 COSTS_N_INSNS (9), /* HI. */
3590 COSTS_N_INSNS (8), /* SI. */
3591 COSTS_N_INSNS (41), /* DI. */
3592 COSTS_N_INSNS (41)}, /* other. */
3593 COSTS_N_INSNS (1), /* cost of movsx. */
3594 COSTS_N_INSNS (1), /* cost of movzx. */
3595 8, /* "large" insn. */
3596 17, /* MOVE_RATIO. */
3597 6, /* CLEAR_RATIO. */
3598 {8, 8, 8}, /* cost of loading integer registers
3599 in QImode, HImode and SImode.
3600 Relative to reg-reg move (2). */
3601 {8, 8, 8}, /* cost of storing integer registers. */
3602 {8, 8, 8, 12, 15}, /* cost of loading SSE register
3603 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3604 {8, 8, 8, 12, 15}, /* cost of storing SSE register
3605 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3606 {8, 8, 8, 12, 15}, /* cost of unaligned loads. */
3607 {8, 8, 8, 12, 15}, /* cost of unaligned stores. */
3608 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3609 8, /* cost of moving SSE register to integer. */
3610 18, 6, /* Gather load static, per_elt. */
3611 18, 6, /* Gather store static, per_elt. */
3612 32, /* size of l1 cache. */
3613 256, /* size of l2 cache. */
3614 64, /* size of prefetch block. */
3615 12, /* number of parallel prefetches. */
3616 3, /* Branch cost. */
3617 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3618 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
3619 COSTS_N_INSNS (14), /* cost of FDIV instruction. */
3620 COSTS_N_INSNS (1), /* cost of FABS instruction. */
3621 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
3622 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
3624 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3625 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3626 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
3627 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
3628 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
3629 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
3630 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
3631 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
3632 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
3633 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
3634 4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */
3635 yongfeng_memcpy,
3636 yongfeng_memset,
3637 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3638 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
3639 "16:11:8", /* Loop alignment. */
3640 "16:11:8", /* Jump alignment. */
3641 "0:0:8", /* Label alignment. */
3642 "16", /* Func alignment. */
3643 4, /* Small unroll limit. */
3644 2, /* Small unroll factor. */
3647 /* shijidadao_cost should produce code tuned for ZHAOXIN shijidadao CPU. */
/* String-operation strategy table for memcpy on shijidadao.  It has two
   entries; each names a leading algorithm followed by a list of
   {size, algorithm, flag} triples.
   NOTE(review): presumably consumed by the shijidadao_cost initializer,
   with entry [0] for 32-bit and entry [1] for 64-bit code and a size of -1
   meaning "no upper bound" -- confirm against the stringop_algs
   definition.  */
3648 static stringop_algs shijidadao_memcpy[2] = {
3649 {libcall, {{8, unrolled_loop, true}, {256, unrolled_loop, false},
3650 {-1, libcall, false}}},
3651 {libcall, {{10, loop, true}, {256, unrolled_loop, false},
3652 {-1, libcall, false}}}};
/* String-operation strategy table for memset on shijidadao.  It has two
   entries; each names a leading algorithm followed by a list of
   {size, algorithm, flag} triples.
   NOTE(review): presumably consumed by the shijidadao_cost initializer,
   with entry [0] for 32-bit and entry [1] for 64-bit code and a size of -1
   meaning "no upper bound" -- confirm against the stringop_algs
   definition.  */
3653 static stringop_algs shijidadao_memset[2] = {
3654 {libcall, {{4, loop, true}, {128, unrolled_loop, false},
3655 {-1, libcall, false}}},
3656 {libcall, {{1, rep_prefix_4_byte, false}, {14, loop, true},
3657 {1024, vector_loop, false},
3658 {-1, libcall, false}}}};
/* Cost table for the shijidadao tuning.  The entries are positional
   initializers of struct processor_costs, so their order must not change;
   the trailing comment on each entry names the field it fills.  */
3659 static const
3660 struct processor_costs shijidadao_cost = {
3662 /* Start of register allocator costs. integer->integer move cost is 2. */
3663 8, /* cost for loading QImode using movzbl. */
3664 {8, 8, 8}, /* cost of loading integer registers
3665 in QImode, HImode and SImode.
3666 Relative to reg-reg move (2). */
3667 {8, 8, 8}, /* cost of storing integer registers. */
3668 2, /* cost of reg,reg fld/fst. */
3669 {8, 8, 8}, /* cost of loading fp registers
3670 in SFmode, DFmode and XFmode. */
3671 {8, 8, 8}, /* cost of storing fp registers
3672 in SFmode, DFmode and XFmode. */
3673 2, /* cost of moving MMX register. */
3674 {8, 8}, /* cost of loading MMX registers
3675 in SImode and DImode. */
3676 {8, 8}, /* cost of storing MMX registers
3677 in SImode and DImode. */
3678 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3679 {8, 8, 8, 10, 15}, /* cost of loading SSE registers
3680 in 32,64,128,256 and 512-bit. */
3681 {8, 8, 8, 10, 15}, /* cost of storing SSE registers
3682 in 32,64,128,256 and 512-bit. */
3683 8, 8, /* SSE->integer and integer->SSE moves. */
3684 8, 8, /* mask->integer and integer->mask moves. */
3685 {8, 8, 8}, /* cost of loading mask register
3686 in QImode, HImode, SImode. */
3687 {8, 8, 8}, /* cost of storing mask register
3688 in QImode, HImode, SImode. */
3689 2, /* cost of moving mask register. */
3690 /* End of register allocator costs. */
3693 COSTS_N_INSNS (1), /* cost of an add instruction. */
3694 COSTS_N_INSNS (1), /* cost of a lea instruction. */
3695 COSTS_N_INSNS (1), /* variable shift costs. */
3696 COSTS_N_INSNS (1), /* constant shift costs. */
3697 {COSTS_N_INSNS (2), /* cost of starting multiply for QI. */
3698 COSTS_N_INSNS (3), /* HI. */
3699 COSTS_N_INSNS (2), /* SI. */
3700 COSTS_N_INSNS (2), /* DI. */
3701 COSTS_N_INSNS (3)}, /* other. */
3702 0, /* cost of multiply per each bit set. */
3703 {COSTS_N_INSNS (9), /* cost of a divide/mod for QI. */
3704 COSTS_N_INSNS (10), /* HI. */
3705 COSTS_N_INSNS (9), /* SI. */
3706 COSTS_N_INSNS (50), /* DI. */
3707 COSTS_N_INSNS (50)}, /* other. */
3708 COSTS_N_INSNS (1), /* cost of movsx. */
3709 COSTS_N_INSNS (1), /* cost of movzx. */
3710 8, /* "large" insn. */
3711 17, /* MOVE_RATIO. */
3712 6, /* CLEAR_RATIO. */
3713 {8, 8, 8}, /* cost of loading integer registers
3714 in QImode, HImode and SImode.
3715 Relative to reg-reg move (2). */
3716 {8, 8, 8}, /* cost of storing integer registers. */
3717 {8, 8, 8, 12, 15}, /* cost of loading SSE register
3718 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3719 {8, 8, 8, 12, 15}, /* cost of storing SSE register
3720 in 32bit, 64bit, 128bit, 256bit and 512bit. */
3721 {8, 8, 8, 12, 15}, /* cost of unaligned loads. */
3722 {8, 8, 8, 12, 15}, /* cost of unaligned stores. */
3723 2, 3, 4, /* cost of moving XMM,YMM,ZMM register. */
3724 8, /* cost of moving SSE register to integer. */
3725 18, 6, /* Gather load static, per_elt. */
3726 18, 6, /* Gather store static, per_elt. */
3727 32, /* size of l1 cache. */
3728 256, /* size of l2 cache. */
3729 64, /* size of prefetch block. */
3730 12, /* number of parallel prefetches. */
3731 3, /* Branch cost. */
3732 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3733 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
3734 COSTS_N_INSNS (13), /* cost of FDIV instruction. */
3735 COSTS_N_INSNS (2), /* cost of FABS instruction. */
3736 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
3737 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
3739 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3740 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3741 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
3742 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
3743 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
3744 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
3745 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
3746 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
3747 COSTS_N_INSNS (11), /* cost of SQRTSS instruction. */
3748 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
3749 4, 4, 4, 4, /* reassoc int, fp, vec_int, vec_fp. */
3750 shijidadao_memcpy,
3751 shijidadao_memset,
3752 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
3753 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
3754 "16:11:8", /* Loop alignment. */
3755 "16:11:8", /* Jump alignment. */
3756 "0:0:8", /* Label alignment. */
3757 "16", /* Func alignment. */
3758 4, /* Small unroll limit. */
3759 2, /* Small unroll factor. */
3764 /* Generic should produce code tuned for Core-i7 (and newer chips)
3765 and btver1 (and newer chips). */
/* memcpy strategy table referenced by generic_cost.
   NOTE(review): each inner triple appears to be {size bound, algorithm,
   flag} with a -1 bound closing the list; the two outer entries differ only
   in rep_prefix_4_byte vs. rep_prefix_8_byte, so they are presumably the
   32-bit and 64-bit variants -- confirm against the stringop_algs
   declaration.  */
3767 static stringop_algs generic_memcpy[2] = {
3768 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3769 {-1, libcall, false}}},
3770 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
3771 {-1, libcall, false}}}};
/* memset strategy table referenced by generic_cost.
   NOTE(review): each inner triple appears to be {size bound, algorithm,
   flag} with a -1 bound closing the list; the two outer entries differ only
   in rep_prefix_4_byte vs. rep_prefix_8_byte, so they are presumably the
   32-bit and 64-bit variants -- confirm against the stringop_algs
   declaration.  */
3772 static stringop_algs generic_memset[2] = {
3773 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
3774 {-1, libcall, false}}},
3775 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
3776 {-1, libcall, false}}}};
/* Cost table for the generic tuning.  The entries are positional
   initializers of struct processor_costs, so their order must not change;
   the trailing comment on each entry names the field it fills.  */
3777 static const
3778 struct processor_costs generic_cost = {
3780 /* Start of register allocator costs. integer->integer move cost is 2. */
3781 6, /* cost for loading QImode using movzbl */
3782 {6, 6, 6}, /* cost of loading integer registers
3783 in QImode, HImode and SImode.
3784 Relative to reg-reg move (2). */
3785 {6, 6, 6}, /* cost of storing integer registers */
3786 4, /* cost of reg,reg fld/fst */
3787 {6, 6, 12}, /* cost of loading fp registers
3788 in SFmode, DFmode and XFmode */
3789 {6, 6, 12}, /* cost of storing fp registers
3790 in SFmode, DFmode and XFmode */
3791 2, /* cost of moving MMX register */
3792 {6, 6}, /* cost of loading MMX registers
3793 in SImode and DImode */
3794 {6, 6}, /* cost of storing MMX registers
3795 in SImode and DImode */
3796 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
3797 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
3798 in 32,64,128,256 and 512-bit */
3799 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
3800 in 32,64,128,256 and 512-bit */
3801 6, 6, /* SSE->integer and integer->SSE moves */
3802 6, 6, /* mask->integer and integer->mask moves */
3803 {6, 6, 6}, /* cost of loading mask register
3804 in QImode, HImode, SImode. */
3805 {6, 6, 6}, /* cost of storing mask register
3806 in QImode, HImode, SImode. */
3807 2, /* cost of moving mask register. */
3808 /* End of register allocator costs. */
3811 COSTS_N_INSNS (1), /* cost of an add instruction */
3812 /* Setting cost to 2 makes our current implementation of synth_mult result in
3813 use of unnecessary temporary registers causing regression on several
3814 SPECfp benchmarks. */
3815 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3816 COSTS_N_INSNS (1), /* variable shift costs */
3817 COSTS_N_INSNS (1), /* constant shift costs */
3818 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3819 COSTS_N_INSNS (3), /* HI */
3820 COSTS_N_INSNS (3), /* SI */
3821 COSTS_N_INSNS (3), /* DI */
3822 COSTS_N_INSNS (4)}, /* other */
3823 0, /* cost of multiply per each bit set */
3824 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
3825 COSTS_N_INSNS (22), /* HI */
3826 COSTS_N_INSNS (30), /* SI */
3827 COSTS_N_INSNS (74), /* DI */
3828 COSTS_N_INSNS (74)}, /* other */
3829 COSTS_N_INSNS (1), /* cost of movsx */
3830 COSTS_N_INSNS (1), /* cost of movzx */
3831 8, /* "large" insn */
3832 17, /* MOVE_RATIO */
3833 6, /* CLEAR_RATIO */
3834 {6, 6, 6}, /* cost of loading integer registers
3835 in QImode, HImode and SImode.
3836 Relative to reg-reg move (2). */
3837 {6, 6, 6}, /* cost of storing integer registers */
3838 {6, 6, 6, 10, 15}, /* cost of loading SSE register
3839 in 32bit, 64bit, 128bit, 256bit and 512bit */
3840 {6, 6, 6, 10, 15}, /* cost of storing SSE register
3841 in 32bit, 64bit, 128bit, 256bit and 512bit */
3842 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
3843 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
3844 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
3845 6, /* cost of moving SSE register to integer. */
3846 18, 6, /* Gather load static, per_elt. */
3847 18, 6, /* Gather store static, per_elt. */
3848 32, /* size of l1 cache. */
3849 512, /* size of l2 cache. */
3850 64, /* size of prefetch block */
3851 6, /* number of parallel prefetches */
3852 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
3853 value is increased to the perhaps more appropriate value of 5. */
3854 3, /* Branch cost */
3855 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3856 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
3857 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
3858 COSTS_N_INSNS (1), /* cost of FABS instruction. */
3859 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
3860 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
3862 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3863 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3864 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
3865 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
3866 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
3867 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
3868 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
3869 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
3870 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
3871 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
3872 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
3873 generic_memcpy,
3874 generic_memset,
3875 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
3876 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
3877 "16", /* Loop alignment. */
3878 "16:11:8", /* Jump alignment. */
3879 "0:0:8", /* Label alignment. */
3880 "16", /* Func alignment. */
3881 4, /* Small unroll limit. */
3882 2, /* Small unroll factor. */
3885 /* core_cost should produce code tuned for Core family of CPUs. */
/* memcpy strategy table referenced by core_cost.
   NOTE(review): each inner triple appears to be {size bound, algorithm,
   flag} with a -1 bound closing the list; the first outer entry uses
   rep_prefix_4_byte and the second rep_prefix_8_byte, so they are presumably
   the 32-bit and 64-bit variants -- confirm against the stringop_algs
   declaration.  */
3886 static stringop_algs core_memcpy[2] = {
3887 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
3888 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
3889 {-1, libcall, false}}}};
/* memset strategy table referenced by core_cost.
   NOTE(review): each inner triple appears to be {size bound, algorithm,
   flag} with a -1 bound closing the list; the first outer entry uses
   rep_prefix_4_byte and the second rep_prefix_8_byte, so they are presumably
   the 32-bit and 64-bit variants -- confirm against the stringop_algs
   declaration.  */
3890 static stringop_algs core_memset[2] = {
3891 {libcall, {{6, loop_1_byte, true},
3892 {24, loop, true},
3893 {8192, rep_prefix_4_byte, true},
3894 {-1, libcall, false}}},
3895 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
3896 {-1, libcall, false}}}};
/* Cost table for the Core tuning.  The entries are positional
   initializers of struct processor_costs, so their order must not change;
   the trailing comment on each entry names the field it fills.  */
3898 static const
3899 struct processor_costs core_cost = {
3901 /* Start of register allocator costs. integer->integer move cost is 2. */
3902 6, /* cost for loading QImode using movzbl */
3903 {4, 4, 4}, /* cost of loading integer registers
3904 in QImode, HImode and SImode.
3905 Relative to reg-reg move (2). */
3906 {6, 6, 6}, /* cost of storing integer registers */
3907 2, /* cost of reg,reg fld/fst */
3908 {6, 6, 8}, /* cost of loading fp registers
3909 in SFmode, DFmode and XFmode */
3910 {6, 6, 10}, /* cost of storing fp registers
3911 in SFmode, DFmode and XFmode */
3912 2, /* cost of moving MMX register */
3913 {6, 6}, /* cost of loading MMX registers
3914 in SImode and DImode */
3915 {6, 6}, /* cost of storing MMX registers
3916 in SImode and DImode */
3917 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
3918 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
3919 in 32,64,128,256 and 512-bit */
3920 {6, 6, 6, 6, 12}, /* cost of storing SSE registers
3921 in 32,64,128,256 and 512-bit */
3922 6, 6, /* SSE->integer and integer->SSE moves */
3923 6, 6, /* mask->integer and integer->mask moves */
3924 {4, 4, 4}, /* cost of loading mask register
3925 in QImode, HImode, SImode. */
3926 {6, 6, 6}, /* cost of storing mask register
3927 in QImode, HImode, SImode. */
3928 2, /* cost of moving mask register. */
3929 /* End of register allocator costs. */
3932 COSTS_N_INSNS (1), /* cost of an add instruction */
3933 /* On all chips taken into consideration lea is 2 cycles and more. With
3934 this cost however our current implementation of synth_mult results in
3935 use of unnecessary temporary registers causing regression on several
3936 SPECfp benchmarks. */
3937 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
3938 COSTS_N_INSNS (1), /* variable shift costs */
3939 COSTS_N_INSNS (1), /* constant shift costs */
3940 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
3941 COSTS_N_INSNS (4), /* HI */
3942 COSTS_N_INSNS (3), /* SI */
3943 /* Here we tune for Sandybridge or newer. */
3944 COSTS_N_INSNS (3), /* DI */
3945 COSTS_N_INSNS (3)}, /* other */
3946 0, /* cost of multiply per each bit set */
3947 /* Expanding div/mod currently doesn't consider parallelism. So the cost
3948 model is not realistic. We compensate by increasing the latencies a bit. */
3949 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
3950 COSTS_N_INSNS (11), /* HI */
3951 COSTS_N_INSNS (14), /* SI */
3952 COSTS_N_INSNS (81), /* DI */
3953 COSTS_N_INSNS (81)}, /* other */
3954 COSTS_N_INSNS (1), /* cost of movsx */
3955 COSTS_N_INSNS (1), /* cost of movzx */
3956 8, /* "large" insn */
3957 17, /* MOVE_RATIO */
3958 6, /* CLEAR_RATIO */
3959 {4, 4, 4}, /* cost of loading integer registers
3960 in QImode, HImode and SImode.
3961 Relative to reg-reg move (2). */
3962 {6, 6, 6}, /* cost of storing integer registers */
3963 {6, 6, 6, 6, 12}, /* cost of loading SSE register
3964 in 32bit, 64bit, 128bit, 256bit and 512bit */
3965 {6, 6, 6, 6, 12}, /* cost of storing SSE register
3966 in 32bit, 64bit, 128bit, 256bit and 512bit */
3967 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
3968 {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
3969 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
3970 2, /* cost of moving SSE register to integer. */
3971 /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
3972 rec. throughput 6.
3973 So 5 uops statically and one uop per load.
NOTE(review): the same mnemonic is named twice above; the second one is
presumably a different gather variant -- confirm against the source of
these figures. */
3974 10, 6, /* Gather load static, per_elt. */
3975 10, 6, /* Gather store static, per_elt. */
3976 64, /* size of l1 cache. */
3977 512, /* size of l2 cache. */
3978 64, /* size of prefetch block */
3979 6, /* number of parallel prefetches */
3980 /* FIXME perhaps more appropriate value is 5. */
3981 3, /* Branch cost */
3982 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
3983 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
3984 /* 10-24 */
3985 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
3986 COSTS_N_INSNS (1), /* cost of FABS instruction. */
3987 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
3988 COSTS_N_INSNS (23), /* cost of FSQRT instruction. */
3990 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
3991 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
3992 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
3993 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
3994 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
3995 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
3996 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
3997 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */
3998 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */
3999 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */
4000 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
4001 core_memcpy,
4002 core_memset,
4003 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
4004 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
4005 "16:11:8", /* Loop alignment. */
4006 "16:11:8", /* Jump alignment. */
4007 "0:0:8", /* Label alignment. */
4008 "16", /* Func alignment. */
4009 4, /* Small unroll limit. */
4010 2, /* Small unroll factor. */