i386: move alignment defaults to processor_costs.
[official-gcc.git] / gcc / config / i386 / x86-tune-costs.h
blob71a5854c09a6183e91a52b0ed910aac2384b8e5a
1 /* Costs of operations of individual x86 CPUs.
2 Copyright (C) 1988-2018 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 Under Section 7 of GPL version 3, you are granted additional
17 permissions described in the GCC Runtime Library Exception, version
18 3.1, as published by the Free Software Foundation.
20 You should have received a copy of the GNU General Public License and
21 a copy of the GCC Runtime Library Exception along with this program;
22 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 <http://www.gnu.org/licenses/>. */
24 /* Processor costs (relative to an add) */
25 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
/* COSTS_N_BYTES (N) turns an instruction size of N bytes into a cost by
   scaling it by 2.  Under the assumption stated above, a 2-byte add yields
   COSTS_N_BYTES (2) == 4, the same value as COSTS_N_INSNS (1).  */
26 #define COSTS_N_BYTES(N) ((N) * 2)
/* Placeholder stringop_algs initializer that names libcall both as the
   leading member and in its single inner entry.  Most of the per-CPU
   memcpy/memset tables below use it as their second array element.  */
28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
/* memcpy strategy table referenced by ix86_size_cost: both array elements
   select rep_prefix_1_byte.
   NOTE(review): the -1 in the inner entry presumably means "no upper size
   bound" -- confirm against the stringop_algs declaration.  */
30 static stringop_algs ix86_size_memcpy[2] = {
31 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
32 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
/* memset strategy table referenced by ix86_size_cost; identical in content
   to ix86_size_memcpy (rep_prefix_1_byte in both array elements).  */
33 static stringop_algs ix86_size_memset[2] = {
34 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
35 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
/* Cost table used when tuning for size.  Instruction costs are written with
   COSTS_N_BYTES (encoded size) instead of COSTS_N_INSNS (latency), and the
   object is not static, unlike the per-CPU tables that follow.  The
   initializer is positional: the trailing comment on each entry names the
   slot it fills, so entries must not be reordered.  All four alignment
   defaults at the end are NULL.
   NOTE(review): the terminating "};" of this initializer is not visible in
   this excerpt.  */
37 const
38 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
39 COSTS_N_BYTES (2), /* cost of an add instruction */
40 COSTS_N_BYTES (3), /* cost of a lea instruction */
41 COSTS_N_BYTES (2), /* variable shift costs */
42 COSTS_N_BYTES (3), /* constant shift costs */
43 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
44 COSTS_N_BYTES (3), /* HI */
45 COSTS_N_BYTES (3), /* SI */
46 COSTS_N_BYTES (3), /* DI */
47 COSTS_N_BYTES (5)}, /* other */
48 0, /* cost of multiply per each bit set */
49 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
50 COSTS_N_BYTES (3), /* HI */
51 COSTS_N_BYTES (3), /* SI */
52 COSTS_N_BYTES (3), /* DI */
53 COSTS_N_BYTES (5)}, /* other */
54 COSTS_N_BYTES (3), /* cost of movsx */
55 COSTS_N_BYTES (3), /* cost of movzx */
56 0, /* "large" insn */
57 2, /* MOVE_RATIO */
59 /* All move costs are relative to integer->integer move times 2. */
60 2, /* cost for loading QImode using movzbl */
61 {2, 2, 2}, /* cost of loading integer registers
62 in QImode, HImode and SImode.
63 Relative to reg-reg move (2). */
64 {2, 2, 2}, /* cost of storing integer registers */
65 2, /* cost of reg,reg fld/fst */
66 {2, 2, 2}, /* cost of loading fp registers
67 in SFmode, DFmode and XFmode */
68 {2, 2, 2}, /* cost of storing fp registers
69 in SFmode, DFmode and XFmode */
70 3, /* cost of moving MMX register */
71 {3, 3}, /* cost of loading MMX registers
72 in SImode and DImode */
73 {3, 3}, /* cost of storing MMX registers
74 in SImode and DImode */
75 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
76 {3, 3, 3, 3, 3}, /* cost of loading SSE registers
77 in 32,64,128,256 and 512-bit */
78 {3, 3, 3, 3, 3}, /* cost of unaligned SSE load
79 (five entries; presumably 32,64,128,256 and 512-bit as for the
   aligned loads above -- TODO confirm) */
80 {3, 3, 3, 3, 3}, /* cost of storing SSE registers
81 in 32,64,128,256 and 512-bit */
82 {3, 3, 3, 3, 3}, /* cost of unaligned SSE store
83 (five entries; presumably 32,64,128,256 and 512-bit as for the
   aligned stores above -- TODO confirm) */
84 3, 3, /* SSE->integer and integer->SSE moves */
85 5, 0, /* Gather load static, per_elt. */
86 5, 0, /* Gather store static, per_elt. */
87 0, /* size of l1 cache */
88 0, /* size of l2 cache */
89 0, /* size of prefetch block */
90 0, /* number of parallel prefetches */
91 2, /* Branch cost */
92 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
93 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
94 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
95 COSTS_N_BYTES (2), /* cost of FABS instruction. */
96 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
97 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
99 COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */
100 COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */
101 COSTS_N_BYTES (2), /* cost of MULSS instruction. */
102 COSTS_N_BYTES (2), /* cost of MULSD instruction. */
103 COSTS_N_BYTES (2), /* cost of FMA SS instruction. */
104 COSTS_N_BYTES (2), /* cost of FMA SD instruction. */
105 COSTS_N_BYTES (2), /* cost of DIVSS instruction. */
106 COSTS_N_BYTES (2), /* cost of DIVSD instruction. */
107 COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */
108 COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */
109 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
110 ix86_size_memcpy, /* memcpy strategy table */
111 ix86_size_memset, /* memset strategy table */
112 COSTS_N_BYTES (1), /* cond_taken_branch_cost. */
113 COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */
114 NULL, /* Loop alignment. */
115 NULL, /* Jump alignment. */
116 NULL, /* Label alignment. */
117 NULL, /* Func alignment. */
120 /* Processor costs (relative to an add) */
/* memcpy strategy table referenced by i386_cost.  The first element selects
   rep_prefix_1_byte; the second is the DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): presumably element 0 is the 32-bit and element 1 the 64-bit
   variant -- confirm against the users of stringop_algs.  */
121 static stringop_algs i386_memcpy[2] = {
122 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
123 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by i386_cost; same content as
   i386_memcpy (rep_prefix_1_byte, then the placeholder).  */
124 static stringop_algs i386_memset[2] = {
125 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
126 DUMMY_STRINGOP_ALGS};
/* Cost table for the 386.  Instruction costs use COSTS_N_INSNS; the
   initializer is positional, with the trailing comment on each entry naming
   the slot it fills, so entries must not be reordered.  The last four
   entries are the alignment defaults, given as strings (NULL where none is
   specified).
   NOTE(review): the terminating "};" of this initializer is not visible in
   this excerpt.  */
128 static const
129 struct processor_costs i386_cost = { /* 386 specific costs */
130 COSTS_N_INSNS (1), /* cost of an add instruction */
131 COSTS_N_INSNS (1), /* cost of a lea instruction */
132 COSTS_N_INSNS (3), /* variable shift costs */
133 COSTS_N_INSNS (2), /* constant shift costs */
134 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
135 COSTS_N_INSNS (6), /* HI */
136 COSTS_N_INSNS (6), /* SI */
137 COSTS_N_INSNS (6), /* DI */
138 COSTS_N_INSNS (6)}, /* other */
139 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
140 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
141 COSTS_N_INSNS (23), /* HI */
142 COSTS_N_INSNS (23), /* SI */
143 COSTS_N_INSNS (23), /* DI */
144 COSTS_N_INSNS (23)}, /* other */
145 COSTS_N_INSNS (3), /* cost of movsx */
146 COSTS_N_INSNS (2), /* cost of movzx */
147 15, /* "large" insn */
148 3, /* MOVE_RATIO */
150 /* All move costs are relative to integer->integer move times 2 and thus
151 they are latency*2. */
152 4, /* cost for loading QImode using movzbl */
153 {2, 4, 2}, /* cost of loading integer registers
154 in QImode, HImode and SImode.
155 Relative to reg-reg move (2). */
156 {2, 4, 2}, /* cost of storing integer registers */
157 2, /* cost of reg,reg fld/fst */
158 {8, 8, 8}, /* cost of loading fp registers
159 in SFmode, DFmode and XFmode */
160 {8, 8, 8}, /* cost of storing fp registers
161 in SFmode, DFmode and XFmode */
162 2, /* cost of moving MMX register */
163 {4, 8}, /* cost of loading MMX registers
164 in SImode and DImode */
165 {4, 8}, /* cost of storing MMX registers
166 in SImode and DImode */
167 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
168 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
169 in 32,64,128,256 and 512-bit */
170 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
171 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
172 in 32,64,128,256 and 512-bit */
173 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
174 3, 3, /* SSE->integer and integer->SSE moves */
175 4, 4, /* Gather load static, per_elt. */
176 4, 4, /* Gather store static, per_elt. */
177 0, /* size of l1 cache */
178 0, /* size of l2 cache */
179 0, /* size of prefetch block */
180 0, /* number of parallel prefetches */
181 1, /* Branch cost */
182 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
183 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
184 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
185 COSTS_N_INSNS (22), /* cost of FABS instruction. */
186 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
187 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
189 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
190 COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */
191 COSTS_N_INSNS (27), /* cost of MULSS instruction. */
192 COSTS_N_INSNS (27), /* cost of MULSD instruction. */
193 COSTS_N_INSNS (27), /* cost of FMA SS instruction. */
194 COSTS_N_INSNS (27), /* cost of FMA SD instruction. */
195 COSTS_N_INSNS (88), /* cost of DIVSS instruction. */
196 COSTS_N_INSNS (88), /* cost of DIVSD instruction. */
197 COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */
198 COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */
199 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
200 i386_memcpy, /* memcpy strategy table */
201 i386_memset, /* memset strategy table */
202 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
203 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
204 "4", /* Loop alignment. */
205 "4", /* Jump alignment. */
206 NULL, /* Label alignment. */
207 "4", /* Func alignment. */
/* memcpy strategy table referenced by i486_cost.  The first element selects
   rep_prefix_4_byte; the second is the DUMMY_STRINGOP_ALGS placeholder.  */
210 static stringop_algs i486_memcpy[2] = {
211 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
212 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by i486_cost; same content as
   i486_memcpy (rep_prefix_4_byte, then the placeholder).  */
213 static stringop_algs i486_memset[2] = {
214 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
215 DUMMY_STRINGOP_ALGS};
/* Cost table for the 486.  Instruction costs use COSTS_N_INSNS; the
   initializer is positional, with the trailing comment on each entry naming
   the slot it fills, so entries must not be reordered.  The last four
   entries are the alignment defaults, given as strings.
   NOTE(review): strings such as "0:0:8" look like the n:m:n2 form accepted
   by the -falign-* options -- confirm where these strings are parsed.
   NOTE(review): the terminating "};" of this initializer is not visible in
   this excerpt.  */
217 static const
218 struct processor_costs i486_cost = { /* 486 specific costs */
219 COSTS_N_INSNS (1), /* cost of an add instruction */
220 COSTS_N_INSNS (1), /* cost of a lea instruction */
221 COSTS_N_INSNS (3), /* variable shift costs */
222 COSTS_N_INSNS (2), /* constant shift costs */
223 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
224 COSTS_N_INSNS (12), /* HI */
225 COSTS_N_INSNS (12), /* SI */
226 COSTS_N_INSNS (12), /* DI */
227 COSTS_N_INSNS (12)}, /* other */
228 1, /* cost of multiply per each bit set.
   NOTE(review): a plain 1 here, whereas i386_cost writes this slot as
   COSTS_N_INSNS (1) -- confirm this is intended.  */
229 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
230 COSTS_N_INSNS (40), /* HI */
231 COSTS_N_INSNS (40), /* SI */
232 COSTS_N_INSNS (40), /* DI */
233 COSTS_N_INSNS (40)}, /* other */
234 COSTS_N_INSNS (3), /* cost of movsx */
235 COSTS_N_INSNS (2), /* cost of movzx */
236 15, /* "large" insn */
237 3, /* MOVE_RATIO */
239 /* All move costs are relative to integer->integer move times 2 and thus
240 they are latency*2. */
241 4, /* cost for loading QImode using movzbl */
242 {2, 4, 2}, /* cost of loading integer registers
243 in QImode, HImode and SImode.
244 Relative to reg-reg move (2). */
245 {2, 4, 2}, /* cost of storing integer registers */
246 2, /* cost of reg,reg fld/fst */
247 {8, 8, 8}, /* cost of loading fp registers
248 in SFmode, DFmode and XFmode */
249 {8, 8, 8}, /* cost of storing fp registers
250 in SFmode, DFmode and XFmode */
251 2, /* cost of moving MMX register */
252 {4, 8}, /* cost of loading MMX registers
253 in SImode and DImode */
254 {4, 8}, /* cost of storing MMX registers
255 in SImode and DImode */
256 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
257 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
258 in 32,64,128,256 and 512-bit */
259 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
260 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
261 in 32,64,128,256 and 512-bit */
262 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
263 3, 3, /* SSE->integer and integer->SSE moves */
264 4, 4, /* Gather load static, per_elt. */
265 4, 4, /* Gather store static, per_elt. */
266 4, /* size of l1 cache. 486 has 8kB cache
267 shared for code and data, so 4kB is
268 not really precise. */
269 4, /* size of l2 cache */
270 0, /* size of prefetch block */
271 0, /* number of parallel prefetches */
272 1, /* Branch cost */
273 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
274 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
275 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
276 COSTS_N_INSNS (3), /* cost of FABS instruction. */
277 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
278 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
280 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
281 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
282 COSTS_N_INSNS (16), /* cost of MULSS instruction. */
283 COSTS_N_INSNS (16), /* cost of MULSD instruction. */
284 COSTS_N_INSNS (16), /* cost of FMA SS instruction. */
285 COSTS_N_INSNS (16), /* cost of FMA SD instruction. */
286 COSTS_N_INSNS (73), /* cost of DIVSS instruction. */
287 COSTS_N_INSNS (74), /* cost of DIVSD instruction.
   NOTE(review): 74 here versus 73 for FDIV and DIVSS above -- confirm
   the difference is deliberate.  */
288 COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */
289 COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */
290 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
291 i486_memcpy, /* memcpy strategy table */
292 i486_memset, /* memset strategy table */
293 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
294 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
295 "16", /* Loop alignment. */
296 "16", /* Jump alignment. */
297 "0:0:8", /* Label alignment. */
298 "16", /* Func alignment. */
/* memcpy strategy table referenced by pentium_cost and lakemont_cost.  The
   first element names libcall as its leading member and lists two inner
   entries: {256, rep_prefix_4_byte} followed by {-1, libcall}.
   NOTE(review): presumably this means rep_prefix_4_byte for blocks up to
   256 bytes and a library call beyond that -- confirm against the
   stringop_algs declaration.  */
301 static stringop_algs pentium_memcpy[2] = {
302 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
303 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by pentium_cost and lakemont_cost.
   Unlike pentium_memcpy it has a single inner entry, {-1,
   rep_prefix_4_byte}, with libcall as the leading member.  */
304 static stringop_algs pentium_memset[2] = {
305 {libcall, {{-1, rep_prefix_4_byte, false}}},
306 DUMMY_STRINGOP_ALGS};
/* Cost table named for the Pentium.  Instruction costs use COSTS_N_INSNS;
   the initializer is positional, with the trailing comment on each entry
   naming the slot it fills, so entries must not be reordered.  The last
   four entries are the alignment defaults, given as strings.
   NOTE(review): strings such as "16:8:8" look like the n:m:n2 form accepted
   by the -falign-* options -- confirm where these strings are parsed.
   NOTE(review): the terminating "};" of this initializer is not visible in
   this excerpt.  */
308 static const
309 struct processor_costs pentium_cost = {
310 COSTS_N_INSNS (1), /* cost of an add instruction */
311 COSTS_N_INSNS (1), /* cost of a lea instruction */
312 COSTS_N_INSNS (4), /* variable shift costs */
313 COSTS_N_INSNS (1), /* constant shift costs */
314 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
315 COSTS_N_INSNS (11), /* HI */
316 COSTS_N_INSNS (11), /* SI */
317 COSTS_N_INSNS (11), /* DI */
318 COSTS_N_INSNS (11)}, /* other */
319 0, /* cost of multiply per each bit set */
320 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
321 COSTS_N_INSNS (25), /* HI */
322 COSTS_N_INSNS (25), /* SI */
323 COSTS_N_INSNS (25), /* DI */
324 COSTS_N_INSNS (25)}, /* other */
325 COSTS_N_INSNS (3), /* cost of movsx */
326 COSTS_N_INSNS (2), /* cost of movzx */
327 8, /* "large" insn */
328 6, /* MOVE_RATIO */
330 /* All move costs are relative to integer->integer move times 2 and thus
331 they are latency*2. */
332 6, /* cost for loading QImode using movzbl */
333 {2, 4, 2}, /* cost of loading integer registers
334 in QImode, HImode and SImode.
335 Relative to reg-reg move (2). */
336 {2, 4, 2}, /* cost of storing integer registers */
337 2, /* cost of reg,reg fld/fst */
338 {2, 2, 6}, /* cost of loading fp registers
339 in SFmode, DFmode and XFmode */
340 {4, 4, 6}, /* cost of storing fp registers
341 in SFmode, DFmode and XFmode */
342 8, /* cost of moving MMX register */
343 {8, 8}, /* cost of loading MMX registers
344 in SImode and DImode */
345 {8, 8}, /* cost of storing MMX registers
346 in SImode and DImode */
347 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
348 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
349 in 32,64,128,256 and 512-bit */
350 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
351 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
352 in 32,64,128,256 and 512-bit */
353 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
354 3, 3, /* SSE->integer and integer->SSE moves */
355 4, 4, /* Gather load static, per_elt. */
356 4, 4, /* Gather store static, per_elt. */
357 8, /* size of l1 cache. */
358 8, /* size of l2 cache */
359 0, /* size of prefetch block */
360 0, /* number of parallel prefetches */
361 2, /* Branch cost */
362 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
363 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
364 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
365 COSTS_N_INSNS (1), /* cost of FABS instruction. */
366 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
367 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
369 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
370 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
371 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
372 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
373 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
374 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
375 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
376 COSTS_N_INSNS (39), /* cost of DIVSD instruction. */
377 COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */
378 COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */
379 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
380 pentium_memcpy, /* memcpy strategy table */
381 pentium_memset, /* memset strategy table */
382 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
383 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
384 "16:8:8", /* Loop alignment. */
385 "16:8:8", /* Jump alignment. */
386 "0:0:8", /* Label alignment. */
387 "16", /* Func alignment. */
/* Cost table named for Lakemont.  It has no string-operation tables of its
   own and points at pentium_memcpy / pentium_memset.  Instruction costs use
   COSTS_N_INSNS; the initializer is positional, with the trailing comment
   on each entry naming the slot it fills, so entries must not be reordered.
   The last four entries are the alignment defaults, given as strings.
   NOTE(review): the terminating "};" of this initializer is not visible in
   this excerpt.  */
390 static const
391 struct processor_costs lakemont_cost = {
392 COSTS_N_INSNS (1), /* cost of an add instruction */
393 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction; one unit above
   the add cost on the line before */
394 COSTS_N_INSNS (1), /* variable shift costs */
395 COSTS_N_INSNS (1), /* constant shift costs */
396 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
397 COSTS_N_INSNS (11), /* HI */
398 COSTS_N_INSNS (11), /* SI */
399 COSTS_N_INSNS (11), /* DI */
400 COSTS_N_INSNS (11)}, /* other */
401 0, /* cost of multiply per each bit set */
402 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
403 COSTS_N_INSNS (25), /* HI */
404 COSTS_N_INSNS (25), /* SI */
405 COSTS_N_INSNS (25), /* DI */
406 COSTS_N_INSNS (25)}, /* other */
407 COSTS_N_INSNS (3), /* cost of movsx */
408 COSTS_N_INSNS (2), /* cost of movzx */
409 8, /* "large" insn */
410 17, /* MOVE_RATIO */
412 /* All move costs are relative to integer->integer move times 2 and thus
413 they are latency*2. */
414 6, /* cost for loading QImode using movzbl */
415 {2, 4, 2}, /* cost of loading integer registers
416 in QImode, HImode and SImode.
417 Relative to reg-reg move (2). */
418 {2, 4, 2}, /* cost of storing integer registers */
419 2, /* cost of reg,reg fld/fst */
420 {2, 2, 6}, /* cost of loading fp registers
421 in SFmode, DFmode and XFmode */
422 {4, 4, 6}, /* cost of storing fp registers
423 in SFmode, DFmode and XFmode */
424 8, /* cost of moving MMX register */
425 {8, 8}, /* cost of loading MMX registers
426 in SImode and DImode */
427 {8, 8}, /* cost of storing MMX registers
428 in SImode and DImode */
429 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
430 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
431 in 32,64,128,256 and 512-bit */
432 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
433 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
434 in 32,64,128,256 and 512-bit */
435 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
436 3, 3, /* SSE->integer and integer->SSE moves */
437 4, 4, /* Gather load static, per_elt. */
438 4, 4, /* Gather store static, per_elt. */
439 8, /* size of l1 cache. */
440 8, /* size of l2 cache */
441 0, /* size of prefetch block */
442 0, /* number of parallel prefetches */
443 2, /* Branch cost */
444 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
445 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
446 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
447 COSTS_N_INSNS (1), /* cost of FABS instruction. */
448 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
449 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
451 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
452 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
453 COSTS_N_INSNS (5), /* cost of MULSS instruction. */
454 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
455 COSTS_N_INSNS (10), /* cost of FMA SS instruction. */
456 COSTS_N_INSNS (10), /* cost of FMA SD instruction. */
457 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
458 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
459 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
460 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
461 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
462 pentium_memcpy, /* memcpy strategy table, shared with pentium_cost */
463 pentium_memset, /* memset strategy table, shared with pentium_cost */
464 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
465 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
466 "16:8:8", /* Loop alignment. */
467 "16:8:8", /* Jump alignment. */
468 "0:0:8", /* Label alignment. */
469 "16", /* Func alignment. */
472 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
473 (we ensure the alignment). For small blocks inline loop is still a
474 noticeable win, for bigger blocks either rep movsl or rep movsb is
475 way to go. Rep movsb has apparently more expensive startup time in CPU,
476 but after 4K the difference is down in the noise. */
/* memcpy strategy table referenced by pentiumpro_cost.  The first element
   lists, in order: loop (128), unrolled_loop (1024), rep_prefix_4_byte
   (8192) and rep_prefix_1_byte (-1); the second element is the
   DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): the numbers are presumably upper block-size bounds in
   bytes, with -1 meaning unbounded -- confirm against the stringop_algs
   declaration.  */
477 static stringop_algs pentiumpro_memcpy[2] = {
478 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
479 {8192, rep_prefix_4_byte, false},
480 {-1, rep_prefix_1_byte, false}}},
481 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by pentiumpro_cost.  The first element
   lists, in order: unrolled_loop (1024), rep_prefix_4_byte (8192) and
   libcall (-1); the second element is the DUMMY_STRINGOP_ALGS
   placeholder.  */
482 static stringop_algs pentiumpro_memset[2] = {
483 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
484 {8192, rep_prefix_4_byte, false},
485 {-1, libcall, false}}},
486 DUMMY_STRINGOP_ALGS};
/* Cost table named for the PentiumPro.  Instruction costs use
   COSTS_N_INSNS; the initializer is positional, with the trailing comment
   on each entry naming the slot it fills, so entries must not be reordered.
   The last four entries are the alignment defaults, given as strings.
   NOTE(review): strings such as "16:11:8" look like the n:m:n2 form
   accepted by the -falign-* options -- confirm where these strings are
   parsed.
   NOTE(review): the terminating "};" of this initializer is not visible in
   this excerpt.  */
487 static const
488 struct processor_costs pentiumpro_cost = {
489 COSTS_N_INSNS (1), /* cost of an add instruction */
490 COSTS_N_INSNS (1), /* cost of a lea instruction */
491 COSTS_N_INSNS (1), /* variable shift costs */
492 COSTS_N_INSNS (1), /* constant shift costs */
493 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
494 COSTS_N_INSNS (4), /* HI */
495 COSTS_N_INSNS (4), /* SI */
496 COSTS_N_INSNS (4), /* DI */
497 COSTS_N_INSNS (4)}, /* other */
498 0, /* cost of multiply per each bit set */
499 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
500 COSTS_N_INSNS (17), /* HI */
501 COSTS_N_INSNS (17), /* SI */
502 COSTS_N_INSNS (17), /* DI */
503 COSTS_N_INSNS (17)}, /* other */
504 COSTS_N_INSNS (1), /* cost of movsx */
505 COSTS_N_INSNS (1), /* cost of movzx */
506 8, /* "large" insn */
507 6, /* MOVE_RATIO */
509 /* All move costs are relative to integer->integer move times 2 and thus
510 they are latency*2. */
511 2, /* cost for loading QImode using movzbl */
512 {4, 4, 4}, /* cost of loading integer registers
513 in QImode, HImode and SImode.
514 Relative to reg-reg move (2). */
515 {2, 2, 2}, /* cost of storing integer registers */
516 2, /* cost of reg,reg fld/fst */
517 {2, 2, 6}, /* cost of loading fp registers
518 in SFmode, DFmode and XFmode */
519 {4, 4, 6}, /* cost of storing fp registers
520 in SFmode, DFmode and XFmode */
521 2, /* cost of moving MMX register */
522 {2, 2}, /* cost of loading MMX registers
523 in SImode and DImode */
524 {2, 2}, /* cost of storing MMX registers
525 in SImode and DImode */
526 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
527 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
528 in 32,64,128,256 and 512-bit */
529 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
530 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
531 in 32,64,128,256 and 512-bit */
532 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
533 3, 3, /* SSE->integer and integer->SSE moves */
534 4, 4, /* Gather load static, per_elt. */
535 4, 4, /* Gather store static, per_elt. */
536 8, /* size of l1 cache. */
537 256, /* size of l2 cache */
538 32, /* size of prefetch block */
539 6, /* number of parallel prefetches */
540 2, /* Branch cost */
541 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
542 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
543 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
544 COSTS_N_INSNS (2), /* cost of FABS instruction. */
545 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
546 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
548 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
549 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
550 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
551 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
552 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
553 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
554 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
555 COSTS_N_INSNS (18), /* cost of DIVSD instruction. */
556 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
557 COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */
558 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
559 pentiumpro_memcpy, /* memcpy strategy table */
560 pentiumpro_memset, /* memset strategy table */
561 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
562 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
563 "16", /* Loop alignment. */
564 "16:11:8", /* Jump alignment. */
565 "0:0:8", /* Label alignment. */
566 "16", /* Func alignment. */
/* memcpy strategy table referenced by geode_cost.  The first element names
   libcall as its leading member and lists {256, rep_prefix_4_byte} followed
   by {-1, libcall}; the second element is the DUMMY_STRINGOP_ALGS
   placeholder.  */
569 static stringop_algs geode_memcpy[2] = {
570 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
571 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by geode_cost; same content as
   geode_memcpy.  */
572 static stringop_algs geode_memset[2] = {
573 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
574 DUMMY_STRINGOP_ALGS};
/* Cost table named for Geode.  Instruction costs use COSTS_N_INSNS; the
   initializer is positional, with the trailing comment on each entry naming
   the slot it fills, so entries must not be reordered.  All four alignment
   defaults at the end are NULL.
   NOTE(review): the terminating "};" of this initializer is not visible in
   this excerpt.  */
575 static const
576 struct processor_costs geode_cost = {
577 COSTS_N_INSNS (1), /* cost of an add instruction */
578 COSTS_N_INSNS (1), /* cost of a lea instruction */
579 COSTS_N_INSNS (2), /* variable shift costs */
580 COSTS_N_INSNS (1), /* constant shift costs */
581 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
582 COSTS_N_INSNS (4), /* HI */
583 COSTS_N_INSNS (7), /* SI */
584 COSTS_N_INSNS (7), /* DI */
585 COSTS_N_INSNS (7)}, /* other */
586 0, /* cost of multiply per each bit set */
587 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
588 COSTS_N_INSNS (23), /* HI */
589 COSTS_N_INSNS (39), /* SI */
590 COSTS_N_INSNS (39), /* DI */
591 COSTS_N_INSNS (39)}, /* other */
592 COSTS_N_INSNS (1), /* cost of movsx */
593 COSTS_N_INSNS (1), /* cost of movzx */
594 8, /* "large" insn */
595 4, /* MOVE_RATIO */
597 /* All move costs are relative to integer->integer move times 2 and thus
598 they are latency*2. */
599 2, /* cost for loading QImode using movzbl */
600 {2, 2, 2}, /* cost of loading integer registers
601 in QImode, HImode and SImode.
602 Relative to reg-reg move (2). */
603 {2, 2, 2}, /* cost of storing integer registers */
604 2, /* cost of reg,reg fld/fst */
605 {2, 2, 2}, /* cost of loading fp registers
606 in SFmode, DFmode and XFmode */
607 {4, 6, 6}, /* cost of storing fp registers
608 in SFmode, DFmode and XFmode */
610 2, /* cost of moving MMX register */
611 {2, 2}, /* cost of loading MMX registers
612 in SImode and DImode */
613 {2, 2}, /* cost of storing MMX registers
614 in SImode and DImode */
615 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
616 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
617 in 32,64,128,256 and 512-bit */
618 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
619 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
620 in 32,64,128,256 and 512-bit */
621 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
622 6, 6, /* SSE->integer and integer->SSE moves */
623 2, 2, /* Gather load static, per_elt. */
624 2, 2, /* Gather store static, per_elt. */
625 64, /* size of l1 cache. */
626 128, /* size of l2 cache. */
627 32, /* size of prefetch block */
628 1, /* number of parallel prefetches */
629 1, /* Branch cost */
630 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
631 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
632 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
633 COSTS_N_INSNS (1), /* cost of FABS instruction. */
634 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
635 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
637 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
638 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
639 COSTS_N_INSNS (11), /* cost of MULSS instruction. */
640 COSTS_N_INSNS (11), /* cost of MULSD instruction. */
641 COSTS_N_INSNS (17), /* cost of FMA SS instruction. */
642 COSTS_N_INSNS (17), /* cost of FMA SD instruction. */
643 COSTS_N_INSNS (47), /* cost of DIVSS instruction. */
644 COSTS_N_INSNS (47), /* cost of DIVSD instruction. */
645 COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */
646 COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */
647 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
648 geode_memcpy, /* memcpy strategy table */
649 geode_memset, /* memset strategy table */
650 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
651 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
652 NULL, /* Loop alignment. */
653 NULL, /* Jump alignment. */
654 NULL, /* Label alignment. */
655 NULL, /* Func alignment. */
/* memcpy strategy table referenced by k6_cost.  The first element names
   libcall as its leading member and lists {256, rep_prefix_4_byte} followed
   by {-1, libcall}; the second element is the DUMMY_STRINGOP_ALGS
   placeholder.  */
658 static stringop_algs k6_memcpy[2] = {
659 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
660 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by k6_cost; same content as
   k6_memcpy.  */
661 static stringop_algs k6_memset[2] = {
662 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
663 DUMMY_STRINGOP_ALGS};
/* Cost table named for the K6.  Instruction costs use COSTS_N_INSNS; the
   initializer is positional, with the trailing comment on each entry naming
   the slot it fills, so entries must not be reordered.  The last four
   entries are the alignment defaults, given as strings.
   NOTE(review): strings such as "32:8:8" look like the n:m:n2 form accepted
   by the -falign-* options -- confirm where these strings are parsed.
   NOTE(review): the terminating "};" of this initializer is not visible in
   this excerpt.  */
664 static const
665 struct processor_costs k6_cost = {
666 COSTS_N_INSNS (1), /* cost of an add instruction */
667 COSTS_N_INSNS (2), /* cost of a lea instruction */
668 COSTS_N_INSNS (1), /* variable shift costs */
669 COSTS_N_INSNS (1), /* constant shift costs */
670 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
671 COSTS_N_INSNS (3), /* HI */
672 COSTS_N_INSNS (3), /* SI */
673 COSTS_N_INSNS (3), /* DI */
674 COSTS_N_INSNS (3)}, /* other */
675 0, /* cost of multiply per each bit set */
676 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
677 COSTS_N_INSNS (18), /* HI */
678 COSTS_N_INSNS (18), /* SI */
679 COSTS_N_INSNS (18), /* DI */
680 COSTS_N_INSNS (18)}, /* other */
681 COSTS_N_INSNS (2), /* cost of movsx */
682 COSTS_N_INSNS (2), /* cost of movzx */
683 8, /* "large" insn */
684 4, /* MOVE_RATIO */
686 /* All move costs are relative to integer->integer move times 2 and thus
687 they are latency*2. */
688 3, /* cost for loading QImode using movzbl */
689 {4, 5, 4}, /* cost of loading integer registers
690 in QImode, HImode and SImode.
691 Relative to reg-reg move (2). */
692 {2, 3, 2}, /* cost of storing integer registers */
693 4, /* cost of reg,reg fld/fst */
694 {6, 6, 6}, /* cost of loading fp registers
695 in SFmode, DFmode and XFmode */
696 {4, 4, 4}, /* cost of storing fp registers
697 in SFmode, DFmode and XFmode */
698 2, /* cost of moving MMX register */
699 {2, 2}, /* cost of loading MMX registers
700 in SImode and DImode */
701 {2, 2}, /* cost of storing MMX registers
702 in SImode and DImode */
703 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
704 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
705 in 32,64,128,256 and 512-bit */
706 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
707 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
708 in 32,64,128,256 and 512-bit */
709 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
710 6, 6, /* SSE->integer and integer->SSE moves */
711 2, 2, /* Gather load static, per_elt. */
712 2, 2, /* Gather store static, per_elt. */
713 32, /* size of l1 cache. */
714 32, /* size of l2 cache. Some models
715 have integrated l2 cache, but
716 optimizing for k6 is not important
717 enough to worry about that. */
718 32, /* size of prefetch block */
719 1, /* number of parallel prefetches */
720 1, /* Branch cost */
721 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
722 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
723 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
724 COSTS_N_INSNS (2), /* cost of FABS instruction. */
725 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
726 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
728 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
729 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
730 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
731 COSTS_N_INSNS (2), /* cost of MULSD instruction. */
732 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
733 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
734 COSTS_N_INSNS (56), /* cost of DIVSS instruction. */
735 COSTS_N_INSNS (56), /* cost of DIVSD instruction. */
736 COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */
737 COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */
738 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
739 k6_memcpy, /* memcpy strategy table */
740 k6_memset, /* memset strategy table */
741 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
742 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
743 "32:8:8", /* Loop alignment. */
744 "32:8:8", /* Jump alignment. */
745 "0:0:8", /* Label alignment. */
746 "32", /* Func alignment. */
749 /* For some reason, Athlon deals better with REP prefix (relative to loops)
750 compared to K8. Alignment becomes important after 8 bytes for memcpy and
751 128 bytes for memset. */
/* memcpy expansion strategy used by the Athlon cost table.  Two positional
   entries; the second is the DUMMY_STRINGOP_ALGS placeholder (a plain
   libcall).
   NOTE(review): each inner triple looks like {max block size, algorithm,
   flag}, with -1 as the catch-all size -- confirm against the stringop_algs
   declaration, which is not visible here.  */
752 static stringop_algs athlon_memcpy[2] = {
753 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
754 DUMMY_STRINGOP_ALGS};
/* memset expansion strategy used by the Athlon cost table.  Two positional
   entries; the second is the DUMMY_STRINGOP_ALGS placeholder (a plain
   libcall).
   NOTE(review): each inner triple looks like {max block size, algorithm,
   flag}, with -1 as the catch-all size -- confirm against the stringop_algs
   declaration, which is not visible here.  */
755 static stringop_algs athlon_memset[2] = {
756 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
757 DUMMY_STRINGOP_ALGS};
/* Operation cost table used when tuning for Athlon.  The initializer is
   positional: entries must stay in the declaration order of struct
   processor_costs, which is defined outside this part of the file.  Values
   wrapped in COSTS_N_INSNS are instruction costs; per the file header,
   processor costs are relative to an add.  */
758 static const
759 struct processor_costs athlon_cost = {
760 COSTS_N_INSNS (1), /* cost of an add instruction */
761 COSTS_N_INSNS (2), /* cost of a lea instruction */
762 COSTS_N_INSNS (1), /* variable shift costs */
763 COSTS_N_INSNS (1), /* constant shift costs */
764 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
765 COSTS_N_INSNS (5), /* HI */
766 COSTS_N_INSNS (5), /* SI */
767 COSTS_N_INSNS (5), /* DI */
768 COSTS_N_INSNS (5)}, /* other */
769 0, /* cost of multiply per each bit set */
770 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
771 COSTS_N_INSNS (26), /* HI */
772 COSTS_N_INSNS (42), /* SI */
773 COSTS_N_INSNS (74), /* DI */
774 COSTS_N_INSNS (74)}, /* other */
775 COSTS_N_INSNS (1), /* cost of movsx */
776 COSTS_N_INSNS (1), /* cost of movzx */
777 8, /* "large" insn */
778 9, /* MOVE_RATIO */
780 /* All move costs are relative to integer->integer move times 2 and thus
781 they are latency*2. */
782 4, /* cost for loading QImode using movzbl */
783 {3, 4, 3}, /* cost of loading integer registers
784 in QImode, HImode and SImode.
785 Relative to reg-reg move (2). */
786 {3, 4, 3}, /* cost of storing integer registers */
787 4, /* cost of reg,reg fld/fst */
788 {4, 4, 12}, /* cost of loading fp registers
789 in SFmode, DFmode and XFmode */
790 {6, 6, 8}, /* cost of storing fp registers
791 in SFmode, DFmode and XFmode */
792 2, /* cost of moving MMX register */
793 {4, 4}, /* cost of loading MMX registers
794 in SImode and DImode */
795 {4, 4}, /* cost of storing MMX registers
796 in SImode and DImode */
797 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
798 {4, 4, 6, 12, 24}, /* cost of loading SSE registers
799 in 32,64,128,256 and 512-bit */
800 {4, 4, 6, 12, 24}, /* cost of unaligned loads. */
801 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
802 in 32,64,128,256 and 512-bit */
803 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
804 5, 5, /* SSE->integer and integer->SSE moves */
805 4, 4, /* Gather load static, per_elt. */
806 4, 4, /* Gather store static, per_elt. */
807 64, /* size of l1 cache. */
808 256, /* size of l2 cache. */
809 64, /* size of prefetch block */
810 6, /* number of parallel prefetches */
811 5, /* Branch cost */
812 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
813 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
814 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
815 COSTS_N_INSNS (2), /* cost of FABS instruction. */
816 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
817 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
819 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
820 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
821 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
822 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
823 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
824 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
825 /* 11-16 */
826 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
827 COSTS_N_INSNS (24), /* cost of DIVSD instruction. */
828 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
829 COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */
830 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
/* String-operation strategy tables defined just before this initializer.  */
831 athlon_memcpy,
832 athlon_memset,
833 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
834 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* NOTE(review): the alignment strings below presumably use the -falign-*
   option syntax (n:m:n2) -- confirm against the code that parses them.  */
835 "16:8:8", /* Loop alignment. */
836 "16:8:8", /* Jump alignment. */
837 "0:0:8", /* Label alignment. */
838 "16", /* Func alignment. */
841 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
842 small blocks it is better to use a loop. For large blocks, a libcall can
843 use non-temporal accesses and beat the inline expansion considerably. */
/* memcpy expansion strategy used by the K8 cost table: two positional
   entries.
   NOTE(review): entry 0 uses rep_prefix_4_byte and entry 1 uses
   rep_prefix_8_byte, so the entries presumably select 32-bit vs 64-bit
   code -- confirm.  Each inner triple looks like {max block size,
   algorithm, flag}, with -1 as the catch-all size -- confirm against the
   stringop_algs declaration, which is not visible here.  */
844 static stringop_algs k8_memcpy[2] = {
845 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
846 {-1, rep_prefix_4_byte, false}}},
847 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
848 {-1, libcall, false}}}};
/* memset expansion strategy used by the K8 cost table: two positional
   entries.
   NOTE(review): entry 0 uses rep_prefix_4_byte and entry 1 uses
   rep_prefix_8_byte, so the entries presumably select 32-bit vs 64-bit
   code -- confirm.  Each inner triple looks like {max block size,
   algorithm, flag}, with -1 as the catch-all size -- confirm against the
   stringop_algs declaration, which is not visible here.  */
849 static stringop_algs k8_memset[2] = {
850 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
851 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
852 {libcall, {{48, unrolled_loop, false},
853 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Operation cost table used when tuning for K8.  The initializer is
   positional: entries must stay in the declaration order of struct
   processor_costs, which is defined outside this part of the file.  Values
   wrapped in COSTS_N_INSNS are instruction costs; per the file header,
   processor costs are relative to an add.  */
854 static const
855 struct processor_costs k8_cost = {
856 COSTS_N_INSNS (1), /* cost of an add instruction */
857 COSTS_N_INSNS (2), /* cost of a lea instruction */
858 COSTS_N_INSNS (1), /* variable shift costs */
859 COSTS_N_INSNS (1), /* constant shift costs */
860 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
861 COSTS_N_INSNS (4), /* HI */
862 COSTS_N_INSNS (3), /* SI */
863 COSTS_N_INSNS (4), /* DI */
864 COSTS_N_INSNS (5)}, /* other */
865 0, /* cost of multiply per each bit set */
866 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
867 COSTS_N_INSNS (26), /* HI */
868 COSTS_N_INSNS (42), /* SI */
869 COSTS_N_INSNS (74), /* DI */
870 COSTS_N_INSNS (74)}, /* other */
871 COSTS_N_INSNS (1), /* cost of movsx */
872 COSTS_N_INSNS (1), /* cost of movzx */
873 8, /* "large" insn */
874 9, /* MOVE_RATIO */
876 /* All move costs are relative to integer->integer move times 2 and thus
877 they are latency*2. */
878 4, /* cost for loading QImode using movzbl */
879 {3, 4, 3}, /* cost of loading integer registers
880 in QImode, HImode and SImode.
881 Relative to reg-reg move (2). */
882 {3, 4, 3}, /* cost of storing integer registers */
883 4, /* cost of reg,reg fld/fst */
884 {4, 4, 12}, /* cost of loading fp registers
885 in SFmode, DFmode and XFmode */
886 {6, 6, 8}, /* cost of storing fp registers
887 in SFmode, DFmode and XFmode */
888 2, /* cost of moving MMX register */
889 {3, 3}, /* cost of loading MMX registers
890 in SImode and DImode */
891 {4, 4}, /* cost of storing MMX registers
892 in SImode and DImode */
893 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
894 {4, 3, 6, 12, 24}, /* cost of loading SSE registers
895 in 32,64,128,256 and 512-bit */
896 {4, 3, 6, 12, 24}, /* cost of unaligned loads. */
897 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
898 in 32,64,128,256 and 512-bit */
899 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
900 5, 5, /* SSE->integer and integer->SSE moves */
901 4, 4, /* Gather load static, per_elt. */
902 4, 4, /* Gather store static, per_elt. */
903 64, /* size of l1 cache. */
904 512, /* size of l2 cache. */
905 64, /* size of prefetch block */
906 /* New AMD processors never drop prefetches; if they cannot be performed
907 immediately, they are queued. We set number of simultaneous prefetches
908 to a large constant to reflect this (it probably is not a good idea not
909 to limit number of prefetches at all, as their execution also takes some
910 time). */
911 100, /* number of parallel prefetches */
912 3, /* Branch cost */
913 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
914 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
915 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
916 COSTS_N_INSNS (2), /* cost of FABS instruction. */
917 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
918 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
920 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
921 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
922 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
923 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
924 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
925 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
926 /* 11-16 */
927 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
928 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
929 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
930 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
931 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
/* String-operation strategy tables defined just before this initializer.  */
932 k8_memcpy,
933 k8_memset,
934 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
935 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
/* NOTE(review): the alignment strings below presumably use the -falign-*
   option syntax (n:m:n2) -- confirm against the code that parses them.  */
936 "16:8:8", /* Loop alignment. */
937 "16:8:8", /* Jump alignment. */
938 "0:0:8", /* Label alignment. */
939 "16", /* Func alignment. */
942 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
943 very small blocks it is better to use a loop. For large blocks, a libcall can
944 use non-temporal accesses and beat the inline expansion considerably. */
/* memcpy expansion strategy used by the AMDFAM10 cost table: two positional
   entries.
   NOTE(review): entry 0 uses rep_prefix_4_byte and entry 1 uses
   rep_prefix_8_byte, so the entries presumably select 32-bit vs 64-bit
   code -- confirm.  Each inner triple looks like {max block size,
   algorithm, flag}, with -1 as the catch-all size -- confirm against the
   stringop_algs declaration, which is not visible here.  */
945 static stringop_algs amdfam10_memcpy[2] = {
946 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
947 {-1, rep_prefix_4_byte, false}}},
948 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
949 {-1, libcall, false}}}};
/* memset expansion strategy used by the AMDFAM10 cost table: two positional
   entries.
   NOTE(review): entry 0 uses rep_prefix_4_byte and entry 1 uses
   rep_prefix_8_byte, so the entries presumably select 32-bit vs 64-bit
   code -- confirm.  Each inner triple looks like {max block size,
   algorithm, flag}, with -1 as the catch-all size -- confirm against the
   stringop_algs declaration, which is not visible here.  */
950 static stringop_algs amdfam10_memset[2] = {
951 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
952 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
953 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
954 {-1, libcall, false}}}};
/* Operation cost table used when tuning for AMDFAM10.  The initializer is
   positional: entries must stay in the declaration order of struct
   processor_costs, which is defined outside this part of the file.  Values
   wrapped in COSTS_N_INSNS are instruction costs; per the file header,
   processor costs are relative to an add.
   NOTE(review): unlike athlon_cost and k8_cost (static const) this table
   is a mutable object with no storage-class specifier -- confirm whether
   that is intentional before qualifying it.  */
955 struct processor_costs amdfam10_cost = {
956 COSTS_N_INSNS (1), /* cost of an add instruction */
957 COSTS_N_INSNS (2), /* cost of a lea instruction */
958 COSTS_N_INSNS (1), /* variable shift costs */
959 COSTS_N_INSNS (1), /* constant shift costs */
960 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
961 COSTS_N_INSNS (4), /* HI */
962 COSTS_N_INSNS (3), /* SI */
963 COSTS_N_INSNS (4), /* DI */
964 COSTS_N_INSNS (5)}, /* other */
965 0, /* cost of multiply per each bit set */
966 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
967 COSTS_N_INSNS (35), /* HI */
968 COSTS_N_INSNS (51), /* SI */
969 COSTS_N_INSNS (83), /* DI */
970 COSTS_N_INSNS (83)}, /* other */
971 COSTS_N_INSNS (1), /* cost of movsx */
972 COSTS_N_INSNS (1), /* cost of movzx */
973 8, /* "large" insn */
974 9, /* MOVE_RATIO */
976 /* All move costs are relative to integer->integer move times 2 and thus
977 they are latency*2. */
978 4, /* cost for loading QImode using movzbl */
979 {3, 4, 3}, /* cost of loading integer registers
980 in QImode, HImode and SImode.
981 Relative to reg-reg move (2). */
982 {3, 4, 3}, /* cost of storing integer registers */
983 4, /* cost of reg,reg fld/fst */
984 {4, 4, 12}, /* cost of loading fp registers
985 in SFmode, DFmode and XFmode */
986 {6, 6, 8}, /* cost of storing fp registers
987 in SFmode, DFmode and XFmode */
988 2, /* cost of moving MMX register */
989 {3, 3}, /* cost of loading MMX registers
990 in SImode and DImode */
991 {4, 4}, /* cost of storing MMX registers
992 in SImode and DImode */
993 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
994 {4, 4, 3, 6, 12}, /* cost of loading SSE registers
995 in 32,64,128,256 and 512-bit */
/* NOTE(review): the 256-bit unaligned-load cost (7) differs from the
   aligned one (6) while the other four widths match -- confirm this is
   intended.  */
996 {4, 4, 3, 7, 12}, /* cost of unaligned loads. */
997 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
998 in 32,64,128,256 and 512-bit */
999 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
1000 3, 3, /* SSE->integer and integer->SSE moves */
1001 /* On K8:
1002 MOVD reg64, xmmreg Double FSTORE 4
1003 MOVD reg32, xmmreg Double FSTORE 4
1004 On AMDFAM10:
1005 MOVD reg64, xmmreg Double FADD 3
1006 1/1 1/1
1007 MOVD reg32, xmmreg Double FADD 3
1008 1/1 1/1 */
1009 4, 4, /* Gather load static, per_elt. */
1010 4, 4, /* Gather store static, per_elt. */
1011 64, /* size of l1 cache. */
1012 512, /* size of l2 cache. */
1013 64, /* size of prefetch block */
1014 /* New AMD processors never drop prefetches; if they cannot be performed
1015 immediately, they are queued. We set number of simultaneous prefetches
1016 to a large constant to reflect this (it probably is not a good idea not
1017 to limit number of prefetches at all, as their execution also takes some
1018 time). */
1019 100, /* number of parallel prefetches */
1020 2, /* Branch cost */
1021 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1022 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1023 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1024 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1025 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1026 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1028 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1029 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1030 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1031 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1032 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
1033 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
1034 /* 11-16 */
1035 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
1036 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
1037 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
1038 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
1039 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
/* String-operation strategy tables defined just before this initializer.  */
1040 amdfam10_memcpy,
1041 amdfam10_memset,
1042 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1043 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* NOTE(review): the alignment strings below presumably use the -falign-*
   option syntax (n:m:n2) -- confirm against the code that parses them.  */
1044 "32:25:8", /* Loop alignment. */
1045 "32:8:8", /* Jump alignment. */
1046 "0:0:8", /* Label alignment. */
1047 "32", /* Func alignment. */
1050 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
1051 very small blocks it is better to use a loop. For large blocks, a libcall
1052 can use non-temporal accesses and beat the inline expansion considerably. */
/* memcpy expansion strategy used by the BDVER1 cost table: two positional
   entries.
   NOTE(review): entry 0 uses rep_prefix_4_byte and entry 1 uses
   rep_prefix_8_byte, so the entries presumably select 32-bit vs 64-bit
   code -- confirm.  Each inner triple looks like {max block size,
   algorithm, flag}, with -1 as the catch-all size -- confirm against the
   stringop_algs declaration, which is not visible here.  */
1053 static stringop_algs bdver1_memcpy[2] = {
1054 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1055 {-1, rep_prefix_4_byte, false}}},
1056 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1057 {-1, libcall, false}}}};
/* memset expansion strategy used by the BDVER1 cost table: two positional
   entries.
   NOTE(review): entry 0 uses rep_prefix_4_byte and entry 1 uses
   rep_prefix_8_byte, so the entries presumably select 32-bit vs 64-bit
   code -- confirm.  Each inner triple looks like {max block size,
   algorithm, flag}, with -1 as the catch-all size -- confirm against the
   stringop_algs declaration, which is not visible here.  */
1058 static stringop_algs bdver1_memset[2] = {
1059 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1060 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1061 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1062 {-1, libcall, false}}}};
/* Operation cost table used when tuning for BDVER1.  The initializer is
   positional: entries must stay in the declaration order of struct
   processor_costs, which is defined outside this part of the file.  Values
   wrapped in COSTS_N_INSNS are instruction costs; per the file header,
   processor costs are relative to an add.  */
1064 const struct processor_costs bdver1_cost = {
1065 COSTS_N_INSNS (1), /* cost of an add instruction */
1066 COSTS_N_INSNS (1), /* cost of a lea instruction */
1067 COSTS_N_INSNS (1), /* variable shift costs */
1068 COSTS_N_INSNS (1), /* constant shift costs */
1069 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1070 COSTS_N_INSNS (4), /* HI */
1071 COSTS_N_INSNS (4), /* SI */
1072 COSTS_N_INSNS (6), /* DI */
1073 COSTS_N_INSNS (6)}, /* other */
1074 0, /* cost of multiply per each bit set */
1075 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1076 COSTS_N_INSNS (35), /* HI */
1077 COSTS_N_INSNS (51), /* SI */
1078 COSTS_N_INSNS (83), /* DI */
1079 COSTS_N_INSNS (83)}, /* other */
1080 COSTS_N_INSNS (1), /* cost of movsx */
1081 COSTS_N_INSNS (1), /* cost of movzx */
1082 8, /* "large" insn */
1083 9, /* MOVE_RATIO */
1085 /* All move costs are relative to integer->integer move times 2 and thus
1086 they are latency*2. */
1087 8, /* cost for loading QImode using movzbl */
1088 {8, 8, 8}, /* cost of loading integer registers
1089 in QImode, HImode and SImode.
1090 Relative to reg-reg move (2). */
1091 {8, 8, 8}, /* cost of storing integer registers */
1092 4, /* cost of reg,reg fld/fst */
1093 {12, 12, 28}, /* cost of loading fp registers
1094 in SFmode, DFmode and XFmode */
1095 {10, 10, 18}, /* cost of storing fp registers
1096 in SFmode, DFmode and XFmode */
1097 4, /* cost of moving MMX register */
1098 {12, 12}, /* cost of loading MMX registers
1099 in SImode and DImode */
1100 {10, 10}, /* cost of storing MMX registers
1101 in SImode and DImode */
1102 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1103 {12, 12, 10, 20, 30}, /* cost of loading SSE registers
1104 in 32,64,128,256 and 512-bit */
1105 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
1106 {10, 10, 10, 20, 30}, /* cost of storing SSE registers
1107 in 32,64,128,256 and 512-bit */
1108 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
1109 16, 20, /* SSE->integer and integer->SSE moves */
1110 12, 12, /* Gather load static, per_elt. */
1111 10, 10, /* Gather store static, per_elt. */
1112 16, /* size of l1 cache. */
1113 2048, /* size of l2 cache. */
1114 64, /* size of prefetch block */
1115 /* New AMD processors never drop prefetches; if they cannot be performed
1116 immediately, they are queued. We set number of simultaneous prefetches
1117 to a large constant to reflect this (it probably is not a good idea not
1118 to limit number of prefetches at all, as their execution also takes some
1119 time). */
1120 100, /* number of parallel prefetches */
1121 2, /* Branch cost */
1122 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1123 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1124 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1125 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1126 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1127 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1129 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1130 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1131 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1132 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1133 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1134 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1135 /* 9-24 */
1136 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1137 /* 9-27 */
1138 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1139 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1140 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1141 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
/* String-operation strategy tables defined just before this initializer.  */
1142 bdver1_memcpy,
1143 bdver1_memset,
1144 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1145 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
/* NOTE(review): the alignment strings below presumably use the -falign-*
   option syntax (n:m:n2) -- confirm against the code that parses them.
   The function alignment "11" is not a power of two, unlike the 16 used
   for loops and jumps here -- confirm it is intended.  */
1146 "16:11:8", /* Loop alignment. */
1147 "16:8:8", /* Jump alignment. */
1148 "0:0:8", /* Label alignment. */
1149 "11", /* Func alignment. */
1152 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
1153 very small blocks it is better to use a loop. For large blocks, a libcall
1154 can use non-temporal accesses and beat the inline expansion considerably. */
/* memcpy expansion strategy used by the BDVER2 cost table: two positional
   entries.
   NOTE(review): entry 0 uses rep_prefix_4_byte and entry 1 uses
   rep_prefix_8_byte, so the entries presumably select 32-bit vs 64-bit
   code -- confirm.  Each inner triple looks like {max block size,
   algorithm, flag}, with -1 as the catch-all size -- confirm against the
   stringop_algs declaration, which is not visible here.  */
1156 static stringop_algs bdver2_memcpy[2] = {
1157 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1158 {-1, rep_prefix_4_byte, false}}},
1159 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1160 {-1, libcall, false}}}};
/* memset expansion strategy used by the BDVER2 cost table: two positional
   entries.
   NOTE(review): entry 0 uses rep_prefix_4_byte and entry 1 uses
   rep_prefix_8_byte, so the entries presumably select 32-bit vs 64-bit
   code -- confirm.  Each inner triple looks like {max block size,
   algorithm, flag}, with -1 as the catch-all size -- confirm against the
   stringop_algs declaration, which is not visible here.  */
1161 static stringop_algs bdver2_memset[2] = {
1162 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1163 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1164 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1165 {-1, libcall, false}}}};
/* Operation cost table used when tuning for BDVER2.  The initializer is
   positional: entries must stay in the declaration order of struct
   processor_costs, which is defined outside this part of the file.  Values
   wrapped in COSTS_N_INSNS are instruction costs; per the file header,
   processor costs are relative to an add.  */
1167 const struct processor_costs bdver2_cost = {
1168 COSTS_N_INSNS (1), /* cost of an add instruction */
1169 COSTS_N_INSNS (1), /* cost of a lea instruction */
1170 COSTS_N_INSNS (1), /* variable shift costs */
1171 COSTS_N_INSNS (1), /* constant shift costs */
1172 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1173 COSTS_N_INSNS (4), /* HI */
1174 COSTS_N_INSNS (4), /* SI */
1175 COSTS_N_INSNS (6), /* DI */
1176 COSTS_N_INSNS (6)}, /* other */
1177 0, /* cost of multiply per each bit set */
1178 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1179 COSTS_N_INSNS (35), /* HI */
1180 COSTS_N_INSNS (51), /* SI */
1181 COSTS_N_INSNS (83), /* DI */
1182 COSTS_N_INSNS (83)}, /* other */
1183 COSTS_N_INSNS (1), /* cost of movsx */
1184 COSTS_N_INSNS (1), /* cost of movzx */
1185 8, /* "large" insn */
1186 9, /* MOVE_RATIO */
1188 /* All move costs are relative to integer->integer move times 2 and thus
1189 they are latency*2. */
1190 8, /* cost for loading QImode using movzbl */
1191 {8, 8, 8}, /* cost of loading integer registers
1192 in QImode, HImode and SImode.
1193 Relative to reg-reg move (2). */
1194 {8, 8, 8}, /* cost of storing integer registers */
1195 4, /* cost of reg,reg fld/fst */
1196 {12, 12, 28}, /* cost of loading fp registers
1197 in SFmode, DFmode and XFmode */
1198 {10, 10, 18}, /* cost of storing fp registers
1199 in SFmode, DFmode and XFmode */
1200 4, /* cost of moving MMX register */
1201 {12, 12}, /* cost of loading MMX registers
1202 in SImode and DImode */
1203 {10, 10}, /* cost of storing MMX registers
1204 in SImode and DImode */
1205 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1206 {12, 12, 10, 20, 30}, /* cost of loading SSE registers
1207 in 32,64,128,256 and 512-bit */
1208 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
1209 {10, 10, 10, 20, 30}, /* cost of storing SSE registers
1210 in 32,64,128,256 and 512-bit */
1211 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
1212 16, 20, /* SSE->integer and integer->SSE moves */
1213 12, 12, /* Gather load static, per_elt. */
1214 10, 10, /* Gather store static, per_elt. */
1215 16, /* size of l1 cache. */
1216 2048, /* size of l2 cache. */
1217 64, /* size of prefetch block */
1218 /* New AMD processors never drop prefetches; if they cannot be performed
1219 immediately, they are queued. We set number of simultaneous prefetches
1220 to a large constant to reflect this (it probably is not a good idea not
1221 to limit number of prefetches at all, as their execution also takes some
1222 time). */
1223 100, /* number of parallel prefetches */
1224 2, /* Branch cost */
1225 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1226 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1227 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1228 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1229 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1230 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1232 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1233 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1234 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1235 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1236 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1237 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1238 /* 9-24 */
1239 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1240 /* 9-27 */
1241 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1242 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1243 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1244 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
/* String-operation strategy tables defined just before this initializer.  */
1245 bdver2_memcpy,
1246 bdver2_memset,
1247 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1248 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
/* NOTE(review): the alignment strings below presumably use the -falign-*
   option syntax (n:m:n2) -- confirm against the code that parses them.
   The function alignment "11" is not a power of two, unlike the 16 used
   for loops and jumps here -- confirm it is intended.  */
1249 "16:11:8", /* Loop alignment. */
1250 "16:8:8", /* Jump alignment. */
1251 "0:0:8", /* Label alignment. */
1252 "11", /* Func alignment. */
1256 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1257 very small blocks it is better to use a loop. For large blocks, a libcall
1258 can use non-temporal accesses and beat the inline expansion considerably. */
/* memcpy expansion strategy used by the BDVER3 cost table: two positional
   entries.
   NOTE(review): entry 0 uses rep_prefix_4_byte and entry 1 uses
   rep_prefix_8_byte, so the entries presumably select 32-bit vs 64-bit
   code -- confirm.  Each inner triple looks like {max block size,
   algorithm, flag}, with -1 as the catch-all size -- confirm against the
   stringop_algs declaration, which is not visible here.  */
1259 static stringop_algs bdver3_memcpy[2] = {
1260 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1261 {-1, rep_prefix_4_byte, false}}},
1262 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1263 {-1, libcall, false}}}};
/* memset expansion strategy used by the BDVER3 cost table: two positional
   entries.
   NOTE(review): entry 0 uses rep_prefix_4_byte and entry 1 uses
   rep_prefix_8_byte, so the entries presumably select 32-bit vs 64-bit
   code -- confirm.  Each inner triple looks like {max block size,
   algorithm, flag}, with -1 as the catch-all size -- confirm against the
   stringop_algs declaration, which is not visible here.  */
1264 static stringop_algs bdver3_memset[2] = {
1265 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1266 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1267 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1268 {-1, libcall, false}}}};
/* Operation cost table used when tuning for BDVER3.  The initializer is
   positional: entries must stay in the declaration order of struct
   processor_costs, which is defined outside this part of the file.  Values
   wrapped in COSTS_N_INSNS are instruction costs; per the file header,
   processor costs are relative to an add.
   NOTE(review): unlike bdver1_cost and bdver2_cost this table is not
   const-qualified -- confirm whether that is intentional before
   qualifying it.  */
1269 struct processor_costs bdver3_cost = {
1270 COSTS_N_INSNS (1), /* cost of an add instruction */
1271 COSTS_N_INSNS (1), /* cost of a lea instruction */
1272 COSTS_N_INSNS (1), /* variable shift costs */
1273 COSTS_N_INSNS (1), /* constant shift costs */
1274 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1275 COSTS_N_INSNS (4), /* HI */
1276 COSTS_N_INSNS (4), /* SI */
1277 COSTS_N_INSNS (6), /* DI */
1278 COSTS_N_INSNS (6)}, /* other */
1279 0, /* cost of multiply per each bit set */
1280 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1281 COSTS_N_INSNS (35), /* HI */
1282 COSTS_N_INSNS (51), /* SI */
1283 COSTS_N_INSNS (83), /* DI */
1284 COSTS_N_INSNS (83)}, /* other */
1285 COSTS_N_INSNS (1), /* cost of movsx */
1286 COSTS_N_INSNS (1), /* cost of movzx */
1287 8, /* "large" insn */
1288 9, /* MOVE_RATIO */
1290 /* All move costs are relative to integer->integer move times 2 and thus
1291 they are latency*2. */
1292 8, /* cost for loading QImode using movzbl */
1293 {8, 8, 8}, /* cost of loading integer registers
1294 in QImode, HImode and SImode.
1295 Relative to reg-reg move (2). */
1296 {8, 8, 8}, /* cost of storing integer registers */
1297 4, /* cost of reg,reg fld/fst */
1298 {12, 12, 28}, /* cost of loading fp registers
1299 in SFmode, DFmode and XFmode */
1300 {10, 10, 18}, /* cost of storing fp registers
1301 in SFmode, DFmode and XFmode */
1302 4, /* cost of moving MMX register */
1303 {12, 12}, /* cost of loading MMX registers
1304 in SImode and DImode */
1305 {10, 10}, /* cost of storing MMX registers
1306 in SImode and DImode */
1307 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1308 {12, 12, 10, 20, 30}, /* cost of loading SSE registers
1309 in 32,64,128,256 and 512-bit */
1310 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
1311 {10, 10, 10, 20, 30}, /* cost of storing SSE registers
1312 in 32,64,128,256 and 512-bit */
1313 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
1314 16, 20, /* SSE->integer and integer->SSE moves */
1315 12, 12, /* Gather load static, per_elt. */
1316 10, 10, /* Gather store static, per_elt. */
1317 16, /* size of l1 cache. */
1318 2048, /* size of l2 cache. */
1319 64, /* size of prefetch block */
1320 /* New AMD processors never drop prefetches; if they cannot be performed
1321 immediately, they are queued. We set number of simultaneous prefetches
1322 to a large constant to reflect this (it probably is not a good idea not
1323 to limit number of prefetches at all, as their execution also takes some
1324 time). */
1325 100, /* number of parallel prefetches */
1326 2, /* Branch cost */
1327 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1328 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1329 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1330 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1331 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1332 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1334 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1335 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1336 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1337 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1338 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1339 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1340 /* 9-24 */
1341 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1342 /* 9-27 */
1343 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1344 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1345 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1346 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
/* String-operation strategy tables defined just before this initializer.  */
1347 bdver3_memcpy,
1348 bdver3_memset,
1349 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1350 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
/* NOTE(review): the alignment strings below presumably use the -falign-*
   option syntax (n:m:n2) -- confirm against the code that parses them.
   The function alignment "11" is not a power of two, unlike the 16 used
   for loops and jumps here -- confirm it is intended.  */
1351 "16:11:8", /* Loop alignment. */
1352 "16:8:8", /* Jump alignment. */
1353 "0:0:8", /* Label alignment. */
1354 "11", /* Func alignment. */
1357 /* BDVER4 has an optimized REP instruction for medium-sized blocks, but for
1358 very small blocks it is better to use a loop. For large blocks, a libcall
1359 can use non-temporal accesses and beat the inline expansion considerably. */
/* memcpy expansion strategy used by the BDVER4 cost table: two positional
   entries.
   NOTE(review): entry 0 uses rep_prefix_4_byte and entry 1 uses
   rep_prefix_8_byte, so the entries presumably select 32-bit vs 64-bit
   code -- confirm.  Each inner triple looks like {max block size,
   algorithm, flag}, with -1 as the catch-all size -- confirm against the
   stringop_algs declaration, which is not visible here.  */
1360 static stringop_algs bdver4_memcpy[2] = {
1361 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1362 {-1, rep_prefix_4_byte, false}}},
1363 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1364 {-1, libcall, false}}}};
/* memset expansion strategy used by the BDVER4 cost table: two positional
   entries.
   NOTE(review): entry 0 uses rep_prefix_4_byte and entry 1 uses
   rep_prefix_8_byte, so the entries presumably select 32-bit vs 64-bit
   code -- confirm.  Each inner triple looks like {max block size,
   algorithm, flag}, with -1 as the catch-all size -- confirm against the
   stringop_algs declaration, which is not visible here.  */
1365 static stringop_algs bdver4_memset[2] = {
1366 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1367 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1368 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1369 {-1, libcall, false}}}};
/* Operation cost table used when tuning for BDVER4.  The initializer is
   positional: entries must stay in the declaration order of struct
   processor_costs, which is defined outside this part of the file.  Values
   wrapped in COSTS_N_INSNS are instruction costs; per the file header,
   processor costs are relative to an add.
   NOTE(review): unlike bdver1_cost and bdver2_cost this table is not
   const-qualified -- confirm whether that is intentional before
   qualifying it.  */
1370 struct processor_costs bdver4_cost = {
1371 COSTS_N_INSNS (1), /* cost of an add instruction */
1372 COSTS_N_INSNS (1), /* cost of a lea instruction */
1373 COSTS_N_INSNS (1), /* variable shift costs */
1374 COSTS_N_INSNS (1), /* constant shift costs */
1375 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1376 COSTS_N_INSNS (4), /* HI */
1377 COSTS_N_INSNS (4), /* SI */
1378 COSTS_N_INSNS (6), /* DI */
1379 COSTS_N_INSNS (6)}, /* other */
1380 0, /* cost of multiply per each bit set */
1381 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1382 COSTS_N_INSNS (35), /* HI */
1383 COSTS_N_INSNS (51), /* SI */
1384 COSTS_N_INSNS (83), /* DI */
1385 COSTS_N_INSNS (83)}, /* other */
1386 COSTS_N_INSNS (1), /* cost of movsx */
1387 COSTS_N_INSNS (1), /* cost of movzx */
1388 8, /* "large" insn */
1389 9, /* MOVE_RATIO */
1391 /* All move costs are relative to integer->integer move times 2 and thus
1392 they are latency*2. */
1393 8, /* cost for loading QImode using movzbl */
1394 {8, 8, 8}, /* cost of loading integer registers
1395 in QImode, HImode and SImode.
1396 Relative to reg-reg move (2). */
1397 {8, 8, 8}, /* cost of storing integer registers */
1398 4, /* cost of reg,reg fld/fst */
1399 {12, 12, 28}, /* cost of loading fp registers
1400 in SFmode, DFmode and XFmode */
1401 {10, 10, 18}, /* cost of storing fp registers
1402 in SFmode, DFmode and XFmode */
1403 4, /* cost of moving MMX register */
1404 {12, 12}, /* cost of loading MMX registers
1405 in SImode and DImode */
1406 {10, 10}, /* cost of storing MMX registers
1407 in SImode and DImode */
1408 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1409 {12, 12, 10, 20, 30}, /* cost of loading SSE registers
1410 in 32,64,128,256 and 512-bit */
1411 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
1412 {10, 10, 10, 20, 30}, /* cost of storing SSE registers
1413 in 32,64,128,256 and 512-bit */
1414 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
1415 16, 20, /* SSE->integer and integer->SSE moves */
1416 12, 12, /* Gather load static, per_elt. */
1417 10, 10, /* Gather store static, per_elt. */
1418 16, /* size of l1 cache. */
1419 2048, /* size of l2 cache. */
1420 64, /* size of prefetch block */
1421 /* New AMD processors never drop prefetches; if they cannot be performed
1422 immediately, they are queued. We set number of simultaneous prefetches
1423 to a large constant to reflect this (it probably is not a good idea not
1424 to limit number of prefetches at all, as their execution also takes some
1425 time). */
1426 100, /* number of parallel prefetches */
1427 2, /* Branch cost */
1428 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1429 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1430 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1431 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1432 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1433 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1435 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1436 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1437 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1438 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1439 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1440 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1441 /* 9-24 */
1442 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1443 /* 9-27 */
1444 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1445 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1446 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1447 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
/* String-operation strategy tables defined just before this initializer.  */
1448 bdver4_memcpy,
1449 bdver4_memset,
1450 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1451 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
/* NOTE(review): the alignment strings below presumably use the -falign-*
   option syntax (n:m:n2) -- confirm against the code that parses them.
   The function alignment "11" is not a power of two, unlike the 16 used
   for loops and jumps here -- confirm it is intended.  */
1452 "16:11:8", /* Loop alignment. */
1453 "16:8:8", /* Jump alignment. */
1454 "0:0:8", /* Label alignment. */
1455 "11", /* Func alignment. */
1459 /* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1460 very small blocks it is better to use a loop. For large blocks, a libcall
1461 can do non-temporal accesses and beat inline expansion considerably. */
/* Strategy table for expanding memcpy when tuning for znver1; it is
   plugged into znver1_cost by name.  Each of the two entries is an
   algorithm (libcall here) followed by a list of {size, algorithm, flag}
   triples, the last of which has size -1.
   NOTE(review): presumably entry 0 is used for 32-bit and entry 1 for
   64-bit code, the leading algorithm is the choice for unknown sizes, and
   each size is an upper bound in bytes with -1 meaning "no limit" --
   confirm against the stringop_algs declaration.  */
1462 static stringop_algs znver1_memcpy[2] = {
/* Entry 0: loop, then unrolled loop, then rep with the 4-byte prefix.  */
1463 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1464 {-1, rep_prefix_4_byte, false}}},
/* Entry 1: loop, then rep with the 8-byte prefix, then libcall.  */
1465 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1466 {-1, libcall, false}}}};
/* Strategy table for expanding memset when tuning for znver1; it is
   plugged into znver1_cost by name.  Same layout as the memcpy tables in
   this file: two entries, each an algorithm followed by
   {size, algorithm, flag} triples ending with a size of -1.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and -1 marks the open-ended last range -- confirm against the
   stringop_algs declaration.  */
1467 static stringop_algs znver1_memset[2] = {
/* Entry 0: loop, unrolled loop, rep with the 4-byte prefix, then libcall.  */
1468 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1469 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
/* Entry 1: unrolled loop, rep with the 8-byte prefix, then libcall.  */
1470 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1471 {-1, libcall, false}}}};
/* Cost table for znver1 tuning.  The initializer is positional: every
   value is identified only by the comment that follows it, so entries
   must not be reordered, added or removed without checking the
   declaration of struct processor_costs.  Instruction costs are wrapped
   in COSTS_N_INSNS; the load/store/move costs further down are plain
   numbers on the "reg-reg move = 2" scale described in the comments
   below.  Unlike the other tables in this part of the file this object
   is not declared const.
   NOTE(review): the closing "};" of this initializer does not appear in
   this listing -- confirm against the upstream file.  */
1472 struct processor_costs znver1_cost = {
1473 COSTS_N_INSNS (1), /* cost of an add instruction. */
1474 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1475 COSTS_N_INSNS (1), /* variable shift costs. */
1476 COSTS_N_INSNS (1), /* constant shift costs. */
1477 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1478 COSTS_N_INSNS (3), /* HI. */
1479 COSTS_N_INSNS (3), /* SI. */
1480 COSTS_N_INSNS (3), /* DI. */
1481 COSTS_N_INSNS (3)}, /* other. */
1482 0, /* cost of multiply per each bit
1483 set. */
1484 /* Depending on parameters, idiv can get faster on ryzen. This is an upper
1485 bound. */
1486 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1487 COSTS_N_INSNS (22), /* HI. */
1488 COSTS_N_INSNS (30), /* SI. */
1489 COSTS_N_INSNS (45), /* DI. */
1490 COSTS_N_INSNS (45)}, /* other. */
1491 COSTS_N_INSNS (1), /* cost of movsx. */
1492 COSTS_N_INSNS (1), /* cost of movzx. */
1493 8, /* "large" insn. */
1494 9, /* MOVE_RATIO. */
1496 /* All move costs are relative to integer->integer move times 2 and thus
1497 they are latency*2. */
1499 /* reg-reg moves are done by renaming and thus they are even cheaper than
1500 1 cycle. Because reg-reg move cost is 2 and the following tables correspond
1501 to doubles of latencies, we do not model this correctly. It does not
1502 seem to make practical difference to bump prices up even more. */
1503 6, /* cost for loading QImode using
1504 movzbl. */
1505 {6, 6, 6}, /* cost of loading integer registers
1506 in QImode, HImode and SImode.
1507 Relative to reg-reg move (2). */
1508 {8, 8, 8}, /* cost of storing integer
1509 registers. */
1510 2, /* cost of reg,reg fld/fst. */
1511 {6, 6, 16}, /* cost of loading fp registers
1512 in SFmode, DFmode and XFmode. */
1513 {8, 8, 16}, /* cost of storing fp registers
1514 in SFmode, DFmode and XFmode. */
1515 2, /* cost of moving MMX register. */
1516 {6, 6}, /* cost of loading MMX registers
1517 in SImode and DImode. */
1518 {8, 8}, /* cost of storing MMX registers
1519 in SImode and DImode. */
1520 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
1521 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
1522 in 32,64,128,256 and 512-bit. */
1523 {6, 6, 6, 10, 20}, /* cost of unaligned loads. */
1524 {8, 8, 8, 8, 16}, /* cost of storing SSE registers
1525 in 32,64,128,256 and 512-bit. */
1526 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1527 6, 6, /* SSE->integer and integer->SSE moves. */
1528 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1529 throughput 12. Approx 9 uops do not depend on vector size and every load
1530 is 7 uops.
NOTE(review): VGATHERDPD is named twice with different figures; the second
mention presumably refers to another gather form -- confirm. */
1531 18, 8, /* Gather load static, per_elt. */
1532 18, 10, /* Gather store static, per_elt. */
1533 32, /* size of l1 cache. */
1534 512, /* size of l2 cache. */
1535 64, /* size of prefetch block. */
1536 /* New AMD processors never drop prefetches; if they cannot be performed
1537 immediately, they are queued. We set number of simultaneous prefetches
1538 to a large constant to reflect this (it probably is not a good idea not
1539 to limit number of prefetches at all, as their execution also takes some
1540 time). */
1541 100, /* number of parallel prefetches. */
1542 3, /* Branch cost. */
1543 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1544 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1545 /* Latency of fdiv is 8-15. */
1546 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1547 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1548 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1549 /* Latency of fsqrt is 4-10. */
1550 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1552 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1553 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1554 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1555 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1556 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1557 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1558 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1559 /* 9-13 */
1560 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1561 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1562 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1563 /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
1564 and it can execute 2 integer additions and 2 multiplications thus
1565 reassociation may make sense up to a width of 6. SPEC2k6 benchmarks suggest
1566 that 4 works better than 6 probably due to register pressure.
1568 Integer vector operations are taken by FP unit and execute 3 vector
1569 plus/minus operations per cycle but only one multiply. This is adjusted
1570 in ix86_reassociation_width. */
1571 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
/* String-operation strategy tables defined just before this initializer.  */
1572 znver1_memcpy,
1573 znver1_memset,
1574 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1575 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
/* Default alignments, given as strings.
   NOTE(review): presumably in the n:m:n2 form accepted by the -falign-*
   options -- confirm against the code that consumes these fields.  */
1576 "16", /* Loop alignment. */
1577 "16", /* Jump alignment. */
1578 "0:0:8", /* Label alignment. */
1579 "16", /* Func alignment. */
1582 /* skylake_cost should produce code tuned for the Skylake family of CPUs. */
/* Strategy table for expanding memcpy when tuning for Skylake; it is
   plugged into skylake_cost by name.  Two entries, each an algorithm
   followed by {size, algorithm, flag} triples ending with a size of -1.
   Unlike most tables in this file, one triple here sets the flag to true.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and the flag suppresses the alignment prologue -- confirm against
   the stringop_algs declaration.  */
1583 static stringop_algs skylake_memcpy[2] = {
1584 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1585 {libcall, {{16, loop, false}, {512, unrolled_loop, false},
1586 {-1, libcall, false}}}};
/* Strategy table for expanding memset when tuning for Skylake; it is
   plugged into skylake_cost by name.  Two entries, each an algorithm
   followed by {size, algorithm, flag} triples ending with a size of -1.
   Several triples set the flag to true.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and the flag suppresses the alignment prologue -- confirm against
   the stringop_algs declaration.  */
1588 static stringop_algs skylake_memset[2] = {
/* Entry 0: byte loop, loop, rep with the 4-byte prefix, then libcall.  */
1589 {libcall, {{6, loop_1_byte, true},
1590 {24, loop, true},
1591 {8192, rep_prefix_4_byte, true},
1592 {-1, libcall, false}}},
/* Entry 1: loop, unrolled loop, then libcall.  */
1593 {libcall, {{24, loop, true}, {512, unrolled_loop, false},
1594 {-1, libcall, false}}}};
/* Cost table for Skylake tuning.  The initializer is positional: every
   value is identified only by the comment that follows it, so entries
   must not be reordered, added or removed without checking the
   declaration of struct processor_costs.  Instruction costs are wrapped
   in COSTS_N_INSNS; the load/store/move costs are plain numbers relative
   to a reg-reg move cost of 2.
   NOTE(review): the closing "};" of this initializer does not appear in
   this listing -- confirm against the upstream file.  */
1596 static const
1597 struct processor_costs skylake_cost = {
1598 COSTS_N_INSNS (1), /* cost of an add instruction */
/* The "+1" makes lea marginally more expensive than an add (the file header
   assumes COSTS_N_INSNS (N) is (N)*4).  */
1599 COSTS_N_INSNS (1)+1, /* cost of a lea instruction */
1600 COSTS_N_INSNS (1), /* variable shift costs */
1601 COSTS_N_INSNS (1), /* constant shift costs */
1602 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1603 COSTS_N_INSNS (4), /* HI */
1604 COSTS_N_INSNS (3), /* SI */
1605 COSTS_N_INSNS (3), /* DI */
1606 COSTS_N_INSNS (3)}, /* other */
1607 0, /* cost of multiply per each bit set */
1608 /* Expanding div/mod currently doesn't consider parallelism. So the cost
1609 model is not realistic. We compensate by increasing the latencies a bit. */
1610 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
1611 COSTS_N_INSNS (11), /* HI */
1612 COSTS_N_INSNS (14), /* SI */
1613 COSTS_N_INSNS (76), /* DI */
1614 COSTS_N_INSNS (76)}, /* other */
1615 COSTS_N_INSNS (1), /* cost of movsx */
1616 COSTS_N_INSNS (0), /* cost of movzx */
1617 8, /* "large" insn */
1618 17, /* MOVE_RATIO */
1620 6, /* cost for loading QImode using movzbl */
1621 {4, 4, 4}, /* cost of loading integer registers
1622 in QImode, HImode and SImode.
1623 Relative to reg-reg move (2). */
/* NOTE(review): the SImode store cost (3) is lower than the QImode/HImode
   store costs and than the SImode load cost above -- confirm it is
   intended.  */
1624 {6, 6, 3}, /* cost of storing integer registers */
1625 2, /* cost of reg,reg fld/fst */
1626 {6, 6, 8}, /* cost of loading fp registers
1627 in SFmode, DFmode and XFmode */
1628 {6, 6, 10}, /* cost of storing fp registers
1629 in SFmode, DFmode and XFmode */
1630 2, /* cost of moving MMX register */
1631 {6, 6}, /* cost of loading MMX registers
1632 in SImode and DImode */
1633 {6, 6}, /* cost of storing MMX registers
1634 in SImode and DImode */
1635 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
1636 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
1637 in 32,64,128,256 and 512-bit */
1638 {6, 6, 6, 10, 20}, /* cost of unaligned loads. */
1639 {8, 8, 8, 12, 24}, /* cost of storing SSE registers
1640 in 32,64,128,256 and 512-bit */
1641 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1642 2, 2, /* SSE->integer and integer->SSE moves */
1643 20, 8, /* Gather load static, per_elt. */
1644 22, 10, /* Gather store static, per_elt. */
1645 64, /* size of l1 cache. */
1646 512, /* size of l2 cache. */
1647 64, /* size of prefetch block */
1648 6, /* number of parallel prefetches */
1649 3, /* Branch cost */
1650 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
1651 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1652 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1653 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1654 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1655 COSTS_N_INSNS (20), /* cost of FSQRT instruction. */
1657 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1658 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1659 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1660 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1661 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
1662 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
1663 COSTS_N_INSNS (11), /* cost of DIVSS instruction. */
1664 COSTS_N_INSNS (14), /* cost of DIVSD instruction. */
1665 COSTS_N_INSNS (12), /* cost of SQRTSS instruction. */
1666 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
1667 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
/* String-operation strategy tables defined just before this initializer.  */
1668 skylake_memcpy,
1669 skylake_memset,
1670 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1671 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* Default alignments, given as strings.
   NOTE(review): presumably in the n:m:n2 form accepted by the -falign-*
   options -- confirm against the code that consumes these fields.  */
1672 "16:11:8", /* Loop alignment. */
1673 "16:11:8", /* Jump alignment. */
1674 "0:0:8", /* Label alignment. */
1675 "16", /* Func alignment. */
1677 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1678 very small blocks it is better to use a loop. For large blocks, a libcall can
1679 do non-temporal accesses and beat inline expansion considerably. */
/* Strategy table for expanding memcpy when tuning for btver1; it is
   plugged into btver1_cost by name.  Two entries, each an algorithm
   (libcall here) followed by {size, algorithm, flag} triples ending with
   a size of -1.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each size is an upper bound in bytes with -1 meaning "no
   limit" -- confirm against the stringop_algs declaration.  */
1680 static stringop_algs btver1_memcpy[2] = {
/* Entry 0: loop, then unrolled loop, then rep with the 4-byte prefix.  */
1681 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1682 {-1, rep_prefix_4_byte, false}}},
/* Entry 1: loop, then rep with the 8-byte prefix, then libcall.  */
1683 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1684 {-1, libcall, false}}}};
/* Strategy table for expanding memset when tuning for btver1; it is
   plugged into btver1_cost by name.  Two entries, each an algorithm
   followed by {size, algorithm, flag} triples ending with a size of -1.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and -1 marks the open-ended last range -- confirm against the
   stringop_algs declaration.  */
1685 static stringop_algs btver1_memset[2] = {
/* Entry 0: loop, unrolled loop, rep with the 4-byte prefix, then libcall.  */
1686 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1687 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
/* Entry 1: unrolled loop, rep with the 8-byte prefix, then libcall.  */
1688 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1689 {-1, libcall, false}}}};
/* Cost table for btver1 tuning.  The initializer is positional: every
   value is identified only by the comment that follows it, so entries
   must not be reordered, added or removed without checking the
   declaration of struct processor_costs.  Instruction costs are wrapped
   in COSTS_N_INSNS; the load/store/move costs are plain numbers on the
   "latency*2" scale described below.
   NOTE(review): the closing "};" of this initializer does not appear in
   this listing -- confirm against the upstream file.  */
1690 const struct processor_costs btver1_cost = {
1691 COSTS_N_INSNS (1), /* cost of an add instruction */
1692 COSTS_N_INSNS (2), /* cost of a lea instruction */
1693 COSTS_N_INSNS (1), /* variable shift costs */
1694 COSTS_N_INSNS (1), /* constant shift costs */
1695 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1696 COSTS_N_INSNS (4), /* HI */
1697 COSTS_N_INSNS (3), /* SI */
1698 COSTS_N_INSNS (4), /* DI */
1699 COSTS_N_INSNS (5)}, /* other */
1700 0, /* cost of multiply per each bit set */
1701 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1702 COSTS_N_INSNS (35), /* HI */
1703 COSTS_N_INSNS (51), /* SI */
1704 COSTS_N_INSNS (83), /* DI */
1705 COSTS_N_INSNS (83)}, /* other */
1706 COSTS_N_INSNS (1), /* cost of movsx */
1707 COSTS_N_INSNS (1), /* cost of movzx */
1708 8, /* "large" insn */
1709 9, /* MOVE_RATIO */
1711 /* All move costs are relative to integer->integer move times 2 and thus
1712 they are latency*2. */
1713 8, /* cost for loading QImode using movzbl */
1714 {6, 8, 6}, /* cost of loading integer registers
1715 in QImode, HImode and SImode.
1716 Relative to reg-reg move (2). */
1717 {6, 8, 6}, /* cost of storing integer registers */
1718 4, /* cost of reg,reg fld/fst */
1719 {12, 12, 28}, /* cost of loading fp registers
1720 in SFmode, DFmode and XFmode */
1721 {12, 12, 38}, /* cost of storing fp registers
1722 in SFmode, DFmode and XFmode */
1723 4, /* cost of moving MMX register */
1724 {10, 10}, /* cost of loading MMX registers
1725 in SImode and DImode */
1726 {12, 12}, /* cost of storing MMX registers
1727 in SImode and DImode */
1728 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1729 {10, 10, 12, 24, 48}, /* cost of loading SSE registers
1730 in 32,64,128,256 and 512-bit */
1731 {10, 10, 12, 24, 48}, /* cost of unaligned loads. */
1732 {10, 10, 12, 24, 48}, /* cost of storing SSE registers
1733 in 32,64,128,256 and 512-bit */
1734 {10, 10, 12, 24, 48}, /* cost of unaligned stores. */
1735 14, 14, /* SSE->integer and integer->SSE moves */
1736 10, 10, /* Gather load static, per_elt. */
1737 10, 10, /* Gather store static, per_elt. */
1738 32, /* size of l1 cache. */
1739 512, /* size of l2 cache. */
1740 64, /* size of prefetch block */
1741 100, /* number of parallel prefetches */
1742 2, /* Branch cost */
1743 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1744 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1745 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1746 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1747 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1748 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1750 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1751 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1752 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1753 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1754 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1755 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1756 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1757 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
1758 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
1759 COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */
1760 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
/* String-operation strategy tables defined just before this initializer.  */
1761 btver1_memcpy,
1762 btver1_memset,
1763 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1764 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* Default alignments, given as strings.
   NOTE(review): presumably in the n:m:n2 form accepted by the -falign-*
   options -- confirm against the code that consumes these fields.  */
1765 "16:11:8", /* Loop alignment. */
1766 "16:8:8", /* Jump alignment. */
1767 "0:0:8", /* Label alignment. */
1768 "11", /* Func alignment. */
/* Strategy table for expanding memcpy when tuning for btver2; it is
   plugged into btver2_cost by name.  The values are the same as in
   btver1_memcpy.  Two entries, each an algorithm (libcall here) followed
   by {size, algorithm, flag} triples ending with a size of -1.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each size is an upper bound in bytes with -1 meaning "no
   limit" -- confirm against the stringop_algs declaration.  */
1771 static stringop_algs btver2_memcpy[2] = {
/* Entry 0: loop, then unrolled loop, then rep with the 4-byte prefix.  */
1772 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1773 {-1, rep_prefix_4_byte, false}}},
/* Entry 1: loop, then rep with the 8-byte prefix, then libcall.  */
1774 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1775 {-1, libcall, false}}}};
/* Strategy table for expanding memset when tuning for btver2; it is
   plugged into btver2_cost by name.  The values are the same as in
   btver1_memset.  Two entries, each an algorithm followed by
   {size, algorithm, flag} triples ending with a size of -1.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and -1 marks the open-ended last range -- confirm against the
   stringop_algs declaration.  */
1776 static stringop_algs btver2_memset[2] = {
/* Entry 0: loop, unrolled loop, rep with the 4-byte prefix, then libcall.  */
1777 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1778 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
/* Entry 1: unrolled loop, rep with the 8-byte prefix, then libcall.  */
1779 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1780 {-1, libcall, false}}}};
/* Cost table for btver2 tuning.  The initializer is positional: every
   value is identified only by the comment that follows it, so entries
   must not be reordered, added or removed without checking the
   declaration of struct processor_costs.  Instruction costs are wrapped
   in COSTS_N_INSNS; the load/store/move costs are plain numbers on the
   "latency*2" scale described below.
   NOTE(review): the closing "};" of this initializer does not appear in
   this listing -- confirm against the upstream file.  */
1781 const struct processor_costs btver2_cost = {
1782 COSTS_N_INSNS (1), /* cost of an add instruction */
1783 COSTS_N_INSNS (2), /* cost of a lea instruction */
1784 COSTS_N_INSNS (1), /* variable shift costs */
1785 COSTS_N_INSNS (1), /* constant shift costs */
1786 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1787 COSTS_N_INSNS (4), /* HI */
1788 COSTS_N_INSNS (3), /* SI */
1789 COSTS_N_INSNS (4), /* DI */
1790 COSTS_N_INSNS (5)}, /* other */
1791 0, /* cost of multiply per each bit set */
1792 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1793 COSTS_N_INSNS (35), /* HI */
1794 COSTS_N_INSNS (51), /* SI */
1795 COSTS_N_INSNS (83), /* DI */
1796 COSTS_N_INSNS (83)}, /* other */
1797 COSTS_N_INSNS (1), /* cost of movsx */
1798 COSTS_N_INSNS (1), /* cost of movzx */
1799 8, /* "large" insn */
1800 9, /* MOVE_RATIO */
1802 /* All move costs are relative to integer->integer move times 2 and thus
1803 they are latency*2. */
1804 8, /* cost for loading QImode using movzbl */
1805 {8, 8, 6}, /* cost of loading integer registers
1806 in QImode, HImode and SImode.
1807 Relative to reg-reg move (2). */
1808 {8, 8, 6}, /* cost of storing integer registers */
1809 4, /* cost of reg,reg fld/fst */
1810 {12, 12, 28}, /* cost of loading fp registers
1811 in SFmode, DFmode and XFmode */
1812 {12, 12, 38}, /* cost of storing fp registers
1813 in SFmode, DFmode and XFmode */
1814 4, /* cost of moving MMX register */
1815 {10, 10}, /* cost of loading MMX registers
1816 in SImode and DImode */
1817 {12, 12}, /* cost of storing MMX registers
1818 in SImode and DImode */
1819 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1820 {10, 10, 12, 24, 48}, /* cost of loading SSE registers
1821 in 32,64,128,256 and 512-bit */
1822 {10, 10, 12, 24, 48}, /* cost of unaligned loads. */
1823 {10, 10, 12, 24, 48}, /* cost of storing SSE registers
1824 in 32,64,128,256 and 512-bit */
1825 {10, 10, 12, 24, 48}, /* cost of unaligned stores. */
1826 14, 14, /* SSE->integer and integer->SSE moves */
1827 10, 10, /* Gather load static, per_elt. */
1828 10, 10, /* Gather store static, per_elt. */
1829 32, /* size of l1 cache. */
1830 2048, /* size of l2 cache. */
1831 64, /* size of prefetch block */
1832 100, /* number of parallel prefetches */
1833 2, /* Branch cost */
1834 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1835 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1836 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1837 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1838 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1839 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1841 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1842 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1843 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1844 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1845 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1846 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1847 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1848 COSTS_N_INSNS (19), /* cost of DIVSD instruction. */
1849 COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */
1850 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
1851 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
/* String-operation strategy tables defined just before this initializer.  */
1852 btver2_memcpy,
1853 btver2_memset,
1854 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1855 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* Default alignments, given as strings.
   NOTE(review): presumably in the n:m:n2 form accepted by the -falign-*
   options -- confirm against the code that consumes these fields.  */
1856 "16:11:8", /* Loop alignment. */
1857 "16:8:8", /* Jump alignment. */
1858 "0:0:8", /* Label alignment. */
1859 "11", /* Func alignment. */
/* Strategy table for expanding memcpy when tuning for Pentium 4; it is
   plugged into pentium4_cost by name.  Entry 0 is an algorithm followed
   by {size, algorithm, flag} triples ending with a size of -1.  Entry 1
   is DUMMY_STRINGOP_ALGS, which the top of this file defines as a table
   that always selects libcall.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code -- confirm against the stringop_algs declaration.  */
1862 static stringop_algs pentium4_memcpy[2] = {
1863 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1864 DUMMY_STRINGOP_ALGS};
/* Strategy table for expanding memset when tuning for Pentium 4; it is
   plugged into pentium4_cost by name.  Entry 0 is an algorithm followed
   by {size, algorithm, flag} triples ending with a size of -1.  Entry 1
   is DUMMY_STRINGOP_ALGS, which the top of this file defines as a table
   that always selects libcall.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code -- confirm against the stringop_algs declaration.  */
1865 static stringop_algs pentium4_memset[2] = {
1866 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1867 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1868 DUMMY_STRINGOP_ALGS};
/* Cost table for Pentium 4 tuning.  The initializer is positional: every
   value is identified only by the comment that follows it, so entries
   must not be reordered, added or removed without checking the
   declaration of struct processor_costs.  Instruction costs are wrapped
   in COSTS_N_INSNS; the load/store/move costs are plain numbers on the
   "latency*2" scale described below.
   NOTE(review): the closing "};" of this initializer does not appear in
   this listing -- confirm against the upstream file.  */
1870 static const
1871 struct processor_costs pentium4_cost = {
1872 COSTS_N_INSNS (1), /* cost of an add instruction */
1873 COSTS_N_INSNS (3), /* cost of a lea instruction */
1874 COSTS_N_INSNS (4), /* variable shift costs */
1875 COSTS_N_INSNS (4), /* constant shift costs */
1876 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1877 COSTS_N_INSNS (15), /* HI */
1878 COSTS_N_INSNS (15), /* SI */
1879 COSTS_N_INSNS (15), /* DI */
1880 COSTS_N_INSNS (15)}, /* other */
1881 0, /* cost of multiply per each bit set */
1882 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1883 COSTS_N_INSNS (56), /* HI */
1884 COSTS_N_INSNS (56), /* SI */
1885 COSTS_N_INSNS (56), /* DI */
1886 COSTS_N_INSNS (56)}, /* other */
1887 COSTS_N_INSNS (1), /* cost of movsx */
1888 COSTS_N_INSNS (1), /* cost of movzx */
1889 16, /* "large" insn */
1890 6, /* MOVE_RATIO */
1892 /* All move costs are relative to integer->integer move times 2 and thus
1893 they are latency*2. */
1894 5, /* cost for loading QImode using movzbl */
1895 {4, 5, 4}, /* cost of loading integer registers
1896 in QImode, HImode and SImode.
1897 Relative to reg-reg move (2). */
1898 {2, 3, 2}, /* cost of storing integer registers */
1899 12, /* cost of reg,reg fld/fst */
1900 {14, 14, 14}, /* cost of loading fp registers
1901 in SFmode, DFmode and XFmode */
1902 {14, 14, 14}, /* cost of storing fp registers
1903 in SFmode, DFmode and XFmode */
1904 12, /* cost of moving MMX register */
1905 {16, 16}, /* cost of loading MMX registers
1906 in SImode and DImode */
1907 {16, 16}, /* cost of storing MMX registers
1908 in SImode and DImode */
1909 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
1910 {16, 16, 16, 32, 64}, /* cost of loading SSE registers
1911 in 32,64,128,256 and 512-bit */
1912 {32, 32, 32, 64, 128}, /* cost of unaligned loads. */
1913 {16, 16, 16, 32, 64}, /* cost of storing SSE registers
1914 in 32,64,128,256 and 512-bit */
1915 {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
1916 20, 12, /* SSE->integer and integer->SSE moves */
1917 16, 16, /* Gather load static, per_elt. */
1918 16, 16, /* Gather store static, per_elt. */
1919 8, /* size of l1 cache. */
1920 256, /* size of l2 cache. */
1921 64, /* size of prefetch block */
1922 6, /* number of parallel prefetches */
1923 2, /* Branch cost */
1924 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1925 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1926 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1927 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1928 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1929 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1931 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1932 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1933 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1934 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1935 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1936 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1937 COSTS_N_INSNS (23), /* cost of DIVSS instruction. */
1938 COSTS_N_INSNS (38), /* cost of DIVSD instruction. */
1939 COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */
1940 COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */
1941 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
/* String-operation strategy tables defined just before this initializer.  */
1942 pentium4_memcpy,
1943 pentium4_memset,
1944 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1945 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* No alignment strings are supplied for this CPU.
   NOTE(review): presumably NULL makes the consumer fall back to a generic
   default -- confirm against the code that reads these fields.  */
1946 NULL, /* Loop alignment. */
1947 NULL, /* Jump alignment. */
1948 NULL, /* Label alignment. */
1949 NULL, /* Func alignment. */
/* Strategy table for expanding memcpy when tuning for Nocona; it is
   plugged into nocona_cost by name.  Two entries, each an algorithm
   (libcall here) followed by {size, algorithm, flag} triples ending with
   a size of -1.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each size is an upper bound in bytes with -1 meaning "no
   limit" -- confirm against the stringop_algs declaration.  */
1952 static stringop_algs nocona_memcpy[2] = {
/* Entry 0: byte loop, then rep with the 4-byte prefix.  */
1953 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
/* Entry 1: loop, rep with the 8-byte prefix, unrolled loop, then libcall.  */
1954 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1955 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
/* Strategy table for expanding memset when tuning for Nocona; it is
   plugged into nocona_cost by name.  Two entries, each an algorithm
   followed by {size, algorithm, flag} triples ending with a size of -1.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and -1 marks the open-ended last range -- confirm against the
   stringop_algs declaration.  */
1957 static stringop_algs nocona_memset[2] = {
/* Entry 0: byte loop, loop, rep with the 4-byte prefix, then libcall.  */
1958 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1959 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
/* Entry 1: loop, unrolled loop, rep with the 8-byte prefix, then libcall.  */
1960 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1961 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table for Nocona tuning.  The initializer is positional: every
   value is identified only by the comment that follows it, so entries
   must not be reordered, added or removed without checking the
   declaration of struct processor_costs.  Instruction costs are wrapped
   in COSTS_N_INSNS; the load/store/move costs are plain numbers on the
   "latency*2" scale described below.
   NOTE(review): the closing "};" of this initializer does not appear in
   this listing -- confirm against the upstream file.  */
1963 static const
1964 struct processor_costs nocona_cost = {
1965 COSTS_N_INSNS (1), /* cost of an add instruction */
1966 COSTS_N_INSNS (1), /* cost of a lea instruction */
1967 COSTS_N_INSNS (1), /* variable shift costs */
1968 COSTS_N_INSNS (1), /* constant shift costs */
1969 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1970 COSTS_N_INSNS (10), /* HI */
1971 COSTS_N_INSNS (10), /* SI */
1972 COSTS_N_INSNS (10), /* DI */
1973 COSTS_N_INSNS (10)}, /* other */
1974 0, /* cost of multiply per each bit set */
1975 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1976 COSTS_N_INSNS (66), /* HI */
1977 COSTS_N_INSNS (66), /* SI */
1978 COSTS_N_INSNS (66), /* DI */
1979 COSTS_N_INSNS (66)}, /* other */
1980 COSTS_N_INSNS (1), /* cost of movsx */
1981 COSTS_N_INSNS (1), /* cost of movzx */
1982 16, /* "large" insn */
1983 17, /* MOVE_RATIO */
1985 /* All move costs are relative to integer->integer move times 2 and thus
1986 they are latency*2. */
1987 4, /* cost for loading QImode using movzbl */
1988 {4, 4, 4}, /* cost of loading integer registers
1989 in QImode, HImode and SImode.
1990 Relative to reg-reg move (2). */
1991 {4, 4, 4}, /* cost of storing integer registers */
1992 12, /* cost of reg,reg fld/fst */
1993 {14, 14, 14}, /* cost of loading fp registers
1994 in SFmode, DFmode and XFmode */
1995 {14, 14, 14}, /* cost of storing fp registers
1996 in SFmode, DFmode and XFmode */
1997 14, /* cost of moving MMX register */
1998 {12, 12}, /* cost of loading MMX registers
1999 in SImode and DImode */
2000 {12, 12}, /* cost of storing MMX registers
2001 in SImode and DImode */
2002 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
2003 {12, 12, 12, 24, 48}, /* cost of loading SSE registers
2004 in 32,64,128,256 and 512-bit */
2005 {24, 24, 24, 48, 96}, /* cost of unaligned loads. */
2006 {12, 12, 12, 24, 48}, /* cost of storing SSE registers
2007 in 32,64,128,256 and 512-bit */
2008 {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
2009 20, 12, /* SSE->integer and integer->SSE moves */
2010 12, 12, /* Gather load static, per_elt. */
2011 12, 12, /* Gather store static, per_elt. */
2012 8, /* size of l1 cache. */
2013 1024, /* size of l2 cache. */
2014 64, /* size of prefetch block */
2015 8, /* number of parallel prefetches */
2016 1, /* Branch cost */
2017 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
2018 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2019 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
2020 COSTS_N_INSNS (3), /* cost of FABS instruction. */
2021 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
2022 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
2024 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
2025 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
2026 COSTS_N_INSNS (7), /* cost of MULSS instruction. */
2027 COSTS_N_INSNS (7), /* cost of MULSD instruction. */
2028 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
2029 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
2030 COSTS_N_INSNS (32), /* cost of DIVSS instruction. */
2031 COSTS_N_INSNS (40), /* cost of DIVSD instruction. */
2032 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
2033 COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */
2034 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
/* String-operation strategy tables defined just before this initializer.  */
2035 nocona_memcpy,
2036 nocona_memset,
2037 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2038 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* No alignment strings are supplied for this CPU.
   NOTE(review): presumably NULL makes the consumer fall back to a generic
   default -- confirm against the code that reads these fields.  */
2039 NULL, /* Loop alignment. */
2040 NULL, /* Jump alignment. */
2041 NULL, /* Label alignment. */
2042 NULL, /* Func alignment. */
/* Strategy table for expanding memcpy when tuning for Atom; it is
   plugged into atom_cost by name.  Two entries, each an algorithm
   (libcall here) followed by {size, algorithm, flag} triples ending with
   a size of -1.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and each size is an upper bound in bytes with -1 meaning "no
   limit" -- confirm against the stringop_algs declaration.  */
2045 static stringop_algs atom_memcpy[2] = {
/* Entry 0: loop, then rep with the 4-byte prefix.  */
2046 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
/* Entry 1: loop, rep with the 4-byte then the 8-byte prefix, then libcall.  */
2047 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2048 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Strategy table for expanding memset when tuning for Atom; it is
   plugged into atom_cost by name.  Two entries, each an algorithm
   followed by {size, algorithm, flag} triples ending with a size of -1.
   NOTE(review): presumably entry 0 is for 32-bit and entry 1 for 64-bit
   code, and -1 marks the open-ended last range -- confirm against the
   stringop_algs declaration.  */
2049 static stringop_algs atom_memset[2] = {
/* Entry 0: loop, unrolled loop, rep with the 4-byte prefix, then libcall.  */
2050 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2051 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
/* Entry 1: loop, unrolled loop, rep with the 8-byte prefix, then libcall.  */
2052 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2053 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table for Atom tuning.  The initializer is positional: every
   value is identified only by the comment that follows it, so entries
   must not be reordered, added or removed without checking the
   declaration of struct processor_costs.  Instruction costs are wrapped
   in COSTS_N_INSNS; the load/store/move costs are plain numbers on the
   "latency*2" scale described below.
   NOTE(review): the closing "};" of this initializer does not appear in
   this listing -- confirm against the upstream file.  */
2054 static const
2055 struct processor_costs atom_cost = {
2056 COSTS_N_INSNS (1), /* cost of an add instruction */
/* The "+ 1" makes lea marginally more expensive than an add (the file header
   assumes COSTS_N_INSNS (N) is (N)*4).  */
2057 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2058 COSTS_N_INSNS (1), /* variable shift costs */
2059 COSTS_N_INSNS (1), /* constant shift costs */
2060 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2061 COSTS_N_INSNS (4), /* HI */
2062 COSTS_N_INSNS (3), /* SI */
2063 COSTS_N_INSNS (4), /* DI */
2064 COSTS_N_INSNS (2)}, /* other */
2065 0, /* cost of multiply per each bit set */
2066 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2067 COSTS_N_INSNS (26), /* HI */
2068 COSTS_N_INSNS (42), /* SI */
2069 COSTS_N_INSNS (74), /* DI */
2070 COSTS_N_INSNS (74)}, /* other */
2071 COSTS_N_INSNS (1), /* cost of movsx */
2072 COSTS_N_INSNS (1), /* cost of movzx */
2073 8, /* "large" insn */
2074 17, /* MOVE_RATIO */
2076 /* All move costs are relative to integer->integer move times 2 and thus
2077 they are latency*2. */
2078 6, /* cost for loading QImode using movzbl */
2079 {6, 6, 6}, /* cost of loading integer registers
2080 in QImode, HImode and SImode.
2081 Relative to reg-reg move (2). */
2082 {6, 6, 6}, /* cost of storing integer registers */
2083 4, /* cost of reg,reg fld/fst */
2084 {6, 6, 18}, /* cost of loading fp registers
2085 in SFmode, DFmode and XFmode */
2086 {14, 14, 24}, /* cost of storing fp registers
2087 in SFmode, DFmode and XFmode */
2088 2, /* cost of moving MMX register */
2089 {8, 8}, /* cost of loading MMX registers
2090 in SImode and DImode */
2091 {10, 10}, /* cost of storing MMX registers
2092 in SImode and DImode */
2093 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2094 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
2095 in 32,64,128,256 and 512-bit */
2096 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
2097 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
2098 in 32,64,128,256 and 512-bit */
2099 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
2100 8, 6, /* SSE->integer and integer->SSE moves */
2101 8, 8, /* Gather load static, per_elt. */
2102 8, 8, /* Gather store static, per_elt. */
2103 32, /* size of l1 cache. */
2104 256, /* size of l2 cache. */
2105 64, /* size of prefetch block */
2106 6, /* number of parallel prefetches */
2107 3, /* Branch cost */
2108 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2109 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2110 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2111 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2112 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2113 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2115 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2116 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
2117 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2118 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2119 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2120 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2121 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
2122 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
2123 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
2124 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
2125 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
/* String-operation strategy tables defined just before this initializer.  */
2126 atom_memcpy,
2127 atom_memset,
2128 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2129 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* Default alignments, given as strings.
   NOTE(review): presumably in the n:m:n2 form accepted by the -falign-*
   options -- confirm against the code that consumes these fields.  */
2130 "16", /* Loop alignment. */
2131 "16:8:8", /* Jump alignment. */
2132 "0:0:8", /* Label alignment. */
2133 "16", /* Func alignment. */
/* Strategy table for expanding memcpy under the "slm" tuning.  The values
   are the same as in atom_memcpy.  Two entries, each an algorithm
   (libcall here) followed by {size, algorithm, flag} triples ending with
   a size of -1.
   NOTE(review): presumably consumed by slm_cost, whose initializer
   follows; presumably entry 0 is for 32-bit and entry 1 for 64-bit code --
   confirm against the stringop_algs declaration.  */
2136 static stringop_algs slm_memcpy[2] = {
/* Entry 0: loop, then rep with the 4-byte prefix.  */
2137 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
/* Entry 1: loop, rep with the 4-byte then the 8-byte prefix, then libcall.  */
2138 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2139 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Strategy table for expanding memset under the "slm" tuning.  The values
   are the same as in atom_memset.  Two entries, each an algorithm
   followed by {size, algorithm, flag} triples ending with a size of -1.
   NOTE(review): presumably consumed by slm_cost, whose initializer
   follows; presumably entry 0 is for 32-bit and entry 1 for 64-bit code --
   confirm against the stringop_algs declaration.  */
2140 static stringop_algs slm_memset[2] = {
/* Entry 0: loop, unrolled loop, rep with the 4-byte prefix, then libcall.  */
2141 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2142 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
/* Entry 1: loop, unrolled loop, rep with the 8-byte prefix, then libcall.  */
2143 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2144 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table named slm_cost (NOTE(review): presumably the Silvermont
   tuning -- confirm where this table is selected).
   The initializer is positional: entries must stay in the order in which
   struct processor_costs declares its fields, and the trailing comment on
   each line describes the field being filled.  Instruction costs are
   written with COSTS_N_INSNS; the move costs further down are relative to
   a reg-reg move, which counts as 2.  The string-operation strategies are
   taken from slm_memcpy and slm_memset.  */
2145 static const
2146 struct processor_costs slm_cost = {
2147 COSTS_N_INSNS (1), /* cost of an add instruction */
2148 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2149 COSTS_N_INSNS (1), /* variable shift costs */
2150 COSTS_N_INSNS (1), /* constant shift costs */
2151 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2152 COSTS_N_INSNS (3), /* HI */
2153 COSTS_N_INSNS (3), /* SI */
2154 COSTS_N_INSNS (4), /* DI */
2155 COSTS_N_INSNS (2)}, /* other */
2156 0, /* cost of multiply per each bit set */
2157 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2158 COSTS_N_INSNS (26), /* HI */
2159 COSTS_N_INSNS (42), /* SI */
2160 COSTS_N_INSNS (74), /* DI */
2161 COSTS_N_INSNS (74)}, /* other */
2162 COSTS_N_INSNS (1), /* cost of movsx */
2163 COSTS_N_INSNS (1), /* cost of movzx */
2164 8, /* "large" insn */
2165 17, /* MOVE_RATIO */
2167 /* All move costs are relative to integer->integer move times 2 and thus
2168 they are latency*2. */
2169 8, /* cost for loading QImode using movzbl */
2170 {8, 8, 8}, /* cost of loading integer registers
2171 in QImode, HImode and SImode.
2172 Relative to reg-reg move (2). */
2173 {6, 6, 6}, /* cost of storing integer registers */
2174 2, /* cost of reg,reg fld/fst */
2175 {8, 8, 18}, /* cost of loading fp registers
2176 in SFmode, DFmode and XFmode */
2177 {6, 6, 18}, /* cost of storing fp registers
2178 in SFmode, DFmode and XFmode */
2179 2, /* cost of moving MMX register */
2180 {8, 8}, /* cost of loading MMX registers
2181 in SImode and DImode */
2182 {6, 6}, /* cost of storing MMX registers
2183 in SImode and DImode */
2184 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2185 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
2186 in 32,64,128,256 and 512-bit */
2187 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
2188 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
2189 in 32,64,128,256 and 512-bit */
2190 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
2191 8, 6, /* SSE->integer and integer->SSE moves */
2192 8, 8, /* Gather load static, per_elt. */
2193 8, 8, /* Gather store static, per_elt. */
2194 32, /* size of l1 cache. */
2195 256, /* size of l2 cache. */
2196 64, /* size of prefetch block */
2197 6, /* number of parallel prefetches */
2198 3, /* Branch cost */
2199 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2200 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2201 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2202 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2203 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2204 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2206 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2207 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2208 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2209 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2210 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2211 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2212 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
2213 COSTS_N_INSNS (69), /* cost of DIVSD instruction. */
2214 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
2215 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
2216 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2217 slm_memcpy,
2218 slm_memset,
2219 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2220 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2221 "16", /* Loop alignment. */
2222 "16:8:8", /* Jump alignment. */
2223 "0:0:8", /* Label alignment. */
2224 "16", /* Func alignment. */
/* memcpy expansion strategies referenced by the intel_cost initializer.
   Two stringop_algs entries, each naming libcall first and then listing
   {size, algorithm, flag} triples; the last triple of every list has -1
   as its size.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for
   64-bit code -- confirm against the declaration of stringop_algs.  */
2227 static stringop_algs intel_memcpy[2] = {
2228 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2229 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2230 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset expansion strategies referenced by the intel_cost initializer.
   Two stringop_algs entries, each naming libcall first and then listing
   {size, algorithm, flag} triples ordered by increasing size, with a
   final triple whose size is -1.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for
   64-bit code -- confirm against the declaration of stringop_algs.  */
2231 static stringop_algs intel_memset[2] = {
2232 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2233 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2234 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2235 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table named intel_cost (NOTE(review): presumably the tuning used
   for a generic Intel target -- confirm where this table is selected).
   The initializer is positional: entries must stay in the order in which
   struct processor_costs declares its fields, and the trailing comment on
   each line describes the field being filled.  Instruction costs are
   written with COSTS_N_INSNS; the move costs further down are relative to
   a reg-reg move, which counts as 2.  The string-operation strategies are
   taken from intel_memcpy and intel_memset.  */
2236 static const
2237 struct processor_costs intel_cost = {
2238 COSTS_N_INSNS (1), /* cost of an add instruction */
2239 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2240 COSTS_N_INSNS (1), /* variable shift costs */
2241 COSTS_N_INSNS (1), /* constant shift costs */
2242 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2243 COSTS_N_INSNS (3), /* HI */
2244 COSTS_N_INSNS (3), /* SI */
2245 COSTS_N_INSNS (4), /* DI */
2246 COSTS_N_INSNS (2)}, /* other */
2247 0, /* cost of multiply per each bit set */
2248 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2249 COSTS_N_INSNS (26), /* HI */
2250 COSTS_N_INSNS (42), /* SI */
2251 COSTS_N_INSNS (74), /* DI */
2252 COSTS_N_INSNS (74)}, /* other */
2253 COSTS_N_INSNS (1), /* cost of movsx */
2254 COSTS_N_INSNS (1), /* cost of movzx */
2255 8, /* "large" insn */
2256 17, /* MOVE_RATIO */
2258 /* All move costs are relative to integer->integer move times 2 and thus
2259 they are latency*2. */
2260 6, /* cost for loading QImode using movzbl */
2261 {4, 4, 4}, /* cost of loading integer registers
2262 in QImode, HImode and SImode.
2263 Relative to reg-reg move (2). */
2264 {6, 6, 6}, /* cost of storing integer registers */
2265 2, /* cost of reg,reg fld/fst */
2266 {6, 6, 8}, /* cost of loading fp registers
2267 in SFmode, DFmode and XFmode */
2268 {6, 6, 10}, /* cost of storing fp registers
2269 in SFmode, DFmode and XFmode */
2270 2, /* cost of moving MMX register */
2271 {6, 6}, /* cost of loading MMX registers
2272 in SImode and DImode */
2273 {6, 6}, /* cost of storing MMX registers
2274 in SImode and DImode */
2275 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
2276 {6, 6, 6, 6, 6}, /* cost of loading SSE registers
2277 in 32,64,128,256 and 512-bit */
2278 {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
2279 {6, 6, 6, 6, 6}, /* cost of storing SSE registers
2280 in 32,64,128,256 and 512-bit */
2281 {10, 10, 10, 10, 10}, /* cost of unaligned stores. */
2282 4, 4, /* SSE->integer and integer->SSE moves */
2283 6, 6, /* Gather load static, per_elt. */
2284 6, 6, /* Gather store static, per_elt. */
2285 32, /* size of l1 cache. */
2286 256, /* size of l2 cache. */
2287 64, /* size of prefetch block */
2288 6, /* number of parallel prefetches */
2289 3, /* Branch cost */
2290 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2291 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2292 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2293 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2294 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2295 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2297 COSTS_N_INSNS (8), /* cost of cheap SSE instruction. */
2298 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
2299 COSTS_N_INSNS (8), /* cost of MULSS instruction. */
2300 COSTS_N_INSNS (8), /* cost of MULSD instruction. */
2301 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2302 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2303 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */
2304 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
2305 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */
2306 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */
2307 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2308 intel_memcpy,
2309 intel_memset,
2310 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2311 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2312 "16", /* Loop alignment. */
2313 "16:8:8", /* Jump alignment. */
2314 "0:0:8", /* Label alignment. */
2315 "16", /* Func alignment. */
2318 /* Generic should produce code tuned for Core-i7 (and newer chips)
2319 and btver1 (and newer chips). */
/* memcpy expansion strategies referenced by the generic_cost initializer.
   Two stringop_algs entries, each naming libcall first and then listing
   {size, algorithm, flag} triples; the last triple of every list has -1
   as its size.  The two entries differ only in the rep prefix used up to
   8192: rep_prefix_4_byte in the first, rep_prefix_8_byte in the second.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for
   64-bit code -- confirm against the declaration of stringop_algs.  */
2321 static stringop_algs generic_memcpy[2] = {
2322 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2323 {-1, libcall, false}}},
2324 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2325 {-1, libcall, false}}}};
/* memset expansion strategies referenced by the generic_cost initializer.
   Two stringop_algs entries, each naming libcall first and then listing
   {size, algorithm, flag} triples; the last triple of every list has -1
   as its size.  The two entries differ only in the rep prefix used up to
   8192: rep_prefix_4_byte in the first, rep_prefix_8_byte in the second.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for
   64-bit code -- confirm against the declaration of stringop_algs.  */
2326 static stringop_algs generic_memset[2] = {
2327 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2328 {-1, libcall, false}}},
2329 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2330 {-1, libcall, false}}}};
/* Cost table for generic tuning.
   The initializer is positional: entries must stay in the order in which
   struct processor_costs declares its fields, and the trailing comment on
   each line describes the field being filled.  Instruction costs are
   written with COSTS_N_INSNS; the move costs further down are relative to
   a reg-reg move, which counts as 2.  The string-operation strategies are
   taken from generic_memcpy and generic_memset.  */
2331 static const
2332 struct processor_costs generic_cost = {
2333 COSTS_N_INSNS (1), /* cost of an add instruction */
2334 /* Setting cost to 2 makes our current implementation of synth_mult result in
2335 use of unnecessary temporary registers causing regression on several
2336 SPECfp benchmarks. */
2337 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2338 COSTS_N_INSNS (1), /* variable shift costs */
2339 COSTS_N_INSNS (1), /* constant shift costs */
2340 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2341 COSTS_N_INSNS (4), /* HI */
2342 COSTS_N_INSNS (3), /* SI */
2343 COSTS_N_INSNS (4), /* DI */
2344 COSTS_N_INSNS (4)}, /* other */
2345 0, /* cost of multiply per each bit set */
2346 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI */
2347 COSTS_N_INSNS (22), /* HI */
2348 COSTS_N_INSNS (30), /* SI */
2349 COSTS_N_INSNS (74), /* DI */
2350 COSTS_N_INSNS (74)}, /* other */
2351 COSTS_N_INSNS (1), /* cost of movsx */
2352 COSTS_N_INSNS (1), /* cost of movzx */
2353 8, /* "large" insn */
2354 17, /* MOVE_RATIO */
2356 /* All move costs are relative to integer->integer move times 2 and thus
2357 they are latency*2. */
2358 6, /* cost for loading QImode using movzbl */
2359 {6, 6, 6}, /* cost of loading integer registers
2360 in QImode, HImode and SImode.
2361 Relative to reg-reg move (2). */
2362 {6, 6, 6}, /* cost of storing integer registers */
2363 4, /* cost of reg,reg fld/fst */
2364 {6, 6, 12}, /* cost of loading fp registers
2365 in SFmode, DFmode and XFmode */
2366 {6, 6, 12}, /* cost of storing fp registers
2367 in SFmode, DFmode and XFmode */
2368 2, /* cost of moving MMX register */
2369 {6, 6}, /* cost of loading MMX registers
2370 in SImode and DImode */
2371 {6, 6}, /* cost of storing MMX registers
2372 in SImode and DImode */
2373 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
2374 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
2375 in 32,64,128,256 and 512-bit */
2376 {6, 6, 6, 10, 15}, /* cost of unaligned loads. */
2377 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
2378 in 32,64,128,256 and 512-bit */
2379 {6, 6, 6, 10, 15}, /* cost of unaligned stores. */
2380 6, 6, /* SSE->integer and integer->SSE moves */
2381 18, 6, /* Gather load static, per_elt. */
2382 18, 6, /* Gather store static, per_elt. */
2383 32, /* size of l1 cache. */
2384 512, /* size of l2 cache. */
2385 64, /* size of prefetch block */
2386 6, /* number of parallel prefetches */
2387 /* Benchmarks show large regressions on K8 sixtrack benchmark when this
2388 value is increased to perhaps more appropriate value of 5. */
2389 3, /* Branch cost */
2390 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2391 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
2392 COSTS_N_INSNS (17), /* cost of FDIV instruction. */
2393 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2394 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2395 COSTS_N_INSNS (14), /* cost of FSQRT instruction. */
2397 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2398 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2399 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2400 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2401 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2402 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2403 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
2404 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
2405 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
2406 COSTS_N_INSNS (18), /* cost of SQRTSD instruction. */
2407 1, 4, 3, 3, /* reassoc int, fp, vec_int, vec_fp. */
2408 generic_memcpy,
2409 generic_memset,
2410 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
2411 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
2412 "16:11:8", /* Loop alignment. */
2413 "16:11:8", /* Jump alignment. */
2414 "0:0:8", /* Label alignment. */
2415 "16", /* Func alignment. */
2418 /* core_cost should produce code tuned for the Core family of CPUs. */
/* memcpy expansion strategies referenced by the core_cost initializer.
   Two stringop_algs entries, each naming libcall first and then listing
   {size, algorithm, flag} triples; the last triple of every list has -1
   as its size.  Unlike the final libcall triples, the sized triples here
   pass true as their third member.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for
   64-bit code, and the third member controls whether the destination is
   aligned first -- confirm against the declaration of stringop_algs.  */
2419 static stringop_algs core_memcpy[2] = {
2420 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2421 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2422 {-1, libcall, false}}}};
/* memset expansion strategies referenced by the core_cost initializer.
   Two stringop_algs entries, each naming libcall first and then listing
   {size, algorithm, flag} triples ordered by increasing size; the last
   triple of every list has -1 as its size.  The sized triples pass true
   as their third member, the final libcall triples pass false.
   NOTE(review): presumably entry [0] is for 32-bit and entry [1] for
   64-bit code, and the third member controls whether the destination is
   aligned first -- confirm against the declaration of stringop_algs.  */
2423 static stringop_algs core_memset[2] = {
2424 {libcall, {{6, loop_1_byte, true},
2425 {24, loop, true},
2426 {8192, rep_prefix_4_byte, true},
2427 {-1, libcall, false}}},
2428 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2429 {-1, libcall, false}}}};
/* Cost table for tuning for Core CPUs; the inner comments note that some
   entries are chosen for Sandybridge or newer.
   The initializer is positional: entries must stay in the order in which
   struct processor_costs declares its fields, and the trailing comment on
   each line describes the field being filled.  Instruction costs are
   written with COSTS_N_INSNS; the move costs further down are relative to
   a reg-reg move, which counts as 2.  The string-operation strategies are
   taken from core_memcpy and core_memset.  */
2431 static const
2432 struct processor_costs core_cost = {
2433 COSTS_N_INSNS (1), /* cost of an add instruction */
2434 /* On all chips taken into consideration lea is 2 cycles and more. With
2435 this cost however our current implementation of synth_mult results in
2436 use of unnecessary temporary registers causing regression on several
2437 SPECfp benchmarks. */
2438 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2439 COSTS_N_INSNS (1), /* variable shift costs */
2440 COSTS_N_INSNS (1), /* constant shift costs */
2441 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2442 COSTS_N_INSNS (4), /* HI */
2443 COSTS_N_INSNS (3), /* SI */
2444 /* Here we tune for Sandybridge or newer. */
2445 COSTS_N_INSNS (3), /* DI */
2446 COSTS_N_INSNS (3)}, /* other */
2447 0, /* cost of multiply per each bit set */
2448 /* Expanding div/mod currently doesn't consider parallelism. So the cost
2449 model is not realistic. We compensate by increasing the latencies a bit. */
2450 {COSTS_N_INSNS (11), /* cost of a divide/mod for QI */
2451 COSTS_N_INSNS (11), /* HI */
2452 COSTS_N_INSNS (14), /* SI */
2453 COSTS_N_INSNS (81), /* DI */
2454 COSTS_N_INSNS (81)}, /* other */
2455 COSTS_N_INSNS (1), /* cost of movsx */
2456 COSTS_N_INSNS (1), /* cost of movzx */
2457 8, /* "large" insn */
2458 17, /* MOVE_RATIO */
2460 /* All move costs are relative to integer->integer move times 2 and thus
2461 they are latency*2. */
2462 6, /* cost for loading QImode using movzbl */
2463 {4, 4, 4}, /* cost of loading integer registers
2464 in QImode, HImode and SImode.
2465 Relative to reg-reg move (2). */
2466 {6, 6, 6}, /* cost of storing integer registers */
2467 2, /* cost of reg,reg fld/fst */
2468 {6, 6, 8}, /* cost of loading fp registers
2469 in SFmode, DFmode and XFmode */
2470 {6, 6, 10}, /* cost of storing fp registers
2471 in SFmode, DFmode and XFmode */
2472 2, /* cost of moving MMX register */
2473 {6, 6}, /* cost of loading MMX registers
2474 in SImode and DImode */
2475 {6, 6}, /* cost of storing MMX registers
2476 in SImode and DImode */
2477 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2478 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
2479 in 32,64,128,256 and 512-bit */
2480 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
2481 {6, 6, 6, 6, 12}, /* cost of storing SSE registers
2482 in 32,64,128,256 and 512-bit */
2483 {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
2484 2, 2, /* SSE->integer and integer->SSE moves */
2485 /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
2486 rec. throughput 6.
2487 So 5 uops statically and one uops per load.
   NOTE(review): VGATHERDPD is named twice above with different figures;
   the two mentions presumably refer to different forms of the gather
   instruction -- confirm which ones were meant. */
2488 10, 6, /* Gather load static, per_elt. */
2489 10, 6, /* Gather store static, per_elt. */
2490 64, /* size of l1 cache. */
2491 512, /* size of l2 cache. */
2492 64, /* size of prefetch block */
2493 6, /* number of parallel prefetches */
2494 /* FIXME perhaps more appropriate value is 5. */
2495 3, /* Branch cost */
2496 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2497 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
2498 /* 10-24 */
2499 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
2500 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2501 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2502 COSTS_N_INSNS (23), /* cost of FSQRT instruction. */
2504 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2505 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2506 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2507 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2508 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2509 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2510 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
2511 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */
2512 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */
2513 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */
2514 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2515 core_memcpy,
2516 core_memset,
2517 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2518 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2519 "16:11:8", /* Loop alignment. */
2520 "16:11:8", /* Jump alignment. */
2521 "0:0:8", /* Label alignment. */
2522 "16", /* Func alignment. */