* i386.c (ix86_builtin_vectorization_cost): Compute scatter/gather
[official-gcc.git] / gcc / config / i386 / x86-tune-costs.h
blobc7ac70e8453e0336370652f2683a37418e95d52e
1 /* Costs of operations of individual x86 CPUs.
2 Copyright (C) 1988-2017 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 Under Section 7 of GPL version 3, you are granted additional
17 permissions described in the GCC Runtime Library Exception, version
18 3.1, as published by the Free Software Foundation.
20 You should have received a copy of the GNU General Public License and
21 a copy of the GCC Runtime Library Exception along with this program;
22 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 <http://www.gnu.org/licenses/>. */
24 /* Processor costs (relative to an add) */
25 /* Cost unit used by the size-tuning table: N bytes, scaled by 2.
   We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes,
   so under that assumption COSTS_N_BYTES (2) has the same value as
   COSTS_N_INSNS (1). */
26 #define COSTS_N_BYTES(N) ((N) * 2)
/* Placeholder stringop_algs initializer whose only strategy is libcall.
   The per-CPU memcpy/memset tables below use it as their second element. */
28 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
/* memcpy strategy table referenced by ix86_size_cost.  Both elements
   select rep_prefix_1_byte and carry a single {-1, rep_prefix_1_byte, false}
   size entry.
   NOTE(review): -1 presumably means "no upper size limit", and the two
   elements presumably serve 32-bit and 64-bit targets -- confirm against
   the stringop_algs declaration. */
30 static stringop_algs ix86_size_memcpy[2] = {
31 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
32 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
/* memset strategy table referenced by ix86_size_cost.  Both elements
   select rep_prefix_1_byte and carry a single {-1, rep_prefix_1_byte, false}
   size entry.
   NOTE(review): -1 presumably means "no upper size limit", and the two
   elements presumably serve 32-bit and 64-bit targets -- confirm against
   the stringop_algs declaration. */
33 static stringop_algs ix86_size_memset[2] = {
34 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
35 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
/* Cost table used when tuning for size.  The entries are positional
   initializers of struct processor_costs (declared elsewhere), so their
   order must not be changed; the trailing comment on each entry names the
   slot it fills.  Instruction costs here are expressed with COSTS_N_BYTES
   rather than COSTS_N_INSNS.
   NOTE(review): this copy has a leading number on every line and no
   closing "};" -- this looks like extraction residue; compare with the
   upstream file before compiling. */
37 const
38 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
39 COSTS_N_BYTES (2), /* cost of an add instruction */
40 COSTS_N_BYTES (3), /* cost of a lea instruction */
41 COSTS_N_BYTES (2), /* variable shift costs */
42 COSTS_N_BYTES (3), /* constant shift costs */
43 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
44 COSTS_N_BYTES (3), /* HI */
45 COSTS_N_BYTES (3), /* SI */
46 COSTS_N_BYTES (3), /* DI */
47 COSTS_N_BYTES (5)}, /* other */
48 0, /* cost of multiply per each bit set */
49 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
50 COSTS_N_BYTES (3), /* HI */
51 COSTS_N_BYTES (3), /* SI */
52 COSTS_N_BYTES (3), /* DI */
53 COSTS_N_BYTES (5)}, /* other */
54 COSTS_N_BYTES (3), /* cost of movsx */
55 COSTS_N_BYTES (3), /* cost of movzx */
56 0, /* "large" insn */
57 2, /* MOVE_RATIO */
59 /* All move costs are relative to integer->integer move times 2. */
60 2, /* cost for loading QImode using movzbl */
61 {2, 2, 2}, /* cost of loading integer registers
              in QImode, HImode and SImode.
              Relative to reg-reg move (2). */
64 {2, 2, 2}, /* cost of storing integer registers */
65 2, /* cost of reg,reg fld/fst */
66 {2, 2, 2}, /* cost of loading fp registers
              in SFmode, DFmode and XFmode */
68 {2, 2, 2}, /* cost of storing fp registers
              in SFmode, DFmode and XFmode */
70 3, /* cost of moving MMX register */
71 {3, 3}, /* cost of loading MMX registers
           in SImode and DImode */
73 {3, 3}, /* cost of storing MMX registers
           in SImode and DImode */
75 3, 3, 3, /* cost of moving XMM,YMM,ZMM register */
76 {3, 3, 3, 3, 3}, /* cost of loading SSE registers
                    in 32,64,128,256 and 512-bit */
78 {3, 3, 3, 3, 3}, /* cost of unaligned SSE load.  Five entries,
                    presumably for the same 32,64,128,256 and 512-bit
                    sizes as the aligned table -- TODO confirm. */
80 {3, 3, 3, 3, 3}, /* cost of storing SSE registers
                    in 32,64,128,256 and 512-bit */
82 {3, 3, 3, 3, 3}, /* cost of unaligned SSE store.  Five entries,
                    presumably for the same 32,64,128,256 and 512-bit
                    sizes as the aligned table -- TODO confirm. */
84 3, 3, /* SSE->integer and integer->SSE moves */
85 5, 0, /* Gather load static, per_elt. */
86 5, 0, /* Gather store (presumably scatter) static, per_elt. */
87 0, /* size of l1 cache */
88 0, /* size of l2 cache */
89 0, /* size of prefetch block */
90 0, /* number of parallel prefetches */
91 2, /* Branch cost */
92 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
93 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
94 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
95 COSTS_N_BYTES (2), /* cost of FABS instruction. */
96 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
97 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
99 COSTS_N_BYTES (2), /* cost of cheap SSE instruction. */
100 COSTS_N_BYTES (2), /* cost of ADDSS/SD SUBSS/SD insns. */
101 COSTS_N_BYTES (2), /* cost of MULSS instruction. */
102 COSTS_N_BYTES (2), /* cost of MULSD instruction. */
103 COSTS_N_BYTES (2), /* cost of FMA SS instruction. */
104 COSTS_N_BYTES (2), /* cost of FMA SD instruction. */
105 COSTS_N_BYTES (2), /* cost of DIVSS instruction. */
106 COSTS_N_BYTES (2), /* cost of DIVSD instruction. */
107 COSTS_N_BYTES (2), /* cost of SQRTSS instruction. */
108 COSTS_N_BYTES (2), /* cost of SQRTSD instruction. */
109 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
110 ix86_size_memcpy,
111 ix86_size_memset,
112 COSTS_N_BYTES (1), /* cond_taken_branch_cost. */
113 COSTS_N_BYTES (1), /* cond_not_taken_branch_cost. */
116 /* Processor costs (relative to an add) */
/* memcpy strategy table referenced by i386_cost.  The first element
   selects rep_prefix_1_byte with a single {-1, ...} size entry; the second
   is the libcall-only placeholder DUMMY_STRINGOP_ALGS.
   NOTE(review): the two elements presumably serve 32-bit and 64-bit
   targets -- confirm against the stringop_algs declaration. */
117 static stringop_algs i386_memcpy[2] = {
118 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
119 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by i386_cost.  The first element
   selects rep_prefix_1_byte with a single {-1, ...} size entry; the second
   is the libcall-only placeholder DUMMY_STRINGOP_ALGS.
   NOTE(review): the two elements presumably serve 32-bit and 64-bit
   targets -- confirm against the stringop_algs declaration. */
120 static stringop_algs i386_memset[2] = {
121 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
122 DUMMY_STRINGOP_ALGS};
/* Cost table for the 386.  The entries are positional initializers of
   struct processor_costs (declared elsewhere), so their order must not be
   changed; the trailing comment on each entry names the slot it fills.
   NOTE(review): this copy has a leading number on every line and no
   closing "};" -- this looks like extraction residue; compare with the
   upstream file before compiling. */
124 static const
125 struct processor_costs i386_cost = { /* 386 specific costs */
126 COSTS_N_INSNS (1), /* cost of an add instruction */
127 COSTS_N_INSNS (1), /* cost of a lea instruction */
128 COSTS_N_INSNS (3), /* variable shift costs */
129 COSTS_N_INSNS (2), /* constant shift costs */
130 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
131 COSTS_N_INSNS (6), /* HI */
132 COSTS_N_INSNS (6), /* SI */
133 COSTS_N_INSNS (6), /* DI */
134 COSTS_N_INSNS (6)}, /* other */
135 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
136 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
137 COSTS_N_INSNS (23), /* HI */
138 COSTS_N_INSNS (23), /* SI */
139 COSTS_N_INSNS (23), /* DI */
140 COSTS_N_INSNS (23)}, /* other */
141 COSTS_N_INSNS (3), /* cost of movsx */
142 COSTS_N_INSNS (2), /* cost of movzx */
143 15, /* "large" insn */
144 3, /* MOVE_RATIO */
146 /* All move costs are relative to integer->integer move times 2 and thus
    they are latency*2. */
148 4, /* cost for loading QImode using movzbl */
149 {2, 4, 2}, /* cost of loading integer registers
               in QImode, HImode and SImode.
               Relative to reg-reg move (2). */
152 {2, 4, 2}, /* cost of storing integer registers */
153 2, /* cost of reg,reg fld/fst */
154 {8, 8, 8}, /* cost of loading fp registers
               in SFmode, DFmode and XFmode */
156 {8, 8, 8}, /* cost of storing fp registers
               in SFmode, DFmode and XFmode */
158 2, /* cost of moving MMX register */
159 {4, 8}, /* cost of loading MMX registers
            in SImode and DImode */
161 {4, 8}, /* cost of storing MMX registers
            in SImode and DImode */
163 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
164 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
                        in 32,64,128,256 and 512-bit */
166 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
167 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
                        in 32,64,128,256 and 512-bit */
169 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
170 3, 3, /* SSE->integer and integer->SSE moves */
171 4, 4, /* Gather load static, per_elt. */
172 4, 4, /* Gather store (presumably scatter) static, per_elt. */
173 0, /* size of l1 cache */
174 0, /* size of l2 cache */
175 0, /* size of prefetch block */
176 0, /* number of parallel prefetches */
177 1, /* Branch cost */
178 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
179 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
180 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
181 COSTS_N_INSNS (22), /* cost of FABS instruction. */
182 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
183 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
185 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
186 COSTS_N_INSNS (23), /* cost of ADDSS/SD SUBSS/SD insns. */
187 COSTS_N_INSNS (27), /* cost of MULSS instruction. */
188 COSTS_N_INSNS (27), /* cost of MULSD instruction. */
189 COSTS_N_INSNS (27), /* cost of FMA SS instruction. */
190 COSTS_N_INSNS (27), /* cost of FMA SD instruction. */
191 COSTS_N_INSNS (88), /* cost of DIVSS instruction. */
192 COSTS_N_INSNS (88), /* cost of DIVSD instruction. */
193 COSTS_N_INSNS (122), /* cost of SQRTSS instruction. */
194 COSTS_N_INSNS (122), /* cost of SQRTSD instruction. */
195 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
196 i386_memcpy,
197 i386_memset,
198 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
199 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* memcpy strategy table referenced by i486_cost.  The first element
   selects rep_prefix_4_byte with a single {-1, ...} size entry; the second
   is the libcall-only placeholder DUMMY_STRINGOP_ALGS.
   NOTE(review): the two elements presumably serve 32-bit and 64-bit
   targets -- confirm against the stringop_algs declaration. */
202 static stringop_algs i486_memcpy[2] = {
203 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
204 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by i486_cost.  The first element
   selects rep_prefix_4_byte with a single {-1, ...} size entry; the second
   is the libcall-only placeholder DUMMY_STRINGOP_ALGS.
   NOTE(review): the two elements presumably serve 32-bit and 64-bit
   targets -- confirm against the stringop_algs declaration. */
205 static stringop_algs i486_memset[2] = {
206 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
207 DUMMY_STRINGOP_ALGS};
/* Cost table for the 486.  The entries are positional initializers of
   struct processor_costs (declared elsewhere), so their order must not be
   changed; the trailing comment on each entry names the slot it fills.
   NOTE(review): this copy has a leading number on every line and no
   closing "};" -- this looks like extraction residue; compare with the
   upstream file before compiling. */
209 static const
210 struct processor_costs i486_cost = { /* 486 specific costs */
211 COSTS_N_INSNS (1), /* cost of an add instruction */
212 COSTS_N_INSNS (1), /* cost of a lea instruction */
213 COSTS_N_INSNS (3), /* variable shift costs */
214 COSTS_N_INSNS (2), /* constant shift costs */
215 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
216 COSTS_N_INSNS (12), /* HI */
217 COSTS_N_INSNS (12), /* SI */
218 COSTS_N_INSNS (12), /* DI */
219 COSTS_N_INSNS (12)}, /* other */
220 1, /* cost of multiply per each bit set.
       NOTE(review): a bare 1 here, while i386_cost uses
       COSTS_N_INSNS (1) in the same slot -- confirm this is intended. */
221 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
222 COSTS_N_INSNS (40), /* HI */
223 COSTS_N_INSNS (40), /* SI */
224 COSTS_N_INSNS (40), /* DI */
225 COSTS_N_INSNS (40)}, /* other */
226 COSTS_N_INSNS (3), /* cost of movsx */
227 COSTS_N_INSNS (2), /* cost of movzx */
228 15, /* "large" insn */
229 3, /* MOVE_RATIO */
231 /* All move costs are relative to integer->integer move times 2 and thus
    they are latency*2. */
233 4, /* cost for loading QImode using movzbl */
234 {2, 4, 2}, /* cost of loading integer registers
               in QImode, HImode and SImode.
               Relative to reg-reg move (2). */
237 {2, 4, 2}, /* cost of storing integer registers */
238 2, /* cost of reg,reg fld/fst */
239 {8, 8, 8}, /* cost of loading fp registers
               in SFmode, DFmode and XFmode */
241 {8, 8, 8}, /* cost of storing fp registers
               in SFmode, DFmode and XFmode */
243 2, /* cost of moving MMX register */
244 {4, 8}, /* cost of loading MMX registers
            in SImode and DImode */
246 {4, 8}, /* cost of storing MMX registers
            in SImode and DImode */
248 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
249 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
                        in 32,64,128,256 and 512-bit */
251 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
252 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
                        in 32,64,128,256 and 512-bit */
254 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
255 3, 3, /* SSE->integer and integer->SSE moves */
256 4, 4, /* Gather load static, per_elt. */
257 4, 4, /* Gather store (presumably scatter) static, per_elt. */
258 4, /* size of l1 cache. 486 has 8kB cache
       shared for code and data, so 4kB is
       not really precise. */
261 4, /* size of l2 cache */
262 0, /* size of prefetch block */
263 0, /* number of parallel prefetches */
264 1, /* Branch cost */
265 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
266 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
267 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
268 COSTS_N_INSNS (3), /* cost of FABS instruction. */
269 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
270 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
272 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
273 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
274 COSTS_N_INSNS (16), /* cost of MULSS instruction. */
275 COSTS_N_INSNS (16), /* cost of MULSD instruction. */
276 COSTS_N_INSNS (16), /* cost of FMA SS instruction. */
277 COSTS_N_INSNS (16), /* cost of FMA SD instruction. */
278 COSTS_N_INSNS (73), /* cost of DIVSS instruction. */
279 COSTS_N_INSNS (74), /* cost of DIVSD instruction.
                       NOTE(review): 74 here versus 73 for FDIV and
                       DIVSS in this table -- confirm this is intended. */
280 COSTS_N_INSNS (83), /* cost of SQRTSS instruction. */
281 COSTS_N_INSNS (83), /* cost of SQRTSD instruction. */
282 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
283 i486_memcpy,
284 i486_memset,
285 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
286 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* memcpy strategy table referenced by pentium_cost and lakemont_cost.
   The first element lists {256, rep_prefix_4_byte} followed by
   {-1, libcall}; the second is the libcall-only placeholder
   DUMMY_STRINGOP_ALGS.
   NOTE(review): presumably "rep-prefixed 4-byte moves up to 256 bytes,
   library call above that" -- confirm against the stringop_algs
   declaration. */
289 static stringop_algs pentium_memcpy[2] = {
290 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
291 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by pentium_cost and lakemont_cost.
   The first element has a single {-1, rep_prefix_4_byte} size entry; the
   second is the libcall-only placeholder DUMMY_STRINGOP_ALGS.
   NOTE(review): unlike pentium_memcpy there is no 256-byte threshold
   entry here -- confirm this asymmetry is intended. */
292 static stringop_algs pentium_memset[2] = {
293 {libcall, {{-1, rep_prefix_4_byte, false}}},
294 DUMMY_STRINGOP_ALGS};
/* Cost table for the Pentium.  The entries are positional initializers of
   struct processor_costs (declared elsewhere), so their order must not be
   changed; the trailing comment on each entry names the slot it fills.
   NOTE(review): this copy has a leading number on every line and no
   closing "};" -- this looks like extraction residue; compare with the
   upstream file before compiling. */
296 static const
297 struct processor_costs pentium_cost = {
298 COSTS_N_INSNS (1), /* cost of an add instruction */
299 COSTS_N_INSNS (1), /* cost of a lea instruction */
300 COSTS_N_INSNS (4), /* variable shift costs */
301 COSTS_N_INSNS (1), /* constant shift costs */
302 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
303 COSTS_N_INSNS (11), /* HI */
304 COSTS_N_INSNS (11), /* SI */
305 COSTS_N_INSNS (11), /* DI */
306 COSTS_N_INSNS (11)}, /* other */
307 0, /* cost of multiply per each bit set */
308 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
309 COSTS_N_INSNS (25), /* HI */
310 COSTS_N_INSNS (25), /* SI */
311 COSTS_N_INSNS (25), /* DI */
312 COSTS_N_INSNS (25)}, /* other */
313 COSTS_N_INSNS (3), /* cost of movsx */
314 COSTS_N_INSNS (2), /* cost of movzx */
315 8, /* "large" insn */
316 6, /* MOVE_RATIO */
318 /* All move costs are relative to integer->integer move times 2 and thus
    they are latency*2. */
320 6, /* cost for loading QImode using movzbl */
321 {2, 4, 2}, /* cost of loading integer registers
               in QImode, HImode and SImode.
               Relative to reg-reg move (2). */
324 {2, 4, 2}, /* cost of storing integer registers */
325 2, /* cost of reg,reg fld/fst */
326 {2, 2, 6}, /* cost of loading fp registers
               in SFmode, DFmode and XFmode */
328 {4, 4, 6}, /* cost of storing fp registers
               in SFmode, DFmode and XFmode */
330 8, /* cost of moving MMX register */
331 {8, 8}, /* cost of loading MMX registers
            in SImode and DImode */
333 {8, 8}, /* cost of storing MMX registers
            in SImode and DImode */
335 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
336 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
                        in 32,64,128,256 and 512-bit */
338 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
339 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
                        in 32,64,128,256 and 512-bit */
341 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
342 3, 3, /* SSE->integer and integer->SSE moves */
343 4, 4, /* Gather load static, per_elt. */
344 4, 4, /* Gather store (presumably scatter) static, per_elt. */
345 8, /* size of l1 cache. */
346 8, /* size of l2 cache */
347 0, /* size of prefetch block */
348 0, /* number of parallel prefetches */
349 2, /* Branch cost */
350 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
351 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
352 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
353 COSTS_N_INSNS (1), /* cost of FABS instruction. */
354 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
355 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
357 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
358 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
359 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
360 COSTS_N_INSNS (3), /* cost of MULSD instruction. */
361 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
362 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
363 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
364 COSTS_N_INSNS (39), /* cost of DIVSD instruction. */
365 COSTS_N_INSNS (70), /* cost of SQRTSS instruction. */
366 COSTS_N_INSNS (70), /* cost of SQRTSD instruction. */
367 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
368 pentium_memcpy,
369 pentium_memset,
370 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
371 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* Cost table for Lakemont.  The entries are positional initializers of
   struct processor_costs (declared elsewhere), so their order must not be
   changed; the trailing comment on each entry names the slot it fills.
   This table reuses the pentium_memcpy and pentium_memset strategy tables.
   NOTE(review): this copy has a leading number on every line and no
   closing "};" -- this looks like extraction residue; compare with the
   upstream file before compiling. */
374 static const
375 struct processor_costs lakemont_cost = {
376 COSTS_N_INSNS (1), /* cost of an add instruction */
377 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction (one unit above the
                          add cost on the line before) */
378 COSTS_N_INSNS (1), /* variable shift costs */
379 COSTS_N_INSNS (1), /* constant shift costs */
380 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
381 COSTS_N_INSNS (11), /* HI */
382 COSTS_N_INSNS (11), /* SI */
383 COSTS_N_INSNS (11), /* DI */
384 COSTS_N_INSNS (11)}, /* other */
385 0, /* cost of multiply per each bit set */
386 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
387 COSTS_N_INSNS (25), /* HI */
388 COSTS_N_INSNS (25), /* SI */
389 COSTS_N_INSNS (25), /* DI */
390 COSTS_N_INSNS (25)}, /* other */
391 COSTS_N_INSNS (3), /* cost of movsx */
392 COSTS_N_INSNS (2), /* cost of movzx */
393 8, /* "large" insn */
394 17, /* MOVE_RATIO */
396 /* All move costs are relative to integer->integer move times 2 and thus
    they are latency*2. */
398 6, /* cost for loading QImode using movzbl */
399 {2, 4, 2}, /* cost of loading integer registers
               in QImode, HImode and SImode.
               Relative to reg-reg move (2). */
402 {2, 4, 2}, /* cost of storing integer registers */
403 2, /* cost of reg,reg fld/fst */
404 {2, 2, 6}, /* cost of loading fp registers
               in SFmode, DFmode and XFmode */
406 {4, 4, 6}, /* cost of storing fp registers
               in SFmode, DFmode and XFmode */
408 8, /* cost of moving MMX register */
409 {8, 8}, /* cost of loading MMX registers
            in SImode and DImode */
411 {8, 8}, /* cost of storing MMX registers
            in SImode and DImode */
413 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
414 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
                        in 32,64,128,256 and 512-bit */
416 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
417 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
                        in 32,64,128,256 and 512-bit */
419 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
420 3, 3, /* SSE->integer and integer->SSE moves */
421 4, 4, /* Gather load static, per_elt. */
422 4, 4, /* Gather store (presumably scatter) static, per_elt. */
423 8, /* size of l1 cache. */
424 8, /* size of l2 cache */
425 0, /* size of prefetch block */
426 0, /* number of parallel prefetches */
427 2, /* Branch cost */
428 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
429 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
430 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
431 COSTS_N_INSNS (1), /* cost of FABS instruction. */
432 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
433 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
435 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
436 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
437 COSTS_N_INSNS (5), /* cost of MULSS instruction. */
438 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
439 COSTS_N_INSNS (10), /* cost of FMA SS instruction. */
440 COSTS_N_INSNS (10), /* cost of FMA SD instruction. */
441 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
442 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
443 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
444 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
445 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
446 pentium_memcpy,
447 pentium_memset,
448 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
449 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
452 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
    (we ensure the alignment).  For small blocks an inline loop is still a
    noticeable win; for bigger blocks either rep movsl or rep movsb is the
    way to go.  Rep movsb apparently has a more expensive startup time in
    the CPU, but after 4K the difference is down in the noise.

    The first element below lists size entries 128 (loop), 1024
    (unrolled_loop), 8192 (rep_prefix_4_byte) and -1 (rep_prefix_1_byte);
    the second element is the libcall-only placeholder
    DUMMY_STRINGOP_ALGS. */
457 static stringop_algs pentiumpro_memcpy[2] = {
458 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
459 {8192, rep_prefix_4_byte, false},
460 {-1, rep_prefix_1_byte, false}}},
461 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by pentiumpro_cost.  The first element
   lists size entries 1024 (unrolled_loop), 8192 (rep_prefix_4_byte) and
   -1 (libcall); the second element is the libcall-only placeholder
   DUMMY_STRINGOP_ALGS.
   NOTE(review): each number is presumably the largest block size handled
   by that strategy, with -1 meaning "anything larger" -- confirm against
   the stringop_algs declaration. */
462 static stringop_algs pentiumpro_memset[2] = {
463 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
464 {8192, rep_prefix_4_byte, false},
465 {-1, libcall, false}}},
466 DUMMY_STRINGOP_ALGS};
/* Cost table for the PentiumPro.  The entries are positional initializers
   of struct processor_costs (declared elsewhere), so their order must not
   be changed; the trailing comment on each entry names the slot it fills.
   NOTE(review): this copy has a leading number on every line and no
   closing "};" -- this looks like extraction residue; compare with the
   upstream file before compiling. */
467 static const
468 struct processor_costs pentiumpro_cost = {
469 COSTS_N_INSNS (1), /* cost of an add instruction */
470 COSTS_N_INSNS (1), /* cost of a lea instruction */
471 COSTS_N_INSNS (1), /* variable shift costs */
472 COSTS_N_INSNS (1), /* constant shift costs */
473 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
474 COSTS_N_INSNS (4), /* HI */
475 COSTS_N_INSNS (4), /* SI */
476 COSTS_N_INSNS (4), /* DI */
477 COSTS_N_INSNS (4)}, /* other */
478 0, /* cost of multiply per each bit set */
479 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
480 COSTS_N_INSNS (17), /* HI */
481 COSTS_N_INSNS (17), /* SI */
482 COSTS_N_INSNS (17), /* DI */
483 COSTS_N_INSNS (17)}, /* other */
484 COSTS_N_INSNS (1), /* cost of movsx */
485 COSTS_N_INSNS (1), /* cost of movzx */
486 8, /* "large" insn */
487 6, /* MOVE_RATIO */
489 /* All move costs are relative to integer->integer move times 2 and thus
    they are latency*2. */
491 2, /* cost for loading QImode using movzbl */
492 {4, 4, 4}, /* cost of loading integer registers
               in QImode, HImode and SImode.
               Relative to reg-reg move (2). */
495 {2, 2, 2}, /* cost of storing integer registers */
496 2, /* cost of reg,reg fld/fst */
497 {2, 2, 6}, /* cost of loading fp registers
               in SFmode, DFmode and XFmode */
499 {4, 4, 6}, /* cost of storing fp registers
               in SFmode, DFmode and XFmode */
501 2, /* cost of moving MMX register */
502 {2, 2}, /* cost of loading MMX registers
            in SImode and DImode */
504 {2, 2}, /* cost of storing MMX registers
            in SImode and DImode */
506 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
507 {4, 8, 16, 32, 64}, /* cost of loading SSE registers
                        in 32,64,128,256 and 512-bit */
509 {4, 8, 16, 32, 64}, /* cost of unaligned loads. */
510 {4, 8, 16, 32, 64}, /* cost of storing SSE registers
                        in 32,64,128,256 and 512-bit */
512 {4, 8, 16, 32, 64}, /* cost of unaligned stores. */
513 3, 3, /* SSE->integer and integer->SSE moves */
514 4, 4, /* Gather load static, per_elt. */
515 4, 4, /* Gather store (presumably scatter) static, per_elt. */
516 8, /* size of l1 cache. */
517 256, /* size of l2 cache */
518 32, /* size of prefetch block */
519 6, /* number of parallel prefetches */
520 2, /* Branch cost */
521 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
522 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
523 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
524 COSTS_N_INSNS (2), /* cost of FABS instruction. */
525 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
526 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
528 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
529 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
530 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
531 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
532 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
533 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
534 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
535 COSTS_N_INSNS (18), /* cost of DIVSD instruction. */
536 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
537 COSTS_N_INSNS (31), /* cost of SQRTSD instruction. */
538 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
539 pentiumpro_memcpy,
540 pentiumpro_memset,
541 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
542 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* memcpy strategy table referenced by geode_cost.  The first element
   lists {256, rep_prefix_4_byte} followed by {-1, libcall}; the second is
   the libcall-only placeholder DUMMY_STRINGOP_ALGS.
   NOTE(review): presumably "rep-prefixed 4-byte moves up to 256 bytes,
   library call above that" -- confirm against the stringop_algs
   declaration. */
545 static stringop_algs geode_memcpy[2] = {
546 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
547 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by geode_cost.  The first element
   lists {256, rep_prefix_4_byte} followed by {-1, libcall}; the second is
   the libcall-only placeholder DUMMY_STRINGOP_ALGS.
   NOTE(review): presumably "rep-prefixed 4-byte stores up to 256 bytes,
   library call above that" -- confirm against the stringop_algs
   declaration. */
548 static stringop_algs geode_memset[2] = {
549 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
550 DUMMY_STRINGOP_ALGS};
/* Cost table for the Geode.  The entries are positional initializers of
   struct processor_costs (declared elsewhere), so their order must not be
   changed; the trailing comment on each entry names the slot it fills.
   NOTE(review): this copy has a leading number on every line and no
   closing "};" -- this looks like extraction residue; compare with the
   upstream file before compiling. */
551 static const
552 struct processor_costs geode_cost = {
553 COSTS_N_INSNS (1), /* cost of an add instruction */
554 COSTS_N_INSNS (1), /* cost of a lea instruction */
555 COSTS_N_INSNS (2), /* variable shift costs */
556 COSTS_N_INSNS (1), /* constant shift costs */
557 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
558 COSTS_N_INSNS (4), /* HI */
559 COSTS_N_INSNS (7), /* SI */
560 COSTS_N_INSNS (7), /* DI */
561 COSTS_N_INSNS (7)}, /* other */
562 0, /* cost of multiply per each bit set */
563 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
564 COSTS_N_INSNS (23), /* HI */
565 COSTS_N_INSNS (39), /* SI */
566 COSTS_N_INSNS (39), /* DI */
567 COSTS_N_INSNS (39)}, /* other */
568 COSTS_N_INSNS (1), /* cost of movsx */
569 COSTS_N_INSNS (1), /* cost of movzx */
570 8, /* "large" insn */
571 4, /* MOVE_RATIO */
573 /* All move costs are relative to integer->integer move times 2 and thus
    they are latency*2. */
575 2, /* cost for loading QImode using movzbl */
576 {2, 2, 2}, /* cost of loading integer registers
               in QImode, HImode and SImode.
               Relative to reg-reg move (2). */
579 {2, 2, 2}, /* cost of storing integer registers */
580 2, /* cost of reg,reg fld/fst */
581 {2, 2, 2}, /* cost of loading fp registers
               in SFmode, DFmode and XFmode */
583 {4, 6, 6}, /* cost of storing fp registers
               in SFmode, DFmode and XFmode */
586 2, /* cost of moving MMX register */
587 {2, 2}, /* cost of loading MMX registers
            in SImode and DImode */
589 {2, 2}, /* cost of storing MMX registers
            in SImode and DImode */
591 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
592 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
                       in 32,64,128,256 and 512-bit */
594 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
595 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
                       in 32,64,128,256 and 512-bit */
597 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
598 6, 6, /* SSE->integer and integer->SSE moves */
599 2, 2, /* Gather load static, per_elt. */
600 2, 2, /* Gather store (presumably scatter) static, per_elt. */
601 64, /* size of l1 cache. */
602 128, /* size of l2 cache. */
603 32, /* size of prefetch block */
604 1, /* number of parallel prefetches */
605 1, /* Branch cost */
606 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
607 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
608 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
609 COSTS_N_INSNS (1), /* cost of FABS instruction. */
610 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
611 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
613 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
614 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
615 COSTS_N_INSNS (11), /* cost of MULSS instruction. */
616 COSTS_N_INSNS (11), /* cost of MULSD instruction. */
617 COSTS_N_INSNS (17), /* cost of FMA SS instruction. */
618 COSTS_N_INSNS (17), /* cost of FMA SD instruction. */
619 COSTS_N_INSNS (47), /* cost of DIVSS instruction. */
620 COSTS_N_INSNS (47), /* cost of DIVSD instruction. */
621 COSTS_N_INSNS (54), /* cost of SQRTSS instruction. */
622 COSTS_N_INSNS (54), /* cost of SQRTSD instruction. */
623 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
624 geode_memcpy,
625 geode_memset,
626 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
627 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* memcpy strategy table referenced by k6_cost.  The first element lists
   {256, rep_prefix_4_byte} followed by {-1, libcall}; the second is the
   libcall-only placeholder DUMMY_STRINGOP_ALGS.
   NOTE(review): presumably "rep-prefixed 4-byte moves up to 256 bytes,
   library call above that" -- confirm against the stringop_algs
   declaration. */
630 static stringop_algs k6_memcpy[2] = {
631 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
632 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by k6_cost.  The first element lists
   {256, rep_prefix_4_byte} followed by {-1, libcall}; the second is the
   libcall-only placeholder DUMMY_STRINGOP_ALGS.
   NOTE(review): presumably "rep-prefixed 4-byte stores up to 256 bytes,
   library call above that" -- confirm against the stringop_algs
   declaration. */
633 static stringop_algs k6_memset[2] = {
634 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
635 DUMMY_STRINGOP_ALGS};
/* Cost table for the K6.  The entries are positional initializers of
   struct processor_costs (declared elsewhere), so their order must not be
   changed; the trailing comment on each entry names the slot it fills.
   NOTE(review): this copy has a leading number on every line and no
   closing "};" -- this looks like extraction residue; compare with the
   upstream file before compiling. */
636 static const
637 struct processor_costs k6_cost = {
638 COSTS_N_INSNS (1), /* cost of an add instruction */
639 COSTS_N_INSNS (2), /* cost of a lea instruction */
640 COSTS_N_INSNS (1), /* variable shift costs */
641 COSTS_N_INSNS (1), /* constant shift costs */
642 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
643 COSTS_N_INSNS (3), /* HI */
644 COSTS_N_INSNS (3), /* SI */
645 COSTS_N_INSNS (3), /* DI */
646 COSTS_N_INSNS (3)}, /* other */
647 0, /* cost of multiply per each bit set */
648 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
649 COSTS_N_INSNS (18), /* HI */
650 COSTS_N_INSNS (18), /* SI */
651 COSTS_N_INSNS (18), /* DI */
652 COSTS_N_INSNS (18)}, /* other */
653 COSTS_N_INSNS (2), /* cost of movsx */
654 COSTS_N_INSNS (2), /* cost of movzx */
655 8, /* "large" insn */
656 4, /* MOVE_RATIO */
658 /* All move costs are relative to integer->integer move times 2 and thus
    they are latency*2. */
660 3, /* cost for loading QImode using movzbl */
661 {4, 5, 4}, /* cost of loading integer registers
               in QImode, HImode and SImode.
               Relative to reg-reg move (2). */
664 {2, 3, 2}, /* cost of storing integer registers */
665 4, /* cost of reg,reg fld/fst */
666 {6, 6, 6}, /* cost of loading fp registers
               in SFmode, DFmode and XFmode */
668 {4, 4, 4}, /* cost of storing fp registers
               in SFmode, DFmode and XFmode */
670 2, /* cost of moving MMX register */
671 {2, 2}, /* cost of loading MMX registers
            in SImode and DImode */
673 {2, 2}, /* cost of storing MMX registers
            in SImode and DImode */
675 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
676 {2, 2, 8, 16, 32}, /* cost of loading SSE registers
                       in 32,64,128,256 and 512-bit */
678 {2, 2, 8, 16, 32}, /* cost of unaligned loads. */
679 {2, 2, 8, 16, 32}, /* cost of storing SSE registers
                       in 32,64,128,256 and 512-bit */
681 {2, 2, 8, 16, 32}, /* cost of unaligned stores. */
682 6, 6, /* SSE->integer and integer->SSE moves */
683 2, 2, /* Gather load static, per_elt. */
684 2, 2, /* Gather store (presumably scatter) static, per_elt. */
685 32, /* size of l1 cache. */
686 32, /* size of l2 cache. Some models
        have integrated l2 cache, but
        optimizing for k6 is not important
        enough to worry about that. */
690 32, /* size of prefetch block */
691 1, /* number of parallel prefetches */
692 1, /* Branch cost */
693 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
694 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
695 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
696 COSTS_N_INSNS (2), /* cost of FABS instruction. */
697 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
698 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
700 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
701 COSTS_N_INSNS (2), /* cost of ADDSS/SD SUBSS/SD insns. */
702 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
703 COSTS_N_INSNS (2), /* cost of MULSD instruction. */
704 COSTS_N_INSNS (4), /* cost of FMA SS instruction. */
705 COSTS_N_INSNS (4), /* cost of FMA SD instruction. */
706 COSTS_N_INSNS (56), /* cost of DIVSS instruction. */
707 COSTS_N_INSNS (56), /* cost of DIVSD instruction. */
708 COSTS_N_INSNS (56), /* cost of SQRTSS instruction. */
709 COSTS_N_INSNS (56), /* cost of SQRTSD instruction. */
710 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
711 k6_memcpy,
712 k6_memset,
713 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
714 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
717 /* For some reason, Athlon deals better with REP prefix (relative to loops)
    compared to K8.  Alignment becomes important after 8 bytes for memcpy and
    128 bytes for memset.

    The first element below lists {2048, rep_prefix_4_byte} followed by
    {-1, libcall}; the second element is the libcall-only placeholder
    DUMMY_STRINGOP_ALGS. */
720 static stringop_algs athlon_memcpy[2] = {
721 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
722 DUMMY_STRINGOP_ALGS};
/* Athlon memset strategy table.  The first element lists
   {2048, rep_prefix_4_byte} followed by {-1, libcall}; the second is the
   libcall-only placeholder DUMMY_STRINGOP_ALGS.
   NOTE(review): presumably "rep-prefixed 4-byte stores up to 2048 bytes,
   library call above that" -- confirm against the stringop_algs
   declaration. */
723 static stringop_algs athlon_memset[2] = {
724 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
725 DUMMY_STRINGOP_ALGS};
/* Tuning cost table for Athlon. Entries wrapped in COSTS_N_INSNS are
   instruction costs; the bare numbers in the move-cost section are, per the
   comment there, relative to a reg-reg move cost of 2. The meaning of each
   entry is given by its trailing comment. */
726 static const
727 struct processor_costs athlon_cost = {
728 COSTS_N_INSNS (1), /* cost of an add instruction */
729 COSTS_N_INSNS (2), /* cost of a lea instruction */
730 COSTS_N_INSNS (1), /* variable shift costs */
731 COSTS_N_INSNS (1), /* constant shift costs */
732 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
733 COSTS_N_INSNS (5), /* HI */
734 COSTS_N_INSNS (5), /* SI */
735 COSTS_N_INSNS (5), /* DI */
736 COSTS_N_INSNS (5)}, /* other */
737 0, /* cost of multiply per each bit set */
738 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
739 COSTS_N_INSNS (26), /* HI */
740 COSTS_N_INSNS (42), /* SI */
741 COSTS_N_INSNS (74), /* DI */
742 COSTS_N_INSNS (74)}, /* other */
743 COSTS_N_INSNS (1), /* cost of movsx */
744 COSTS_N_INSNS (1), /* cost of movzx */
745 8, /* "large" insn */
746 9, /* MOVE_RATIO */
748 /* All move costs are relative to integer->integer move times 2 and thus
749 they are latency*2. */
750 4, /* cost for loading QImode using movzbl */
751 {3, 4, 3}, /* cost of loading integer registers
752 in QImode, HImode and SImode.
753 Relative to reg-reg move (2). */
754 {3, 4, 3}, /* cost of storing integer registers */
755 4, /* cost of reg,reg fld/fst */
756 {4, 4, 12}, /* cost of loading fp registers
757 in SFmode, DFmode and XFmode */
758 {6, 6, 8}, /* cost of storing fp registers
759 in SFmode, DFmode and XFmode */
760 2, /* cost of moving MMX register */
761 {4, 4}, /* cost of loading MMX registers
762 in SImode and DImode */
763 {4, 4}, /* cost of storing MMX registers
764 in SImode and DImode */
765 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
766 {4, 4, 6, 12, 24}, /* cost of loading SSE registers
767 in 32,64,128,256 and 512-bit */
768 {4, 4, 6, 12, 24}, /* cost of unaligned loads. */
769 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
770 in 32,64,128,256 and 512-bit */
771 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
772 5, 5, /* SSE->integer and integer->SSE moves */
773 4, 4, /* Gather load static, per_elt. */
774 4, 4, /* Gather store static, per_elt. */
775 64, /* size of l1 cache. */
776 256, /* size of l2 cache. */
777 64, /* size of prefetch block */
778 6, /* number of parallel prefetches */
779 5, /* Branch cost */
780 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
781 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
782 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
783 COSTS_N_INSNS (2), /* cost of FABS instruction. */
784 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
785 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
787 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
788 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
789 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
790 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
791 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
792 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
793 /* 11-16 (presumably a latency range in cycles; TODO confirm) */
794 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
795 COSTS_N_INSNS (24), /* cost of DIVSD instruction. */
796 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
797 COSTS_N_INSNS (19), /* cost of SQRTSD instruction. */
798 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
799 athlon_memcpy,
800 athlon_memset,
801 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
802 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
805 /* K8 has optimized REP instruction for medium sized blocks, but for very
806 small blocks it is better to use loop. For large blocks, libcall can
807 do non-temporal accesses and beat inline considerably.

   k8_memcpy: the first entry lists loop for bound 6, unrolled_loop for
   bound 14 and rep_prefix_4_byte for bound -1; the second lists loop for
   bound 16, rep_prefix_8_byte for bound 8192 and libcall for bound -1.
   The bounds are presumably maximum block sizes with -1 meaning "any
   larger size", and the second entry (the only one using
   rep_prefix_8_byte) is presumably the 64-bit variant; confirm both
   against the stringop_algs declaration. */
808 static stringop_algs k8_memcpy[2] = {
809 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
810 {-1, rep_prefix_4_byte, false}}},
811 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
812 {-1, libcall, false}}}};
/* memset strategy table for K8. The first entry lists loop for bound 8,
   unrolled_loop for bound 24, rep_prefix_4_byte for bound 2048 and libcall
   for bound -1; the second lists unrolled_loop for bound 48,
   rep_prefix_8_byte for bound 8192 and libcall for bound -1. The bounds
   are presumably maximum block sizes (-1 = any larger size); confirm
   against the stringop_algs declaration. */
813 static stringop_algs k8_memset[2] = {
814 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
815 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
816 {libcall, {{48, unrolled_loop, false},
817 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Tuning cost table for K8. Entries wrapped in COSTS_N_INSNS are
   instruction costs; the bare numbers in the move-cost section are, per the
   comment there, relative to a reg-reg move cost of 2. The meaning of each
   entry is given by its trailing comment. */
818 static const
819 struct processor_costs k8_cost = {
820 COSTS_N_INSNS (1), /* cost of an add instruction */
821 COSTS_N_INSNS (2), /* cost of a lea instruction */
822 COSTS_N_INSNS (1), /* variable shift costs */
823 COSTS_N_INSNS (1), /* constant shift costs */
824 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
825 COSTS_N_INSNS (4), /* HI */
826 COSTS_N_INSNS (3), /* SI */
827 COSTS_N_INSNS (4), /* DI */
828 COSTS_N_INSNS (5)}, /* other */
829 0, /* cost of multiply per each bit set */
830 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
831 COSTS_N_INSNS (26), /* HI */
832 COSTS_N_INSNS (42), /* SI */
833 COSTS_N_INSNS (74), /* DI */
834 COSTS_N_INSNS (74)}, /* other */
835 COSTS_N_INSNS (1), /* cost of movsx */
836 COSTS_N_INSNS (1), /* cost of movzx */
837 8, /* "large" insn */
838 9, /* MOVE_RATIO */
840 /* All move costs are relative to integer->integer move times 2 and thus
841 they are latency*2. */
842 4, /* cost for loading QImode using movzbl */
843 {3, 4, 3}, /* cost of loading integer registers
844 in QImode, HImode and SImode.
845 Relative to reg-reg move (2). */
846 {3, 4, 3}, /* cost of storing integer registers */
847 4, /* cost of reg,reg fld/fst */
848 {4, 4, 12}, /* cost of loading fp registers
849 in SFmode, DFmode and XFmode */
850 {6, 6, 8}, /* cost of storing fp registers
851 in SFmode, DFmode and XFmode */
852 2, /* cost of moving MMX register */
853 {3, 3}, /* cost of loading MMX registers
854 in SImode and DImode */
855 {4, 4}, /* cost of storing MMX registers
856 in SImode and DImode */
857 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
858 {4, 3, 6, 12, 24}, /* cost of loading SSE registers
859 in 32,64,128,256 and 512-bit */
860 {4, 3, 6, 12, 24}, /* cost of unaligned loads. */
861 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
862 in 32,64,128,256 and 512-bit */
863 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
864 5, 5, /* SSE->integer and integer->SSE moves */
865 4, 4, /* Gather load static, per_elt. */
866 4, 4, /* Gather store static, per_elt. */
867 64, /* size of l1 cache. */
868 512, /* size of l2 cache. */
869 64, /* size of prefetch block */
870 /* New AMD processors never drop prefetches; if they cannot be performed
871 immediately, they are queued. We set number of simultaneous prefetches
872 to a large constant to reflect this (it is probably not a good idea to
873 leave the number of prefetches completely unlimited, as their execution
874 also takes some time). */
875 100, /* number of parallel prefetches */
876 3, /* Branch cost */
877 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
878 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
879 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
880 COSTS_N_INSNS (2), /* cost of FABS instruction. */
881 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
882 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
884 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
885 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
886 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
887 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
888 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
889 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
890 /* 11-16 (presumably a latency range in cycles; TODO confirm) */
891 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
892 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
893 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
894 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
895 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
896 k8_memcpy,
897 k8_memset,
898 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
899 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
902 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
903 very small blocks it is better to use loop. For large blocks, libcall can
904 do non-temporal accesses and beat inline considerably.

   amdfam10_memcpy: the first entry lists loop for bound 6, unrolled_loop
   for bound 14 and rep_prefix_4_byte for bound -1; the second lists loop
   for bound 16, rep_prefix_8_byte for bound 8192 and libcall for bound -1.
   The bounds are presumably maximum block sizes (-1 = any larger size);
   confirm against the stringop_algs declaration. */
905 static stringop_algs amdfam10_memcpy[2] = {
906 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
907 {-1, rep_prefix_4_byte, false}}},
908 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
909 {-1, libcall, false}}}};
/* memset strategy table for AMDFAM10. The first entry lists loop for
   bound 8, unrolled_loop for bound 24, rep_prefix_4_byte for bound 2048 and
   libcall for bound -1; the second lists unrolled_loop for bound 48,
   rep_prefix_8_byte for bound 8192 and libcall for bound -1. The bounds
   are presumably maximum block sizes (-1 = any larger size); confirm
   against the stringop_algs declaration. */
910 static stringop_algs amdfam10_memset[2] = {
911 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
912 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
913 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
914 {-1, libcall, false}}}};
/* Tuning cost table for AMDFAM10. Entries wrapped in COSTS_N_INSNS are
   instruction costs; the bare numbers in the move-cost section are, per the
   comment there, relative to a reg-reg move cost of 2. The meaning of each
   entry is given by its trailing comment.
   NOTE(review): declared without static/const, unlike athlon_cost and
   k8_cost earlier in this file; confirm whether that is intentional. */
915 struct processor_costs amdfam10_cost = {
916 COSTS_N_INSNS (1), /* cost of an add instruction */
917 COSTS_N_INSNS (2), /* cost of a lea instruction */
918 COSTS_N_INSNS (1), /* variable shift costs */
919 COSTS_N_INSNS (1), /* constant shift costs */
920 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
921 COSTS_N_INSNS (4), /* HI */
922 COSTS_N_INSNS (3), /* SI */
923 COSTS_N_INSNS (4), /* DI */
924 COSTS_N_INSNS (5)}, /* other */
925 0, /* cost of multiply per each bit set */
926 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
927 COSTS_N_INSNS (35), /* HI */
928 COSTS_N_INSNS (51), /* SI */
929 COSTS_N_INSNS (83), /* DI */
930 COSTS_N_INSNS (83)}, /* other */
931 COSTS_N_INSNS (1), /* cost of movsx */
932 COSTS_N_INSNS (1), /* cost of movzx */
933 8, /* "large" insn */
934 9, /* MOVE_RATIO */
936 /* All move costs are relative to integer->integer move times 2 and thus
937 they are latency*2. */
938 4, /* cost for loading QImode using movzbl */
939 {3, 4, 3}, /* cost of loading integer registers
940 in QImode, HImode and SImode.
941 Relative to reg-reg move (2). */
942 {3, 4, 3}, /* cost of storing integer registers */
943 4, /* cost of reg,reg fld/fst */
944 {4, 4, 12}, /* cost of loading fp registers
945 in SFmode, DFmode and XFmode */
946 {6, 6, 8}, /* cost of storing fp registers
947 in SFmode, DFmode and XFmode */
948 2, /* cost of moving MMX register */
949 {3, 3}, /* cost of loading MMX registers
950 in SImode and DImode */
951 {4, 4}, /* cost of storing MMX registers
952 in SImode and DImode */
953 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
954 {4, 4, 3, 6, 12}, /* cost of loading SSE registers
955 in 32,64,128,256 and 512-bit */
956 {4, 4, 3, 7, 12}, /* cost of unaligned loads. */
957 {4, 4, 5, 10, 20}, /* cost of storing SSE registers
958 in 32,64,128,256 and 512-bit */
959 {4, 4, 5, 10, 20}, /* cost of unaligned stores. */
960 3, 3, /* SSE->integer and integer->SSE moves */
961 /* On K8:
962 MOVD reg64, xmmreg Double FSTORE 4
963 MOVD reg32, xmmreg Double FSTORE 4
964 On AMDFAM10:
965 MOVD reg64, xmmreg Double FADD 3
966 1/1 1/1
967 MOVD reg32, xmmreg Double FADD 3
968 1/1 1/1 */
969 4, 4, /* Gather load static, per_elt. */
970 4, 4, /* Gather store static, per_elt. */
971 64, /* size of l1 cache. */
972 512, /* size of l2 cache. */
973 64, /* size of prefetch block */
974 /* New AMD processors never drop prefetches; if they cannot be performed
975 immediately, they are queued. We set number of simultaneous prefetches
976 to a large constant to reflect this (it is probably not a good idea to
977 leave the number of prefetches completely unlimited, as their execution
978 also takes some time). */
979 100, /* number of parallel prefetches */
980 2, /* Branch cost */
981 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
982 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
983 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
984 COSTS_N_INSNS (2), /* cost of FABS instruction. */
985 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
986 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
988 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
989 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
990 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
991 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
992 COSTS_N_INSNS (8), /* cost of FMA SS instruction. */
993 COSTS_N_INSNS (8), /* cost of FMA SD instruction. */
994 /* 11-16 (presumably a latency range in cycles; TODO confirm) */
995 COSTS_N_INSNS (16), /* cost of DIVSS instruction. */
996 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
997 COSTS_N_INSNS (19), /* cost of SQRTSS instruction. */
998 COSTS_N_INSNS (27), /* cost of SQRTSD instruction. */
999 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1000 amdfam10_memcpy,
1001 amdfam10_memset,
1002 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1003 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
1006 /* BDVER1 has optimized REP instruction for medium sized blocks, but for
1007 very small blocks it is better to use loop. For large blocks, libcall
1008 can do non-temporal accesses and beat inline considerably.

   bdver1_memcpy: the first entry lists loop for bound 6, unrolled_loop for
   bound 14 and rep_prefix_4_byte for bound -1; the second lists loop for
   bound 16, rep_prefix_8_byte for bound 8192 and libcall for bound -1.
   The bounds are presumably maximum block sizes (-1 = any larger size);
   confirm against the stringop_algs declaration. */
1009 static stringop_algs bdver1_memcpy[2] = {
1010 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1011 {-1, rep_prefix_4_byte, false}}},
1012 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1013 {-1, libcall, false}}}};
/* memset strategy table for BDVER1. The first entry lists loop for
   bound 8, unrolled_loop for bound 24, rep_prefix_4_byte for bound 2048 and
   libcall for bound -1; the second lists unrolled_loop for bound 48,
   rep_prefix_8_byte for bound 8192 and libcall for bound -1. The bounds
   are presumably maximum block sizes (-1 = any larger size); confirm
   against the stringop_algs declaration. */
1014 static stringop_algs bdver1_memset[2] = {
1015 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1016 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1017 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1018 {-1, libcall, false}}}};
/* Tuning cost table for BDVER1. Entries wrapped in COSTS_N_INSNS are
   instruction costs; the bare numbers in the move-cost section are, per the
   comment there, relative to a reg-reg move cost of 2. The meaning of each
   entry is given by its trailing comment.
   NOTE(review): declared const but not static, unlike athlon_cost and
   k8_cost earlier in this file; confirm whether that is intentional. */
1020 const struct processor_costs bdver1_cost = {
1021 COSTS_N_INSNS (1), /* cost of an add instruction */
1022 COSTS_N_INSNS (1), /* cost of a lea instruction */
1023 COSTS_N_INSNS (1), /* variable shift costs */
1024 COSTS_N_INSNS (1), /* constant shift costs */
1025 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1026 COSTS_N_INSNS (4), /* HI */
1027 COSTS_N_INSNS (4), /* SI */
1028 COSTS_N_INSNS (6), /* DI */
1029 COSTS_N_INSNS (6)}, /* other */
1030 0, /* cost of multiply per each bit set */
1031 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1032 COSTS_N_INSNS (35), /* HI */
1033 COSTS_N_INSNS (51), /* SI */
1034 COSTS_N_INSNS (83), /* DI */
1035 COSTS_N_INSNS (83)}, /* other */
1036 COSTS_N_INSNS (1), /* cost of movsx */
1037 COSTS_N_INSNS (1), /* cost of movzx */
1038 8, /* "large" insn */
1039 9, /* MOVE_RATIO */
1041 /* All move costs are relative to integer->integer move times 2 and thus
1042 they are latency*2. */
1043 8, /* cost for loading QImode using movzbl */
1044 {8, 8, 8}, /* cost of loading integer registers
1045 in QImode, HImode and SImode.
1046 Relative to reg-reg move (2). */
1047 {8, 8, 8}, /* cost of storing integer registers */
1048 4, /* cost of reg,reg fld/fst */
1049 {12, 12, 28}, /* cost of loading fp registers
1050 in SFmode, DFmode and XFmode */
1051 {10, 10, 18}, /* cost of storing fp registers
1052 in SFmode, DFmode and XFmode */
1053 4, /* cost of moving MMX register */
1054 {12, 12}, /* cost of loading MMX registers
1055 in SImode and DImode */
1056 {10, 10}, /* cost of storing MMX registers
1057 in SImode and DImode */
1058 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1059 {12, 12, 10, 20, 30}, /* cost of loading SSE registers
1060 in 32,64,128,256 and 512-bit */
1061 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
1062 {10, 10, 10, 20, 30}, /* cost of storing SSE registers
1063 in 32,64,128,256 and 512-bit */
1064 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
1065 16, 20, /* SSE->integer and integer->SSE moves */
1066 12, 12, /* Gather load static, per_elt. */
1067 10, 10, /* Gather store static, per_elt. */
1068 16, /* size of l1 cache. */
1069 2048, /* size of l2 cache. */
1070 64, /* size of prefetch block */
1071 /* New AMD processors never drop prefetches; if they cannot be performed
1072 immediately, they are queued. We set number of simultaneous prefetches
1073 to a large constant to reflect this (it is probably not a good idea to
1074 leave the number of prefetches completely unlimited, as their execution
1075 also takes some time). */
1076 100, /* number of parallel prefetches */
1077 2, /* Branch cost */
1078 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1079 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1080 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1081 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1082 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1083 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1085 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1086 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1087 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1088 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1089 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1090 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1091 /* 9-24 (presumably a latency range in cycles; TODO confirm) */
1092 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1093 /* 9-27 (presumably a latency range in cycles; TODO confirm) */
1094 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1095 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1096 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1097 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1098 bdver1_memcpy,
1099 bdver1_memset,
1100 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1101 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1104 /* BDVER2 has optimized REP instruction for medium sized blocks, but for
1105 very small blocks it is better to use loop. For large blocks, libcall
1106 can do non-temporal accesses and beat inline considerably.

   bdver2_memcpy: the first entry lists loop for bound 6, unrolled_loop for
   bound 14 and rep_prefix_4_byte for bound -1; the second lists loop for
   bound 16, rep_prefix_8_byte for bound 8192 and libcall for bound -1.
   The bounds are presumably maximum block sizes (-1 = any larger size);
   confirm against the stringop_algs declaration. */
1108 static stringop_algs bdver2_memcpy[2] = {
1109 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1110 {-1, rep_prefix_4_byte, false}}},
1111 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1112 {-1, libcall, false}}}};
/* memset strategy table for BDVER2. The first entry lists loop for
   bound 8, unrolled_loop for bound 24, rep_prefix_4_byte for bound 2048 and
   libcall for bound -1; the second lists unrolled_loop for bound 48,
   rep_prefix_8_byte for bound 8192 and libcall for bound -1. The bounds
   are presumably maximum block sizes (-1 = any larger size); confirm
   against the stringop_algs declaration. */
1113 static stringop_algs bdver2_memset[2] = {
1114 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1115 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1116 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1117 {-1, libcall, false}}}};
/* Tuning cost table for BDVER2. Entries wrapped in COSTS_N_INSNS are
   instruction costs; the bare numbers in the move-cost section are, per the
   comment there, relative to a reg-reg move cost of 2. The meaning of each
   entry is given by its trailing comment.
   NOTE(review): declared const but not static, unlike athlon_cost and
   k8_cost earlier in this file; confirm whether that is intentional. */
1119 const struct processor_costs bdver2_cost = {
1120 COSTS_N_INSNS (1), /* cost of an add instruction */
1121 COSTS_N_INSNS (1), /* cost of a lea instruction */
1122 COSTS_N_INSNS (1), /* variable shift costs */
1123 COSTS_N_INSNS (1), /* constant shift costs */
1124 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1125 COSTS_N_INSNS (4), /* HI */
1126 COSTS_N_INSNS (4), /* SI */
1127 COSTS_N_INSNS (6), /* DI */
1128 COSTS_N_INSNS (6)}, /* other */
1129 0, /* cost of multiply per each bit set */
1130 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1131 COSTS_N_INSNS (35), /* HI */
1132 COSTS_N_INSNS (51), /* SI */
1133 COSTS_N_INSNS (83), /* DI */
1134 COSTS_N_INSNS (83)}, /* other */
1135 COSTS_N_INSNS (1), /* cost of movsx */
1136 COSTS_N_INSNS (1), /* cost of movzx */
1137 8, /* "large" insn */
1138 9, /* MOVE_RATIO */
1140 /* All move costs are relative to integer->integer move times 2 and thus
1141 they are latency*2. */
1142 8, /* cost for loading QImode using movzbl */
1143 {8, 8, 8}, /* cost of loading integer registers
1144 in QImode, HImode and SImode.
1145 Relative to reg-reg move (2). */
1146 {8, 8, 8}, /* cost of storing integer registers */
1147 4, /* cost of reg,reg fld/fst */
1148 {12, 12, 28}, /* cost of loading fp registers
1149 in SFmode, DFmode and XFmode */
1150 {10, 10, 18}, /* cost of storing fp registers
1151 in SFmode, DFmode and XFmode */
1152 4, /* cost of moving MMX register */
1153 {12, 12}, /* cost of loading MMX registers
1154 in SImode and DImode */
1155 {10, 10}, /* cost of storing MMX registers
1156 in SImode and DImode */
1157 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1158 {12, 12, 10, 20, 30}, /* cost of loading SSE registers
1159 in 32,64,128,256 and 512-bit */
1160 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
1161 {10, 10, 10, 20, 30}, /* cost of storing SSE registers
1162 in 32,64,128,256 and 512-bit */
1163 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
1164 16, 20, /* SSE->integer and integer->SSE moves */
1165 12, 12, /* Gather load static, per_elt. */
1166 10, 10, /* Gather store static, per_elt. */
1167 16, /* size of l1 cache. */
1168 2048, /* size of l2 cache. */
1169 64, /* size of prefetch block */
1170 /* New AMD processors never drop prefetches; if they cannot be performed
1171 immediately, they are queued. We set number of simultaneous prefetches
1172 to a large constant to reflect this (it is probably not a good idea to
1173 leave the number of prefetches completely unlimited, as their execution
1174 also takes some time). */
1175 100, /* number of parallel prefetches */
1176 2, /* Branch cost */
1177 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1178 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1179 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1180 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1181 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1182 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1184 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1185 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1186 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1187 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1188 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1189 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1190 /* 9-24 (presumably a latency range in cycles; TODO confirm) */
1191 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1192 /* 9-27 (presumably a latency range in cycles; TODO confirm) */
1193 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1194 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1195 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1196 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1197 bdver2_memcpy,
1198 bdver2_memset,
1199 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1200 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1204 /* BDVER3 has optimized REP instruction for medium sized blocks, but for
1205 very small blocks it is better to use loop. For large blocks, libcall
1206 can do non-temporal accesses and beat inline considerably.

   bdver3_memcpy: the first entry lists loop for bound 6, unrolled_loop for
   bound 14 and rep_prefix_4_byte for bound -1; the second lists loop for
   bound 16, rep_prefix_8_byte for bound 8192 and libcall for bound -1.
   The bounds are presumably maximum block sizes (-1 = any larger size);
   confirm against the stringop_algs declaration. */
1207 static stringop_algs bdver3_memcpy[2] = {
1208 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1209 {-1, rep_prefix_4_byte, false}}},
1210 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1211 {-1, libcall, false}}}};
/* memset strategy table for BDVER3. The first entry lists loop for
   bound 8, unrolled_loop for bound 24, rep_prefix_4_byte for bound 2048 and
   libcall for bound -1; the second lists unrolled_loop for bound 48,
   rep_prefix_8_byte for bound 8192 and libcall for bound -1. The bounds
   are presumably maximum block sizes (-1 = any larger size); confirm
   against the stringop_algs declaration. */
1212 static stringop_algs bdver3_memset[2] = {
1213 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1214 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1215 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1216 {-1, libcall, false}}}};
/* Tuning cost table for BDVER3. Entries wrapped in COSTS_N_INSNS are
   instruction costs; the bare numbers in the move-cost section are, per the
   comment there, relative to a reg-reg move cost of 2. The meaning of each
   entry is given by its trailing comment.
   NOTE(review): declared without static/const, unlike athlon_cost and
   k8_cost earlier in this file; confirm whether that is intentional. */
1217 struct processor_costs bdver3_cost = {
1218 COSTS_N_INSNS (1), /* cost of an add instruction */
1219 COSTS_N_INSNS (1), /* cost of a lea instruction */
1220 COSTS_N_INSNS (1), /* variable shift costs */
1221 COSTS_N_INSNS (1), /* constant shift costs */
1222 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1223 COSTS_N_INSNS (4), /* HI */
1224 COSTS_N_INSNS (4), /* SI */
1225 COSTS_N_INSNS (6), /* DI */
1226 COSTS_N_INSNS (6)}, /* other */
1227 0, /* cost of multiply per each bit set */
1228 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1229 COSTS_N_INSNS (35), /* HI */
1230 COSTS_N_INSNS (51), /* SI */
1231 COSTS_N_INSNS (83), /* DI */
1232 COSTS_N_INSNS (83)}, /* other */
1233 COSTS_N_INSNS (1), /* cost of movsx */
1234 COSTS_N_INSNS (1), /* cost of movzx */
1235 8, /* "large" insn */
1236 9, /* MOVE_RATIO */
1238 /* All move costs are relative to integer->integer move times 2 and thus
1239 they are latency*2. */
1240 8, /* cost for loading QImode using movzbl */
1241 {8, 8, 8}, /* cost of loading integer registers
1242 in QImode, HImode and SImode.
1243 Relative to reg-reg move (2). */
1244 {8, 8, 8}, /* cost of storing integer registers */
1245 4, /* cost of reg,reg fld/fst */
1246 {12, 12, 28}, /* cost of loading fp registers
1247 in SFmode, DFmode and XFmode */
1248 {10, 10, 18}, /* cost of storing fp registers
1249 in SFmode, DFmode and XFmode */
1250 4, /* cost of moving MMX register */
1251 {12, 12}, /* cost of loading MMX registers
1252 in SImode and DImode */
1253 {10, 10}, /* cost of storing MMX registers
1254 in SImode and DImode */
1255 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1256 {12, 12, 10, 20, 30}, /* cost of loading SSE registers
1257 in 32,64,128,256 and 512-bit */
1258 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
1259 {10, 10, 10, 20, 30}, /* cost of storing SSE registers
1260 in 32,64,128,256 and 512-bit */
1261 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
1262 16, 20, /* SSE->integer and integer->SSE moves */
1263 12, 12, /* Gather load static, per_elt. */
1264 10, 10, /* Gather store static, per_elt. */
1265 16, /* size of l1 cache. */
1266 2048, /* size of l2 cache. */
1267 64, /* size of prefetch block */
1268 /* New AMD processors never drop prefetches; if they cannot be performed
1269 immediately, they are queued. We set number of simultaneous prefetches
1270 to a large constant to reflect this (it is probably not a good idea to
1271 leave the number of prefetches completely unlimited, as their execution
1272 also takes some time). */
1273 100, /* number of parallel prefetches */
1274 2, /* Branch cost */
1275 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1276 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1277 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1278 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1279 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1280 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1282 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1283 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1284 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1285 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1286 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1287 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1288 /* 9-24 (presumably a latency range in cycles; TODO confirm) */
1289 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1290 /* 9-27 (presumably a latency range in cycles; TODO confirm) */
1291 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1292 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1293 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1294 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1295 bdver3_memcpy,
1296 bdver3_memset,
1297 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1298 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1301 /* BDVER4 has optimized REP instruction for medium sized blocks, but for
1302 very small blocks it is better to use loop. For large blocks, libcall
1303 can do non-temporal accesses and beat inline considerably.

   bdver4_memcpy: the first entry lists loop for bound 6, unrolled_loop for
   bound 14 and rep_prefix_4_byte for bound -1; the second lists loop for
   bound 16, rep_prefix_8_byte for bound 8192 and libcall for bound -1.
   The bounds are presumably maximum block sizes (-1 = any larger size);
   confirm against the stringop_algs declaration. */
1304 static stringop_algs bdver4_memcpy[2] = {
1305 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1306 {-1, rep_prefix_4_byte, false}}},
1307 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1308 {-1, libcall, false}}}};
/* memset strategy table for BDVER4. The first entry lists loop for
   bound 8, unrolled_loop for bound 24, rep_prefix_4_byte for bound 2048 and
   libcall for bound -1; the second lists unrolled_loop for bound 48,
   rep_prefix_8_byte for bound 8192 and libcall for bound -1. The bounds
   are presumably maximum block sizes (-1 = any larger size); confirm
   against the stringop_algs declaration. */
1309 static stringop_algs bdver4_memset[2] = {
1310 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1311 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1312 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1313 {-1, libcall, false}}}};
/* Tuning cost table for BDVER4. Entries wrapped in COSTS_N_INSNS are
   instruction costs; the bare numbers in the move-cost section are, per the
   comment there, relative to a reg-reg move cost of 2. The meaning of each
   entry is given by its trailing comment.
   NOTE(review): declared without static/const, unlike athlon_cost and
   k8_cost earlier in this file; confirm whether that is intentional. */
1314 struct processor_costs bdver4_cost = {
1315 COSTS_N_INSNS (1), /* cost of an add instruction */
1316 COSTS_N_INSNS (1), /* cost of a lea instruction */
1317 COSTS_N_INSNS (1), /* variable shift costs */
1318 COSTS_N_INSNS (1), /* constant shift costs */
1319 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1320 COSTS_N_INSNS (4), /* HI */
1321 COSTS_N_INSNS (4), /* SI */
1322 COSTS_N_INSNS (6), /* DI */
1323 COSTS_N_INSNS (6)}, /* other */
1324 0, /* cost of multiply per each bit set */
1325 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1326 COSTS_N_INSNS (35), /* HI */
1327 COSTS_N_INSNS (51), /* SI */
1328 COSTS_N_INSNS (83), /* DI */
1329 COSTS_N_INSNS (83)}, /* other */
1330 COSTS_N_INSNS (1), /* cost of movsx */
1331 COSTS_N_INSNS (1), /* cost of movzx */
1332 8, /* "large" insn */
1333 9, /* MOVE_RATIO */
1335 /* All move costs are relative to integer->integer move times 2 and thus
1336 they are latency*2. */
1337 8, /* cost for loading QImode using movzbl */
1338 {8, 8, 8}, /* cost of loading integer registers
1339 in QImode, HImode and SImode.
1340 Relative to reg-reg move (2). */
1341 {8, 8, 8}, /* cost of storing integer registers */
1342 4, /* cost of reg,reg fld/fst */
1343 {12, 12, 28}, /* cost of loading fp registers
1344 in SFmode, DFmode and XFmode */
1345 {10, 10, 18}, /* cost of storing fp registers
1346 in SFmode, DFmode and XFmode */
1347 4, /* cost of moving MMX register */
1348 {12, 12}, /* cost of loading MMX registers
1349 in SImode and DImode */
1350 {10, 10}, /* cost of storing MMX registers
1351 in SImode and DImode */
1352 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1353 {12, 12, 10, 20, 30}, /* cost of loading SSE registers
1354 in 32,64,128,256 and 512-bit */
1355 {12, 12, 10, 20, 30}, /* cost of unaligned loads. */
1356 {10, 10, 10, 20, 30}, /* cost of storing SSE registers
1357 in 32,64,128,256 and 512-bit */
1358 {10, 10, 10, 20, 30}, /* cost of unaligned stores. */
1359 16, 20, /* SSE->integer and integer->SSE moves */
1360 12, 12, /* Gather load static, per_elt. */
1361 10, 10, /* Gather store static, per_elt. */
1362 16, /* size of l1 cache. */
1363 2048, /* size of l2 cache. */
1364 64, /* size of prefetch block */
1365 /* New AMD processors never drop prefetches; if they cannot be performed
1366 immediately, they are queued. We set number of simultaneous prefetches
1367 to a large constant to reflect this (it is probably not a good idea to
1368 leave the number of prefetches completely unlimited, as their execution
1369 also takes some time). */
1370 100, /* number of parallel prefetches */
1371 2, /* Branch cost */
1372 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1373 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1374 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1375 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1376 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1377 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1379 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1380 COSTS_N_INSNS (6), /* cost of ADDSS/SD SUBSS/SD insns. */
1381 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1382 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1383 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1384 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1385 /* 9-24 (presumably a latency range in cycles; TODO confirm) */
1386 COSTS_N_INSNS (24), /* cost of DIVSS instruction. */
1387 /* 9-27 (presumably a latency range in cycles; TODO confirm) */
1388 COSTS_N_INSNS (27), /* cost of DIVSD instruction. */
1389 COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */
1390 COSTS_N_INSNS (26), /* cost of SQRTSD instruction. */
1391 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1392 bdver4_memcpy,
1393 bdver4_memset,
1394 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1395 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1399 /* ZNVER1 has optimized REP instruction for medium sized blocks, but for
1400 very small blocks it is better to use loop. For large blocks, libcall
1401 can do non-temporal accesses and beat inline considerably.

   znver1_memcpy: the first entry lists loop for bound 6, unrolled_loop for
   bound 14 and rep_prefix_4_byte for bound -1; the second lists loop for
   bound 16, rep_prefix_8_byte for bound 8192 and libcall for bound -1.
   The bounds are presumably maximum block sizes (-1 = any larger size);
   confirm against the stringop_algs declaration. */
1402 static stringop_algs znver1_memcpy[2] = {
1403 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1404 {-1, rep_prefix_4_byte, false}}},
1405 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1406 {-1, libcall, false}}}};
/* memset strategy table for ZNVER1. The first entry lists loop for
   bound 8, unrolled_loop for bound 24, rep_prefix_4_byte for bound 2048 and
   libcall for bound -1; the second lists unrolled_loop for bound 48,
   rep_prefix_8_byte for bound 8192 and libcall for bound -1. The bounds
   are presumably maximum block sizes (-1 = any larger size); confirm
   against the stringop_algs declaration. */
1407 static stringop_algs znver1_memset[2] = {
1408 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1409 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1410 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1411 {-1, libcall, false}}}};
1412 struct processor_costs znver1_cost = {
1413 COSTS_N_INSNS (1), /* cost of an add instruction. */
1414 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1415 COSTS_N_INSNS (1), /* variable shift costs. */
1416 COSTS_N_INSNS (1), /* constant shift costs. */
1417 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1418 COSTS_N_INSNS (3), /* HI. */
1419 COSTS_N_INSNS (3), /* SI. */
1420 COSTS_N_INSNS (3), /* DI. */
1421 COSTS_N_INSNS (3)}, /* other. */
1422 0, /* cost of multiply per each bit
1423 set. */
1424 /* Depending on parameters, idiv can get faster on ryzen. This is upper
1425 bound. */
1426 {COSTS_N_INSNS (16), /* cost of a divide/mod for QI. */
1427 COSTS_N_INSNS (22), /* HI. */
1428 COSTS_N_INSNS (30), /* SI. */
1429 COSTS_N_INSNS (45), /* DI. */
1430 COSTS_N_INSNS (45)}, /* other. */
1431 COSTS_N_INSNS (1), /* cost of movsx. */
1432 COSTS_N_INSNS (1), /* cost of movzx. */
1433 8, /* "large" insn. */
1434 9, /* MOVE_RATIO. */
1436 /* All move costs are relative to integer->integer move times 2 and thus
1437 they are latency*2. */
1439 /* reg-reg moves are done by renaming and thus they are even cheaper than
1440 1 cycle. Because reg-reg move cost is 2 and the following tables correspond
1441 to doubles of latencies, we do not model this correctly. It does not
1442 seem to make practical difference to bump prices up even more. */
1443 6, /* cost for loading QImode using
1444 movzbl. */
1445 {6, 6, 6}, /* cost of loading integer registers
1446 in QImode, HImode and SImode.
1447 Relative to reg-reg move (2). */
1448 {8, 8, 8}, /* cost of storing integer
1449 registers. */
1450 2, /* cost of reg,reg fld/fst. */
1451 {6, 6, 16}, /* cost of loading fp registers
1452 in SFmode, DFmode and XFmode. */
1453 {8, 8, 16}, /* cost of storing fp registers
1454 in SFmode, DFmode and XFmode. */
1455 2, /* cost of moving MMX register. */
1456 {6, 6}, /* cost of loading MMX registers
1457 in SImode and DImode. */
1458 {8, 8}, /* cost of storing MMX registers
1459 in SImode and DImode. */
1460 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. */
1461 {6, 6, 6, 10, 20}, /* cost of loading SSE registers
1462 in 32,64,128,256 and 512-bit. */
1463 {6, 6, 6, 10, 20}, /* cost of unaligned loads. */
1464 {8, 8, 8, 8, 16}, /* cost of storing SSE registers
1465 in 32,64,128,256 and 512-bit. */
1466 {8, 8, 8, 8, 16}, /* cost of unaligned stores. */
1467 6, 6, /* SSE->integer and integer->SSE moves. */
1468 /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
1469 throughput 12. Approx 9 uops do not depend on vector size and every load
1470 is 7 uops. */
1471 18, 8, /* Gather load static, per_elt. */
1472 18, 10, /* Gather store static, per_elt. */
1473 32, /* size of l1 cache. */
1474 512, /* size of l2 cache. */
1475 64, /* size of prefetch block. */
1476 /* New AMD processors never drop prefetches; if they cannot be performed
1477 immediately, they are queued. We set number of simultaneous prefetches
1478 to a large constant to reflect this (it probably is not a good idea not
1479 to limit number of prefetches at all, as their execution also takes some
1480 time). */
1481 100, /* number of parallel prefetches. */
1482 3, /* Branch cost. */
1483 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1484 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1485 /* Latency of fdiv is 8-15. */
1486 COSTS_N_INSNS (15), /* cost of FDIV instruction. */
1487 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1488 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1489 /* Latency of fsqrt is 4-10. */
1490 COSTS_N_INSNS (10), /* cost of FSQRT instruction. */
1492 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1493 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1494 COSTS_N_INSNS (3), /* cost of MULSS instruction. */
1495 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1496 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1497 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1498 COSTS_N_INSNS (10), /* cost of DIVSS instruction. */
1499 /* 9-13 */
1500 COSTS_N_INSNS (13), /* cost of DIVSD instruction. */
1501 COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */
1502 COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */
1503 /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
1504 and it can execute 2 integer additions and 2 multiplications thus
1505 reassociation may make sense up to with of 6. SPEC2k6 bencharks suggests
1506 that 4 works better than 6 probably due to register pressure.
1508 Integer vector operations are taken by FP unit and execute 3 vector
1509 plus/minus operations per cycle but only one multiply. This is adjusted
1510 in ix86_reassociation_width. */
1511 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1512 znver1_memcpy,
1513 znver1_memset,
1514 COSTS_N_INSNS (4), /* cond_taken_branch_cost. */
1515 COSTS_N_INSNS (2), /* cond_not_taken_branch_cost. */
1518 /* BTVER1 has optimized REP instruction for medium sized blocks, but for
1519 very small blocks it is better to use loop. For large blocks, libcall can
1520 do nontemporal accesses and beat inline considerably. */
/* memcpy strategy table for btver1.  Entries are {size bound, algorithm,
   flag}; the last entry of each list uses -1 as its bound.  The two elements
   differ in REP width (4-byte vs. 8-byte), presumably 32-bit vs. 64-bit
   code -- confirm against the stringop_algs users.  */
1521 static stringop_algs btver1_memcpy[2] = {
1522 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1523 {-1, rep_prefix_4_byte, false}}},
1524 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1525 {-1, libcall, false}}}};
/* memset strategy table for btver1.  Entries are {size bound, algorithm,
   flag}; the last entry of each list uses -1 as its bound and falls back to
   libcall.  */
1526 static stringop_algs btver1_memset[2] = {
1527 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1528 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1529 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1530 {-1, libcall, false}}}};
/* Cost table for btver1.  Initializers are positional; the trailing comment
   on each line names the entry it sets.  */
1531 const struct processor_costs btver1_cost = {
1532 COSTS_N_INSNS (1), /* cost of an add instruction */
1533 COSTS_N_INSNS (2), /* cost of a lea instruction */
1534 COSTS_N_INSNS (1), /* variable shift costs */
1535 COSTS_N_INSNS (1), /* constant shift costs */
1536 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1537 COSTS_N_INSNS (4), /* HI */
1538 COSTS_N_INSNS (3), /* SI */
1539 COSTS_N_INSNS (4), /* DI */
1540 COSTS_N_INSNS (5)}, /* other */
1541 0, /* cost of multiply per each bit set */
1542 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1543 COSTS_N_INSNS (35), /* HI */
1544 COSTS_N_INSNS (51), /* SI */
1545 COSTS_N_INSNS (83), /* DI */
1546 COSTS_N_INSNS (83)}, /* other */
1547 COSTS_N_INSNS (1), /* cost of movsx */
1548 COSTS_N_INSNS (1), /* cost of movzx */
1549 8, /* "large" insn */
1550 9, /* MOVE_RATIO */
1552 /* All move costs are relative to integer->integer move times 2 and thus
1553 they are latency*2. */
1554 8, /* cost for loading QImode using movzbl */
1555 {6, 8, 6}, /* cost of loading integer registers
1556 in QImode, HImode and SImode.
1557 Relative to reg-reg move (2). */
1558 {6, 8, 6}, /* cost of storing integer registers */
1559 4, /* cost of reg,reg fld/fst */
1560 {12, 12, 28}, /* cost of loading fp registers
1561 in SFmode, DFmode and XFmode */
1562 {12, 12, 38}, /* cost of storing fp registers
1563 in SFmode, DFmode and XFmode */
1564 4, /* cost of moving MMX register */
1565 {10, 10}, /* cost of loading MMX registers
1566 in SImode and DImode */
1567 {12, 12}, /* cost of storing MMX registers
1568 in SImode and DImode */
1569 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1570 {10, 10, 12, 24, 48}, /* cost of loading SSE registers
1571 in 32,64,128,256 and 512-bit */
1572 {10, 10, 12, 24, 48}, /* cost of unaligned loads. */
1573 {10, 10, 12, 24, 48}, /* cost of storing SSE registers
1574 in 32,64,128,256 and 512-bit */
1575 {10, 10, 12, 24, 48}, /* cost of unaligned stores. */
1576 14, 14, /* SSE->integer and integer->SSE moves */
1577 10, 10, /* Gather load static, per_elt. */
1578 10, 10, /* Scatter store static, per_elt. */
1579 32, /* size of l1 cache. */
1580 512, /* size of l2 cache. */
1581 64, /* size of prefetch block */
1582 100, /* number of parallel prefetches */
1583 2, /* Branch cost */
1584 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1585 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1586 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1587 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1588 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1589 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1591 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1592 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1593 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1594 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1595 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1596 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1597 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1598 COSTS_N_INSNS (17), /* cost of DIVSD instruction. */
1599 COSTS_N_INSNS (14), /* cost of SQRTSS instruction. */
1600 COSTS_N_INSNS (48), /* cost of SQRTSD instruction. */
1601 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1602 btver1_memcpy,
1603 btver1_memset,
1604 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1605 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* memcpy strategy table for btver2.  Entries are {size bound, algorithm,
   flag}; the last entry of each list uses -1 as its bound.  The two elements
   differ in REP width (4-byte vs. 8-byte), presumably 32-bit vs. 64-bit
   code -- confirm against the stringop_algs users.  */
1608 static stringop_algs btver2_memcpy[2] = {
1609 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1610 {-1, rep_prefix_4_byte, false}}},
1611 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1612 {-1, libcall, false}}}};
/* memset strategy table for btver2.  Entries are {size bound, algorithm,
   flag}; the last entry of each list uses -1 as its bound and falls back to
   libcall.  */
1613 static stringop_algs btver2_memset[2] = {
1614 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1615 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1616 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1617 {-1, libcall, false}}}};
/* Cost table for btver2.  Initializers are positional; the trailing comment
   on each line names the entry it sets.  */
1618 const struct processor_costs btver2_cost = {
1619 COSTS_N_INSNS (1), /* cost of an add instruction */
1620 COSTS_N_INSNS (2), /* cost of a lea instruction */
1621 COSTS_N_INSNS (1), /* variable shift costs */
1622 COSTS_N_INSNS (1), /* constant shift costs */
1623 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1624 COSTS_N_INSNS (4), /* HI */
1625 COSTS_N_INSNS (3), /* SI */
1626 COSTS_N_INSNS (4), /* DI */
1627 COSTS_N_INSNS (5)}, /* other */
1628 0, /* cost of multiply per each bit set */
1629 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1630 COSTS_N_INSNS (35), /* HI */
1631 COSTS_N_INSNS (51), /* SI */
1632 COSTS_N_INSNS (83), /* DI */
1633 COSTS_N_INSNS (83)}, /* other */
1634 COSTS_N_INSNS (1), /* cost of movsx */
1635 COSTS_N_INSNS (1), /* cost of movzx */
1636 8, /* "large" insn */
1637 9, /* MOVE_RATIO */
1639 /* All move costs are relative to integer->integer move times 2 and thus
1640 they are latency*2. */
1641 8, /* cost for loading QImode using movzbl */
1642 {8, 8, 6}, /* cost of loading integer registers
1643 in QImode, HImode and SImode.
1644 Relative to reg-reg move (2). */
1645 {8, 8, 6}, /* cost of storing integer registers */
1646 4, /* cost of reg,reg fld/fst */
1647 {12, 12, 28}, /* cost of loading fp registers
1648 in SFmode, DFmode and XFmode */
1649 {12, 12, 38}, /* cost of storing fp registers
1650 in SFmode, DFmode and XFmode */
1651 4, /* cost of moving MMX register */
1652 {10, 10}, /* cost of loading MMX registers
1653 in SImode and DImode */
1654 {12, 12}, /* cost of storing MMX registers
1655 in SImode and DImode */
1656 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1657 {10, 10, 12, 24, 48}, /* cost of loading SSE registers
1658 in 32,64,128,256 and 512-bit */
1659 {10, 10, 12, 24, 48}, /* cost of unaligned loads. */
1660 {10, 10, 12, 24, 48}, /* cost of storing SSE registers
1661 in 32,64,128,256 and 512-bit */
1662 {10, 10, 12, 24, 48}, /* cost of unaligned stores. */
1663 14, 14, /* SSE->integer and integer->SSE moves */
1664 10, 10, /* Gather load static, per_elt. */
1665 10, 10, /* Scatter store static, per_elt. */
1666 32, /* size of l1 cache. */
1667 2048, /* size of l2 cache. */
1668 64, /* size of prefetch block */
1669 100, /* number of parallel prefetches */
1670 2, /* Branch cost */
1671 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1672 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1673 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1674 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1675 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1676 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1678 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1679 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
1680 COSTS_N_INSNS (2), /* cost of MULSS instruction. */
1681 COSTS_N_INSNS (4), /* cost of MULSD instruction. */
1682 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
1683 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
1684 COSTS_N_INSNS (13), /* cost of DIVSS instruction. */
1685 COSTS_N_INSNS (19), /* cost of DIVSD instruction. */
1686 COSTS_N_INSNS (16), /* cost of SQRTSS instruction. */
1687 COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */
1688 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1689 btver2_memcpy,
1690 btver2_memset,
1691 COSTS_N_INSNS (2), /* cond_taken_branch_cost. */
1692 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* memcpy strategy table for pentium4.  Entries are {size bound, algorithm,
   flag}; the last entry uses -1 as its bound.  The second element is
   DUMMY_STRINGOP_ALGS, which expands to a libcall-only table.  */
1695 static stringop_algs pentium4_memcpy[2] = {
1696 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1697 DUMMY_STRINGOP_ALGS};
/* memset strategy table for pentium4.  Entries are {size bound, algorithm,
   flag}; the last entry uses -1 as its bound and falls back to libcall.
   The second element is DUMMY_STRINGOP_ALGS, which expands to a
   libcall-only table.  */
1698 static stringop_algs pentium4_memset[2] = {
1699 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1700 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1701 DUMMY_STRINGOP_ALGS};
/* Cost table for pentium4.  Initializers are positional; the trailing
   comment on each line names the entry it sets.  */
1703 static const
1704 struct processor_costs pentium4_cost = {
1705 COSTS_N_INSNS (1), /* cost of an add instruction */
1706 COSTS_N_INSNS (3), /* cost of a lea instruction */
1707 COSTS_N_INSNS (4), /* variable shift costs */
1708 COSTS_N_INSNS (4), /* constant shift costs */
1709 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1710 COSTS_N_INSNS (15), /* HI */
1711 COSTS_N_INSNS (15), /* SI */
1712 COSTS_N_INSNS (15), /* DI */
1713 COSTS_N_INSNS (15)}, /* other */
1714 0, /* cost of multiply per each bit set */
1715 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1716 COSTS_N_INSNS (56), /* HI */
1717 COSTS_N_INSNS (56), /* SI */
1718 COSTS_N_INSNS (56), /* DI */
1719 COSTS_N_INSNS (56)}, /* other */
1720 COSTS_N_INSNS (1), /* cost of movsx */
1721 COSTS_N_INSNS (1), /* cost of movzx */
1722 16, /* "large" insn */
1723 6, /* MOVE_RATIO */
1725 /* All move costs are relative to integer->integer move times 2 and thus
1726 they are latency*2. */
1727 5, /* cost for loading QImode using movzbl */
1728 {4, 5, 4}, /* cost of loading integer registers
1729 in QImode, HImode and SImode.
1730 Relative to reg-reg move (2). */
1731 {2, 3, 2}, /* cost of storing integer registers */
1732 12, /* cost of reg,reg fld/fst */
1733 {14, 14, 14}, /* cost of loading fp registers
1734 in SFmode, DFmode and XFmode */
1735 {14, 14, 14}, /* cost of storing fp registers
1736 in SFmode, DFmode and XFmode */
1737 12, /* cost of moving MMX register */
1738 {16, 16}, /* cost of loading MMX registers
1739 in SImode and DImode */
1740 {16, 16}, /* cost of storing MMX registers
1741 in SImode and DImode */
1742 12, 24, 48, /* cost of moving XMM,YMM,ZMM register */
1743 {16, 16, 16, 32, 64}, /* cost of loading SSE registers
1744 in 32,64,128,256 and 512-bit */
1745 {32, 32, 32, 64, 128}, /* cost of unaligned loads. */
1746 {16, 16, 16, 32, 64}, /* cost of storing SSE registers
1747 in 32,64,128,256 and 512-bit */
1748 {32, 32, 32, 64, 128}, /* cost of unaligned stores. */
1749 20, 12, /* SSE->integer and integer->SSE moves */
1750 16, 16, /* Gather load static, per_elt. */
1751 16, 16, /* Scatter store static, per_elt. */
1752 8, /* size of l1 cache. */
1753 256, /* size of l2 cache. */
1754 64, /* size of prefetch block */
1755 6, /* number of parallel prefetches */
1756 2, /* Branch cost */
1757 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1758 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1759 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1760 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1761 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1762 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1764 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1765 COSTS_N_INSNS (4), /* cost of ADDSS/SD SUBSS/SD insns. */
1766 COSTS_N_INSNS (6), /* cost of MULSS instruction. */
1767 COSTS_N_INSNS (6), /* cost of MULSD instruction. */
1768 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1769 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1770 COSTS_N_INSNS (23), /* cost of DIVSS instruction. */
1771 COSTS_N_INSNS (38), /* cost of DIVSD instruction. */
1772 COSTS_N_INSNS (23), /* cost of SQRTSS instruction. */
1773 COSTS_N_INSNS (38), /* cost of SQRTSD instruction. */
1774 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1775 pentium4_memcpy,
1776 pentium4_memset,
1777 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1778 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* memcpy strategy table for nocona.  Entries are {size bound, algorithm,
   flag}; the last entry of each list uses -1 as its bound.  The two elements
   differ in REP width (4-byte vs. 8-byte), presumably 32-bit vs. 64-bit
   code -- confirm against the stringop_algs users.  */
1781 static stringop_algs nocona_memcpy[2] = {
1782 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1783 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1784 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
/* memset strategy table for nocona.  Entries are {size bound, algorithm,
   flag}; the last entry of each list uses -1 as its bound and falls back to
   libcall.  */
1786 static stringop_algs nocona_memset[2] = {
1787 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1788 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1789 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1790 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table for nocona.  Initializers are positional; the trailing comment
   on each line names the entry it sets.  */
1792 static const
1793 struct processor_costs nocona_cost = {
1794 COSTS_N_INSNS (1), /* cost of an add instruction */
1795 COSTS_N_INSNS (1), /* cost of a lea instruction */
1796 COSTS_N_INSNS (1), /* variable shift costs */
1797 COSTS_N_INSNS (1), /* constant shift costs */
1798 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1799 COSTS_N_INSNS (10), /* HI */
1800 COSTS_N_INSNS (10), /* SI */
1801 COSTS_N_INSNS (10), /* DI */
1802 COSTS_N_INSNS (10)}, /* other */
1803 0, /* cost of multiply per each bit set */
1804 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1805 COSTS_N_INSNS (66), /* HI */
1806 COSTS_N_INSNS (66), /* SI */
1807 COSTS_N_INSNS (66), /* DI */
1808 COSTS_N_INSNS (66)}, /* other */
1809 COSTS_N_INSNS (1), /* cost of movsx */
1810 COSTS_N_INSNS (1), /* cost of movzx */
1811 16, /* "large" insn */
1812 17, /* MOVE_RATIO */
1814 /* All move costs are relative to integer->integer move times 2 and thus
1815 they are latency*2. */
1816 4, /* cost for loading QImode using movzbl */
1817 {4, 4, 4}, /* cost of loading integer registers
1818 in QImode, HImode and SImode.
1819 Relative to reg-reg move (2). */
1820 {4, 4, 4}, /* cost of storing integer registers */
1821 12, /* cost of reg,reg fld/fst */
1822 {14, 14, 14}, /* cost of loading fp registers
1823 in SFmode, DFmode and XFmode */
1824 {14, 14, 14}, /* cost of storing fp registers
1825 in SFmode, DFmode and XFmode */
1826 14, /* cost of moving MMX register */
1827 {12, 12}, /* cost of loading MMX registers
1828 in SImode and DImode */
1829 {12, 12}, /* cost of storing MMX registers
1830 in SImode and DImode */
1831 6, 12, 24, /* cost of moving XMM,YMM,ZMM register */
1832 {12, 12, 12, 24, 48}, /* cost of loading SSE registers
1833 in 32,64,128,256 and 512-bit */
1834 {24, 24, 24, 48, 96}, /* cost of unaligned loads. */
1835 {12, 12, 12, 24, 48}, /* cost of storing SSE registers
1836 in 32,64,128,256 and 512-bit */
1837 {24, 24, 24, 48, 96}, /* cost of unaligned stores. */
1838 20, 12, /* SSE->integer and integer->SSE moves */
1839 12, 12, /* Gather load static, per_elt. */
1840 12, 12, /* Scatter store static, per_elt. */
1841 8, /* size of l1 cache. */
1842 1024, /* size of l2 cache. */
1843 64, /* size of prefetch block */
1844 8, /* number of parallel prefetches */
1845 1, /* Branch cost */
1846 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1847 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1848 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1849 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1850 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1851 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1853 COSTS_N_INSNS (2), /* cost of cheap SSE instruction. */
1854 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
1855 COSTS_N_INSNS (7), /* cost of MULSS instruction. */
1856 COSTS_N_INSNS (7), /* cost of MULSD instruction. */
1857 COSTS_N_INSNS (7), /* cost of FMA SS instruction. */
1858 COSTS_N_INSNS (7), /* cost of FMA SD instruction. */
1859 COSTS_N_INSNS (32), /* cost of DIVSS instruction. */
1860 COSTS_N_INSNS (40), /* cost of DIVSD instruction. */
1861 COSTS_N_INSNS (32), /* cost of SQRTSS instruction. */
1862 COSTS_N_INSNS (41), /* cost of SQRTSD instruction. */
1863 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1864 nocona_memcpy,
1865 nocona_memset,
1866 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1867 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* memcpy strategy table for atom.  Entries are {size bound, algorithm,
   flag}; the last entry of each list uses -1 as its bound.  Only the second
   element uses the 8-byte REP prefix, presumably because it serves 64-bit
   code -- confirm against the stringop_algs users.  */
1870 static stringop_algs atom_memcpy[2] = {
1871 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1872 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1873 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset strategy table for atom.  Entries are {size bound, algorithm,
   flag}; the last entry of each list uses -1 as its bound and falls back to
   libcall.  */
1874 static stringop_algs atom_memset[2] = {
1875 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1876 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1877 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1878 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table for atom.  Initializers are positional; the trailing comment
   on each line names the entry it sets.  */
1879 static const
1880 struct processor_costs atom_cost = {
1881 COSTS_N_INSNS (1), /* cost of an add instruction */
1882 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1883 COSTS_N_INSNS (1), /* variable shift costs */
1884 COSTS_N_INSNS (1), /* constant shift costs */
1885 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1886 COSTS_N_INSNS (4), /* HI */
1887 COSTS_N_INSNS (3), /* SI */
1888 COSTS_N_INSNS (4), /* DI */
1889 COSTS_N_INSNS (2)}, /* other */
1890 0, /* cost of multiply per each bit set */
1891 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1892 COSTS_N_INSNS (26), /* HI */
1893 COSTS_N_INSNS (42), /* SI */
1894 COSTS_N_INSNS (74), /* DI */
1895 COSTS_N_INSNS (74)}, /* other */
1896 COSTS_N_INSNS (1), /* cost of movsx */
1897 COSTS_N_INSNS (1), /* cost of movzx */
1898 8, /* "large" insn */
1899 17, /* MOVE_RATIO */
1901 /* All move costs are relative to integer->integer move times 2 and thus
1902 they are latency*2. */
1903 6, /* cost for loading QImode using movzbl */
1904 {6, 6, 6}, /* cost of loading integer registers
1905 in QImode, HImode and SImode.
1906 Relative to reg-reg move (2). */
1907 {6, 6, 6}, /* cost of storing integer registers */
1908 4, /* cost of reg,reg fld/fst */
1909 {6, 6, 18}, /* cost of loading fp registers
1910 in SFmode, DFmode and XFmode */
1911 {14, 14, 24}, /* cost of storing fp registers
1912 in SFmode, DFmode and XFmode */
1913 2, /* cost of moving MMX register */
1914 {8, 8}, /* cost of loading MMX registers
1915 in SImode and DImode */
1916 {10, 10}, /* cost of storing MMX registers
1917 in SImode and DImode */
1918 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
1919 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
1920 in 32,64,128,256 and 512-bit */
1921 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
1922 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
1923 in 32,64,128,256 and 512-bit */
1924 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
1925 8, 6, /* SSE->integer and integer->SSE moves */
1926 8, 8, /* Gather load static, per_elt. */
1927 8, 8, /* Scatter store static, per_elt. */
1928 32, /* size of l1 cache. */
1929 256, /* size of l2 cache. */
1930 64, /* size of prefetch block */
1931 6, /* number of parallel prefetches */
1932 3, /* Branch cost */
1933 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1934 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1935 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1936 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1937 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1938 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1940 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
1941 COSTS_N_INSNS (5), /* cost of ADDSS/SD SUBSS/SD insns. */
1942 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
1943 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
1944 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
1945 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
1946 COSTS_N_INSNS (31), /* cost of DIVSS instruction. */
1947 COSTS_N_INSNS (60), /* cost of DIVSD instruction. */
1948 COSTS_N_INSNS (31), /* cost of SQRTSS instruction. */
1949 COSTS_N_INSNS (63), /* cost of SQRTSD instruction. */
1950 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
1951 atom_memcpy,
1952 atom_memset,
1953 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
1954 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* memcpy strategy table for slm.  Entries are {size bound, algorithm,
   flag}; the last entry of each list uses -1 as its bound.  Only the second
   element uses the 8-byte REP prefix, presumably because it serves 64-bit
   code -- confirm against the stringop_algs users.  */
1957 static stringop_algs slm_memcpy[2] = {
1958 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1959 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1960 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset strategy table for slm.  Entries are {size bound, algorithm,
   flag}; the last entry of each list uses -1 as its bound and falls back to
   libcall.  */
1961 static stringop_algs slm_memset[2] = {
1962 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1963 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1964 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1965 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table for slm.  Initializers are positional; the trailing comment on
   each line names the entry it sets.  */
1966 static const
1967 struct processor_costs slm_cost = {
1968 COSTS_N_INSNS (1), /* cost of an add instruction */
1969 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1970 COSTS_N_INSNS (1), /* variable shift costs */
1971 COSTS_N_INSNS (1), /* constant shift costs */
1972 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1973 COSTS_N_INSNS (3), /* HI */
1974 COSTS_N_INSNS (3), /* SI */
1975 COSTS_N_INSNS (4), /* DI */
1976 COSTS_N_INSNS (2)}, /* other */
1977 0, /* cost of multiply per each bit set */
1978 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1979 COSTS_N_INSNS (26), /* HI */
1980 COSTS_N_INSNS (42), /* SI */
1981 COSTS_N_INSNS (74), /* DI */
1982 COSTS_N_INSNS (74)}, /* other */
1983 COSTS_N_INSNS (1), /* cost of movsx */
1984 COSTS_N_INSNS (1), /* cost of movzx */
1985 8, /* "large" insn */
1986 17, /* MOVE_RATIO */
1988 /* All move costs are relative to integer->integer move times 2 and thus
1989 they are latency*2. */
1990 8, /* cost for loading QImode using movzbl */
1991 {8, 8, 8}, /* cost of loading integer registers
1992 in QImode, HImode and SImode.
1993 Relative to reg-reg move (2). */
1994 {6, 6, 6}, /* cost of storing integer registers */
1995 2, /* cost of reg,reg fld/fst */
1996 {8, 8, 18}, /* cost of loading fp registers
1997 in SFmode, DFmode and XFmode */
1998 {6, 6, 18}, /* cost of storing fp registers
1999 in SFmode, DFmode and XFmode */
2000 2, /* cost of moving MMX register */
2001 {8, 8}, /* cost of loading MMX registers
2002 in SImode and DImode */
2003 {6, 6}, /* cost of storing MMX registers
2004 in SImode and DImode */
2005 2, 4, 8, /* cost of moving XMM,YMM,ZMM register */
2006 {8, 8, 8, 16, 32}, /* cost of loading SSE registers
2007 in 32,64,128,256 and 512-bit */
2008 {16, 16, 16, 32, 64}, /* cost of unaligned loads. */
2009 {8, 8, 8, 16, 32}, /* cost of storing SSE registers
2010 in 32,64,128,256 and 512-bit */
2011 {16, 16, 16, 32, 64}, /* cost of unaligned stores. */
2012 8, 6, /* SSE->integer and integer->SSE moves */
2013 8, 8, /* Gather load static, per_elt. */
2014 8, 8, /* Scatter store static, per_elt. */
2015 32, /* size of l1 cache. */
2016 256, /* size of l2 cache. */
2017 64, /* size of prefetch block */
2018 6, /* number of parallel prefetches */
2019 3, /* Branch cost */
2020 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2021 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2022 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2023 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2024 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2025 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2027 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2028 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2029 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2030 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2031 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2032 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2033 COSTS_N_INSNS (39), /* cost of DIVSS instruction. */
2034 COSTS_N_INSNS (69), /* cost of DIVSD instruction. */
2035 COSTS_N_INSNS (20), /* cost of SQRTSS instruction. */
2036 COSTS_N_INSNS (35), /* cost of SQRTSD instruction. */
2037 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2038 slm_memcpy,
2039 slm_memset,
2040 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2041 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
/* memcpy strategy table for the "intel" tuning.  Entries are {size bound,
   algorithm, flag}; the last entry of each list uses -1 as its bound.  Only
   the second element uses the 8-byte REP prefix, presumably because it
   serves 64-bit code -- confirm against the stringop_algs users.  */
2044 static stringop_algs intel_memcpy[2] = {
2045 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
2046 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
2047 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* memset strategy table for the "intel" tuning.  Entries are {size bound,
   algorithm, flag}; the last entry of each list uses -1 as its bound and
   falls back to libcall.  */
2048 static stringop_algs intel_memset[2] = {
2049 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
2050 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
2051 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
2052 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table for the "intel" tuning.  Initializers are positional; the
   trailing comment on each line names the entry it sets.  */
2053 static const
2054 struct processor_costs intel_cost = {
2055 COSTS_N_INSNS (1), /* cost of an add instruction */
2056 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2057 COSTS_N_INSNS (1), /* variable shift costs */
2058 COSTS_N_INSNS (1), /* constant shift costs */
2059 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2060 COSTS_N_INSNS (3), /* HI */
2061 COSTS_N_INSNS (3), /* SI */
2062 COSTS_N_INSNS (4), /* DI */
2063 COSTS_N_INSNS (2)}, /* other */
2064 0, /* cost of multiply per each bit set */
2065 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2066 COSTS_N_INSNS (26), /* HI */
2067 COSTS_N_INSNS (42), /* SI */
2068 COSTS_N_INSNS (74), /* DI */
2069 COSTS_N_INSNS (74)}, /* other */
2070 COSTS_N_INSNS (1), /* cost of movsx */
2071 COSTS_N_INSNS (1), /* cost of movzx */
2072 8, /* "large" insn */
2073 17, /* MOVE_RATIO */
2075 /* All move costs are relative to integer->integer move times 2 and thus
2076 they are latency*2. */
2077 6, /* cost for loading QImode using movzbl */
2078 {4, 4, 4}, /* cost of loading integer registers
2079 in QImode, HImode and SImode.
2080 Relative to reg-reg move (2). */
2081 {6, 6, 6}, /* cost of storing integer registers */
2082 2, /* cost of reg,reg fld/fst */
2083 {6, 6, 8}, /* cost of loading fp registers
2084 in SFmode, DFmode and XFmode */
2085 {6, 6, 10}, /* cost of storing fp registers
2086 in SFmode, DFmode and XFmode */
2087 2, /* cost of moving MMX register */
2088 {6, 6}, /* cost of loading MMX registers
2089 in SImode and DImode */
2090 {6, 6}, /* cost of storing MMX registers
2091 in SImode and DImode */
2092 2, 2, 2, /* cost of moving XMM,YMM,ZMM register */
2093 {6, 6, 6, 6, 6}, /* cost of loading SSE registers
2094 in 32,64,128,256 and 512-bit */
2095 {10, 10, 10, 10, 10}, /* cost of unaligned loads. */
2096 {6, 6, 6, 6, 6}, /* cost of storing SSE registers
2097 in 32,64,128,256 and 512-bit */
2098 {10, 10, 10, 10, 10}, /* cost of unaligned stores. */
2099 4, 4, /* SSE->integer and integer->SSE moves */
2100 6, 6, /* Gather load static, per_elt. */
2101 6, 6, /* Scatter store static, per_elt. */
2102 32, /* size of l1 cache. */
2103 256, /* size of l2 cache. */
2104 64, /* size of prefetch block */
2105 6, /* number of parallel prefetches */
2106 3, /* Branch cost */
2107 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2108 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2109 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2110 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2111 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2112 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2114 COSTS_N_INSNS (8), /* cost of cheap SSE instruction. */
2115 COSTS_N_INSNS (8), /* cost of ADDSS/SD SUBSS/SD insns. */
2116 COSTS_N_INSNS (8), /* cost of MULSS instruction. */
2117 COSTS_N_INSNS (8), /* cost of MULSD instruction. */
2118 COSTS_N_INSNS (6), /* cost of FMA SS instruction. */
2119 COSTS_N_INSNS (6), /* cost of FMA SD instruction. */
2120 COSTS_N_INSNS (20), /* cost of DIVSS instruction. */
2121 COSTS_N_INSNS (20), /* cost of DIVSD instruction. */
2122 COSTS_N_INSNS (40), /* cost of SQRTSS instruction. */
2123 COSTS_N_INSNS (40), /* cost of SQRTSD instruction. */
2124 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2125 intel_memcpy,
2126 intel_memset,
2127 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2128 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2131 /* Generic should produce code tuned for Core-i7 (and newer chips)
2132 and btver1 (and newer chips). */
/* memcpy strategy table for generic tuning.  Entries are {size bound,
   algorithm, flag}; the last entry of each list uses -1 as its bound and
   falls back to libcall.  The two elements differ only in REP width
   (4-byte vs. 8-byte), presumably 32-bit vs. 64-bit code -- confirm against
   the stringop_algs users.  */
2134 static stringop_algs generic_memcpy[2] = {
2135 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2136 {-1, libcall, false}}},
2137 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2138 {-1, libcall, false}}}};
/* memset strategy table for generic tuning.  Entries are {size bound,
   algorithm, flag}; the last entry of each list uses -1 as its bound and
   falls back to libcall.  The bounds and algorithms mirror generic_memcpy.  */
2139 static stringop_algs generic_memset[2] = {
2140 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
2141 {-1, libcall, false}}},
2142 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
2143 {-1, libcall, false}}}};
/* Cost table for the generic tuning.  The initializer is positional:
   every value must stay in the member order of struct processor_costs
   (declared outside this file section), and the trailing comment on each
   line names the member it fills.  Values wrapped in COSTS_N_INSNS are
   instruction costs; the bare integers in the move-cost section follow
   the "relative to reg-reg move (2)" convention stated below.  */
2144 static const
2145 struct processor_costs generic_cost = {
2146 COSTS_N_INSNS (1), /* cost of an add instruction */
2147 /* Setting cost to 2 makes our current implementation of synth_mult result in
2148 use of unnecessary temporary registers causing regression on several
2149 SPECfp benchmarks. */
2150 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2151 COSTS_N_INSNS (1), /* variable shift costs */
2152 COSTS_N_INSNS (1), /* constant shift costs */
2153 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2154 COSTS_N_INSNS (4), /* HI */
2155 COSTS_N_INSNS (3), /* SI */
2156 COSTS_N_INSNS (4), /* DI */
2157 COSTS_N_INSNS (2)}, /* other */
2158 0, /* cost of multiply per each bit set */
2159 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2160 COSTS_N_INSNS (26), /* HI */
2161 COSTS_N_INSNS (42), /* SI */
2162 COSTS_N_INSNS (74), /* DI */
2163 COSTS_N_INSNS (74)}, /* other */
2164 COSTS_N_INSNS (1), /* cost of movsx */
2165 COSTS_N_INSNS (1), /* cost of movzx */
2166 8, /* "large" insn */
2167 17, /* MOVE_RATIO */
2169 /* All move costs are relative to integer->integer move times 2 and thus
2170 they are latency*2. */
2171 4, /* cost for loading QImode using movzbl */
2172 {4, 4, 4}, /* cost of loading integer registers
2173 in QImode, HImode and SImode.
2174 Relative to reg-reg move (2). */
2175 {6, 6, 6}, /* cost of storing integer registers */
2176 4, /* cost of reg,reg fld/fst */
2177 {6, 6, 12}, /* cost of loading fp registers
2178 in SFmode, DFmode and XFmode */
2179 {6, 6, 12}, /* cost of storing fp registers
2180 in SFmode, DFmode and XFmode */
2181 2, /* cost of moving MMX register */
2182 {6, 6}, /* cost of loading MMX registers
2183 in SImode and DImode */
2184 {6, 6}, /* cost of storing MMX registers
2185 in SImode and DImode */
2186 2, 3, 4, /* cost of moving XMM,YMM,ZMM register */
2187 {6, 6, 6, 10, 15}, /* cost of loading SSE registers
2188 in 32,64,128,256 and 512-bit */
2189 {10, 10, 10, 15, 20}, /* cost of unaligned loads. */
2190 {6, 6, 6, 10, 15}, /* cost of storing SSE registers
2191 in 32,64,128,256 and 512-bit */
2192 {10, 10, 10, 15, 20}, /* cost of unaligned stores. */
2193 20, 20, /* SSE->integer and integer->SSE moves */
2194 6, 6, /* Gather load static, per_elt. */
2195 6, 6, /* Gather store static, per_elt. */
2196 32, /* size of l1 cache. */
2197 512, /* size of l2 cache. */
2198 64, /* size of prefetch block */
2199 6, /* number of parallel prefetches */
2200 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
2201 value is increased to the perhaps more appropriate value of 5. */
2202 3, /* Branch cost */
2203 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2204 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
2205 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2206 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2207 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2208 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2210 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2211 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2212 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2213 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2214 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2215 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2216 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
2217 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */
2218 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */
2219 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */
2220 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
2221 generic_memcpy,
2222 generic_memset,
2223 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2224 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */
2227 /* core_cost should produce code tuned for the Core family of CPUs. */
/* memcpy expansion table referenced from the core_cost initializer.
   Entry [0] uses rep_prefix_4_byte up to 1024 and libcall beyond that;
   entry [1] uses loop up to 24, rep_prefix_8_byte up to 128, then
   libcall.
   NOTE(review): presumably [0]/[1] are the 32-bit/64-bit variants and
   the third member of each triple is a per-step flag (true here for every
   non-libcall step) -- confirm against the stringop_algs declaration,
   which is not visible here.  */
2228 static stringop_algs core_memcpy[2] = {
2229 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2230 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2231 {-1, libcall, false}}}};
/* memset expansion table referenced from the core_cost initializer.
   Entry [0] steps through loop_1_byte (limit 6), loop (limit 24) and
   rep_prefix_4_byte (limit 8192) before falling back to libcall; entry [1]
   uses loop (limit 24) and rep_prefix_8_byte (limit 512), then libcall.
   NOTE(review): the meaning of the two entries and of the third member
   of each triple comes from the stringop_algs declaration, which is not
   visible here -- confirm before changing any value.  */
2232 static stringop_algs core_memset[2] = {
2233 {libcall, {{6, loop_1_byte, true},
2234 {24, loop, true},
2235 {8192, rep_prefix_4_byte, true},
2236 {-1, libcall, false}}},
2237 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2238 {-1, libcall, false}}}};
/* Cost table for the Core tuning.  The initializer is positional: every
   value must stay in the member order of struct processor_costs
   (declared outside this file section), and the trailing comment on each
   line names the member it fills.  Where a bare range comment such as
   "8-11" precedes a value, the value used is the upper end of that
   range.  */
2240 static const
2241 struct processor_costs core_cost = {
2242 COSTS_N_INSNS (1), /* cost of an add instruction */
2243 /* On all chips taken into consideration lea is 2 cycles and more. With
2244 this cost however our current implementation of synth_mult results in
2245 use of unnecessary temporary registers causing regression on several
2246 SPECfp benchmarks. */
2247 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2248 COSTS_N_INSNS (1), /* variable shift costs */
2249 COSTS_N_INSNS (1), /* constant shift costs */
2250 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2251 COSTS_N_INSNS (4), /* HI */
2252 COSTS_N_INSNS (3), /* SI */
2253 COSTS_N_INSNS (4), /* DI */
2254 COSTS_N_INSNS (4)}, /* other */
2255 0, /* cost of multiply per each bit set */
2256 {COSTS_N_INSNS (8), /* cost of a divide/mod for QI */
2257 COSTS_N_INSNS (8), /* HI */
2258 /* 8-11 */
2259 COSTS_N_INSNS (11), /* SI */
2260 /* 24-81 */
2261 COSTS_N_INSNS (81), /* DI */
2262 COSTS_N_INSNS (81)}, /* other */
2263 COSTS_N_INSNS (1), /* cost of movsx */
2264 COSTS_N_INSNS (1), /* cost of movzx */
2265 8, /* "large" insn */
2266 17, /* MOVE_RATIO */
2268 /* All move costs are relative to integer->integer move times 2 and thus
2269 they are latency*2. */
2270 6, /* cost for loading QImode using movzbl */
2271 {4, 4, 4}, /* cost of loading integer registers
2272 in QImode, HImode and SImode.
2273 Relative to reg-reg move (2). */
2274 {6, 6, 6}, /* cost of storing integer registers */
2275 2, /* cost of reg,reg fld/fst */
2276 {6, 6, 8}, /* cost of loading fp registers
2277 in SFmode, DFmode and XFmode */
2278 {6, 6, 10}, /* cost of storing fp registers
2279 in SFmode, DFmode and XFmode */
2280 2, /* cost of moving MMX register */
2281 {6, 6}, /* cost of loading MMX registers
2282 in SImode and DImode */
2283 {6, 6}, /* cost of storing MMX registers
2284 in SImode and DImode */
2285 2, 2, 4, /* cost of moving XMM,YMM,ZMM register */
2286 {6, 6, 6, 6, 12}, /* cost of loading SSE registers
2287 in 32,64,128,256 and 512-bit */
2288 {6, 6, 6, 6, 12}, /* cost of unaligned loads. */
2289 {6, 6, 6, 6, 12}, /* cost of storing SSE registers
2290 in 32,64,128,256 and 512-bit */
2291 {6, 6, 6, 6, 12}, /* cost of unaligned stores. */
2292 2, 2, /* SSE->integer and integer->SSE moves */
2293 /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
2294 rec. throughput 6.
2295 So 5 uops statically and one uop per load.
NOTE(review): both figures above are attributed to VGATHERDPD; the
second presumably describes a different gather form -- confirm and
correct the mnemonic. */
2296 10, 6, /* Gather load static, per_elt. */
2297 10, 6, /* Gather store static, per_elt. */
2298 64, /* size of l1 cache. */
2299 512, /* size of l2 cache. */
2300 64, /* size of prefetch block */
2301 6, /* number of parallel prefetches */
2302 /* FIXME perhaps more appropriate value is 5. */
2303 3, /* Branch cost */
2304 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
2305 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
2306 /* 10-24 */
2307 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
2308 COSTS_N_INSNS (1), /* cost of FABS instruction. */
2309 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
2310 COSTS_N_INSNS (23), /* cost of FSQRT instruction. */
2312 COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */
2313 COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. */
2314 COSTS_N_INSNS (4), /* cost of MULSS instruction. */
2315 COSTS_N_INSNS (5), /* cost of MULSD instruction. */
2316 COSTS_N_INSNS (5), /* cost of FMA SS instruction. */
2317 COSTS_N_INSNS (5), /* cost of FMA SD instruction. */
2318 COSTS_N_INSNS (18), /* cost of DIVSS instruction. */
2319 COSTS_N_INSNS (32), /* cost of DIVSD instruction. */
2320 COSTS_N_INSNS (30), /* cost of SQRTSS instruction. */
2321 COSTS_N_INSNS (58), /* cost of SQRTSD instruction. */
2322 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2323 core_memcpy,
2324 core_memset,
2325 COSTS_N_INSNS (3), /* cond_taken_branch_cost. */
2326 COSTS_N_INSNS (1), /* cond_not_taken_branch_cost. */