* x86-tune-sched.c (ix86_adjust_cost): Fix Zen support.
[official-gcc.git] / gcc / config / i386 / x86-tune-costs.h
blobd27072c0901f1a70d16d62f2c9faaa662012474a
2 /* Processor costs (relative to an add) */
3 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.
   COSTS_N_BYTES (N) expands to (N) * 2, so under that assumption a 2-byte
   instruction gets the same value as COSTS_N_INSNS (1). */
4 #define COSTS_N_BYTES(N) ((N) * 2)
/* Placeholder stringop_algs entry that names libcall in every position.
   It is used below as the second element of the memcpy/memset tables that
   spell out only their first element. */
6 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
/* memcpy strategy table referenced by ix86_size_cost.  Both elements name
   rep_prefix_1_byte everywhere and contain a single entry whose limit is -1.
   NOTE(review): -1 presumably means "no upper size limit", and the two
   elements presumably are the 32-bit and 64-bit variants -- confirm against
   the stringop_algs declaration.  */
8 static stringop_algs ix86_size_memcpy[2] = {
9 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
10 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
/* memset strategy table referenced by ix86_size_cost.  Both elements name
   rep_prefix_1_byte everywhere and contain a single entry whose limit is -1.
   NOTE(review): -1 presumably means "no upper size limit" -- confirm against
   the stringop_algs declaration.  */
11 static stringop_algs ix86_size_memset[2] = {
12 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
13 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}};
/* Cost table used when tuning for size.  Instruction costs are expressed
   with COSTS_N_BYTES rather than COSTS_N_INSNS.  The initializers are
   positional: the meaning of each value is given only by its trailing
   comment, so the order must not change.
   NOTE(review): the closing brace of this initializer is not present in
   this copy of the file.  */
15 const
16 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
17 COSTS_N_BYTES (2), /* cost of an add instruction */
18 COSTS_N_BYTES (3), /* cost of a lea instruction */
19 COSTS_N_BYTES (2), /* variable shift costs */
20 COSTS_N_BYTES (3), /* constant shift costs */
21 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
22 COSTS_N_BYTES (3), /* HI */
23 COSTS_N_BYTES (3), /* SI */
24 COSTS_N_BYTES (3), /* DI */
25 COSTS_N_BYTES (5)}, /* other */
26 0, /* cost of multiply per each bit set */
27 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
28 COSTS_N_BYTES (3), /* HI */
29 COSTS_N_BYTES (3), /* SI */
30 COSTS_N_BYTES (3), /* DI */
31 COSTS_N_BYTES (5)}, /* other */
32 COSTS_N_BYTES (3), /* cost of movsx */
33 COSTS_N_BYTES (3), /* cost of movzx */
34 0, /* "large" insn */
35 2, /* MOVE_RATIO */
36 2, /* cost for loading QImode using movzbl */
37 {2, 2, 2}, /* cost of loading integer registers
38 in QImode, HImode and SImode.
39 Relative to reg-reg move (2). */
40 {2, 2, 2}, /* cost of storing integer registers */
41 2, /* cost of reg,reg fld/fst */
42 {2, 2, 2}, /* cost of loading fp registers
43 in SFmode, DFmode and XFmode */
44 {2, 2, 2}, /* cost of storing fp registers
45 in SFmode, DFmode and XFmode */
46 3, /* cost of moving MMX register */
47 {3, 3}, /* cost of loading MMX registers
48 in SImode and DImode */
49 {3, 3}, /* cost of storing MMX registers
50 in SImode and DImode */
51 3, /* cost of moving SSE register */
52 {3, 3, 3}, /* cost of loading SSE registers
53 in SImode, DImode and TImode */
54 {3, 3, 3}, /* cost of storing SSE registers
55 in SImode, DImode and TImode */
56 3, /* MMX or SSE register to integer */
57 0, /* size of l1 cache */
58 0, /* size of l2 cache */
59 0, /* size of prefetch block */
60 0, /* number of parallel prefetches */
61 2, /* Branch cost */
62 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
63 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
64 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
65 COSTS_N_BYTES (2), /* cost of FABS instruction. */
66 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
67 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
68 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
69 ix86_size_memcpy,
70 ix86_size_memset,
71 1, /* scalar_stmt_cost. */
72 1, /* scalar_load_cost. */
73 1, /* scalar_store_cost. */
74 1, /* vec_stmt_cost. */
75 1, /* vec_to_scalar_cost. */
76 1, /* scalar_to_vec_cost. */
77 1, /* vec_align_load_cost. */
78 1, /* vec_unalign_load_cost. */
79 1, /* vec_store_cost. */
80 1, /* cond_taken_branch_cost. */
81 1, /* cond_not_taken_branch_cost. */
84 /* Processor costs (relative to an add) */
/* memcpy strategy table referenced by i386_cost.  The first element names
   rep_prefix_1_byte everywhere; the second is the DUMMY_STRINGOP_ALGS
   placeholder.  */
85 static stringop_algs i386_memcpy[2] = {
86 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
87 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by i386_cost.  The first element names
   rep_prefix_1_byte everywhere; the second is the DUMMY_STRINGOP_ALGS
   placeholder.  */
88 static stringop_algs i386_memset[2] = {
89 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
90 DUMMY_STRINGOP_ALGS};
/* Cost table for the 386.  Instruction costs are expressed with
   COSTS_N_INSNS.  The initializers are positional: the meaning of each
   value is given only by its trailing comment, so the order must not
   change.
   NOTE(review): the closing brace of this initializer is not present in
   this copy of the file.  */
92 static const
93 struct processor_costs i386_cost = { /* 386 specific costs */
94 COSTS_N_INSNS (1), /* cost of an add instruction */
95 COSTS_N_INSNS (1), /* cost of a lea instruction */
96 COSTS_N_INSNS (3), /* variable shift costs */
97 COSTS_N_INSNS (2), /* constant shift costs */
98 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
99 COSTS_N_INSNS (6), /* HI */
100 COSTS_N_INSNS (6), /* SI */
101 COSTS_N_INSNS (6), /* DI */
102 COSTS_N_INSNS (6)}, /* other */
103 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
104 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
105 COSTS_N_INSNS (23), /* HI */
106 COSTS_N_INSNS (23), /* SI */
107 COSTS_N_INSNS (23), /* DI */
108 COSTS_N_INSNS (23)}, /* other */
109 COSTS_N_INSNS (3), /* cost of movsx */
110 COSTS_N_INSNS (2), /* cost of movzx */
111 15, /* "large" insn */
112 3, /* MOVE_RATIO */
113 4, /* cost for loading QImode using movzbl */
114 {2, 4, 2}, /* cost of loading integer registers
115 in QImode, HImode and SImode.
116 Relative to reg-reg move (2). */
117 {2, 4, 2}, /* cost of storing integer registers */
118 2, /* cost of reg,reg fld/fst */
119 {8, 8, 8}, /* cost of loading fp registers
120 in SFmode, DFmode and XFmode */
121 {8, 8, 8}, /* cost of storing fp registers
122 in SFmode, DFmode and XFmode */
123 2, /* cost of moving MMX register */
124 {4, 8}, /* cost of loading MMX registers
125 in SImode and DImode */
126 {4, 8}, /* cost of storing MMX registers
127 in SImode and DImode */
128 2, /* cost of moving SSE register */
129 {4, 8, 16}, /* cost of loading SSE registers
130 in SImode, DImode and TImode */
131 {4, 8, 16}, /* cost of storing SSE registers
132 in SImode, DImode and TImode */
133 3, /* MMX or SSE register to integer */
134 0, /* size of l1 cache */
135 0, /* size of l2 cache */
136 0, /* size of prefetch block */
137 0, /* number of parallel prefetches */
138 1, /* Branch cost */
139 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
140 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
141 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
142 COSTS_N_INSNS (22), /* cost of FABS instruction. */
143 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
144 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
145 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
146 i386_memcpy,
147 i386_memset,
148 1, /* scalar_stmt_cost. */
149 1, /* scalar_load_cost. */
150 1, /* scalar_store_cost. */
151 1, /* vec_stmt_cost. */
152 1, /* vec_to_scalar_cost. */
153 1, /* scalar_to_vec_cost. */
154 1, /* vec_align_load_cost. */
155 2, /* vec_unalign_load_cost. */
156 1, /* vec_store_cost. */
157 3, /* cond_taken_branch_cost. */
158 1, /* cond_not_taken_branch_cost. */
/* memcpy strategy table referenced by i486_cost.  The first element names
   rep_prefix_4_byte everywhere; the second is the DUMMY_STRINGOP_ALGS
   placeholder.  */
161 static stringop_algs i486_memcpy[2] = {
162 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
163 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by i486_cost.  The first element names
   rep_prefix_4_byte everywhere; the second is the DUMMY_STRINGOP_ALGS
   placeholder.  */
164 static stringop_algs i486_memset[2] = {
165 {rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
166 DUMMY_STRINGOP_ALGS};
/* Cost table for the 486.  Instruction costs are expressed with
   COSTS_N_INSNS.  The initializers are positional: the meaning of each
   value is given only by its trailing comment, so the order must not
   change.
   NOTE(review): the closing brace of this initializer is not present in
   this copy of the file.  */
168 static const
169 struct processor_costs i486_cost = { /* 486 specific costs */
170 COSTS_N_INSNS (1), /* cost of an add instruction */
171 COSTS_N_INSNS (1), /* cost of a lea instruction */
172 COSTS_N_INSNS (3), /* variable shift costs */
173 COSTS_N_INSNS (2), /* constant shift costs */
174 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
175 COSTS_N_INSNS (12), /* HI */
176 COSTS_N_INSNS (12), /* SI */
177 COSTS_N_INSNS (12), /* DI */
178 COSTS_N_INSNS (12)}, /* other */
179 1, /* cost of multiply per each bit set */
180 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
181 COSTS_N_INSNS (40), /* HI */
182 COSTS_N_INSNS (40), /* SI */
183 COSTS_N_INSNS (40), /* DI */
184 COSTS_N_INSNS (40)}, /* other */
185 COSTS_N_INSNS (3), /* cost of movsx */
186 COSTS_N_INSNS (2), /* cost of movzx */
187 15, /* "large" insn */
188 3, /* MOVE_RATIO */
189 4, /* cost for loading QImode using movzbl */
190 {2, 4, 2}, /* cost of loading integer registers
191 in QImode, HImode and SImode.
192 Relative to reg-reg move (2). */
193 {2, 4, 2}, /* cost of storing integer registers */
194 2, /* cost of reg,reg fld/fst */
195 {8, 8, 8}, /* cost of loading fp registers
196 in SFmode, DFmode and XFmode */
197 {8, 8, 8}, /* cost of storing fp registers
198 in SFmode, DFmode and XFmode */
199 2, /* cost of moving MMX register */
200 {4, 8}, /* cost of loading MMX registers
201 in SImode and DImode */
202 {4, 8}, /* cost of storing MMX registers
203 in SImode and DImode */
204 2, /* cost of moving SSE register */
205 {4, 8, 16}, /* cost of loading SSE registers
206 in SImode, DImode and TImode */
207 {4, 8, 16}, /* cost of storing SSE registers
208 in SImode, DImode and TImode */
209 3, /* MMX or SSE register to integer */
210 4, /* size of l1 cache. 486 has 8kB cache
211 shared for code and data, so 4kB is
212 not really precise. */
213 4, /* size of l2 cache */
214 0, /* size of prefetch block */
215 0, /* number of parallel prefetches */
216 1, /* Branch cost */
217 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
218 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
219 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
220 COSTS_N_INSNS (3), /* cost of FABS instruction. */
221 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
222 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
223 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
224 i486_memcpy,
225 i486_memset,
226 1, /* scalar_stmt_cost. */
227 1, /* scalar_load_cost. */
228 1, /* scalar_store_cost. */
229 1, /* vec_stmt_cost. */
230 1, /* vec_to_scalar_cost. */
231 1, /* scalar_to_vec_cost. */
232 1, /* vec_align_load_cost. */
233 2, /* vec_unalign_load_cost. */
234 1, /* vec_store_cost. */
235 3, /* cond_taken_branch_cost. */
236 1, /* cond_not_taken_branch_cost. */
/* memcpy strategy table referenced by pentium_cost and lakemont_cost.  The
   first element lists rep_prefix_4_byte with limit 256 followed by libcall
   with limit -1; the second is the DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): the limits presumably are block sizes in bytes, with -1
   meaning "no upper limit" -- confirm against the stringop_algs
   declaration.  */
239 static stringop_algs pentium_memcpy[2] = {
240 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
241 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by pentium_cost and lakemont_cost.  The
   first element has a single rep_prefix_4_byte entry with limit -1; the
   second is the DUMMY_STRINGOP_ALGS placeholder.  */
242 static stringop_algs pentium_memset[2] = {
243 {libcall, {{-1, rep_prefix_4_byte, false}}},
244 DUMMY_STRINGOP_ALGS};
/* Cost table for the Pentium.  Instruction costs are expressed with
   COSTS_N_INSNS.  The initializers are positional: the meaning of each
   value is given only by its trailing comment, so the order must not
   change.
   NOTE(review): the closing brace of this initializer is not present in
   this copy of the file.  */
246 static const
247 struct processor_costs pentium_cost = {
248 COSTS_N_INSNS (1), /* cost of an add instruction */
249 COSTS_N_INSNS (1), /* cost of a lea instruction */
250 COSTS_N_INSNS (4), /* variable shift costs */
251 COSTS_N_INSNS (1), /* constant shift costs */
252 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
253 COSTS_N_INSNS (11), /* HI */
254 COSTS_N_INSNS (11), /* SI */
255 COSTS_N_INSNS (11), /* DI */
256 COSTS_N_INSNS (11)}, /* other */
257 0, /* cost of multiply per each bit set */
258 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
259 COSTS_N_INSNS (25), /* HI */
260 COSTS_N_INSNS (25), /* SI */
261 COSTS_N_INSNS (25), /* DI */
262 COSTS_N_INSNS (25)}, /* other */
263 COSTS_N_INSNS (3), /* cost of movsx */
264 COSTS_N_INSNS (2), /* cost of movzx */
265 8, /* "large" insn */
266 6, /* MOVE_RATIO */
267 6, /* cost for loading QImode using movzbl */
268 {2, 4, 2}, /* cost of loading integer registers
269 in QImode, HImode and SImode.
270 Relative to reg-reg move (2). */
271 {2, 4, 2}, /* cost of storing integer registers */
272 2, /* cost of reg,reg fld/fst */
273 {2, 2, 6}, /* cost of loading fp registers
274 in SFmode, DFmode and XFmode */
275 {4, 4, 6}, /* cost of storing fp registers
276 in SFmode, DFmode and XFmode */
277 8, /* cost of moving MMX register */
278 {8, 8}, /* cost of loading MMX registers
279 in SImode and DImode */
280 {8, 8}, /* cost of storing MMX registers
281 in SImode and DImode */
282 2, /* cost of moving SSE register */
283 {4, 8, 16}, /* cost of loading SSE registers
284 in SImode, DImode and TImode */
285 {4, 8, 16}, /* cost of storing SSE registers
286 in SImode, DImode and TImode */
287 3, /* MMX or SSE register to integer */
288 8, /* size of l1 cache. */
289 8, /* size of l2 cache */
290 0, /* size of prefetch block */
291 0, /* number of parallel prefetches */
292 2, /* Branch cost */
293 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
294 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
295 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
296 COSTS_N_INSNS (1), /* cost of FABS instruction. */
297 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
298 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
299 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
300 pentium_memcpy,
301 pentium_memset,
302 1, /* scalar_stmt_cost. */
303 1, /* scalar_load_cost. */
304 1, /* scalar_store_cost. */
305 1, /* vec_stmt_cost. */
306 1, /* vec_to_scalar_cost. */
307 1, /* scalar_to_vec_cost. */
308 1, /* vec_align_load_cost. */
309 2, /* vec_unalign_load_cost. */
310 1, /* vec_store_cost. */
311 3, /* cond_taken_branch_cost. */
312 1, /* cond_not_taken_branch_cost. */
/* Cost table for Lakemont.  It has no string-operation tables of its own
   and refers to pentium_memcpy and pentium_memset.  The lea cost is
   COSTS_N_INSNS (1) + 1, i.e. slightly above the add cost.  The
   initializers are positional: the meaning of each value is given only by
   its trailing comment, so the order must not change.
   NOTE(review): the closing brace of this initializer is not present in
   this copy of the file.  */
315 static const
316 struct processor_costs lakemont_cost = {
317 COSTS_N_INSNS (1), /* cost of an add instruction */
318 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
319 COSTS_N_INSNS (1), /* variable shift costs */
320 COSTS_N_INSNS (1), /* constant shift costs */
321 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
322 COSTS_N_INSNS (11), /* HI */
323 COSTS_N_INSNS (11), /* SI */
324 COSTS_N_INSNS (11), /* DI */
325 COSTS_N_INSNS (11)}, /* other */
326 0, /* cost of multiply per each bit set */
327 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
328 COSTS_N_INSNS (25), /* HI */
329 COSTS_N_INSNS (25), /* SI */
330 COSTS_N_INSNS (25), /* DI */
331 COSTS_N_INSNS (25)}, /* other */
332 COSTS_N_INSNS (3), /* cost of movsx */
333 COSTS_N_INSNS (2), /* cost of movzx */
334 8, /* "large" insn */
335 17, /* MOVE_RATIO */
336 6, /* cost for loading QImode using movzbl */
337 {2, 4, 2}, /* cost of loading integer registers
338 in QImode, HImode and SImode.
339 Relative to reg-reg move (2). */
340 {2, 4, 2}, /* cost of storing integer registers */
341 2, /* cost of reg,reg fld/fst */
342 {2, 2, 6}, /* cost of loading fp registers
343 in SFmode, DFmode and XFmode */
344 {4, 4, 6}, /* cost of storing fp registers
345 in SFmode, DFmode and XFmode */
346 8, /* cost of moving MMX register */
347 {8, 8}, /* cost of loading MMX registers
348 in SImode and DImode */
349 {8, 8}, /* cost of storing MMX registers
350 in SImode and DImode */
351 2, /* cost of moving SSE register */
352 {4, 8, 16}, /* cost of loading SSE registers
353 in SImode, DImode and TImode */
354 {4, 8, 16}, /* cost of storing SSE registers
355 in SImode, DImode and TImode */
356 3, /* MMX or SSE register to integer */
357 8, /* size of l1 cache. */
358 8, /* size of l2 cache */
359 0, /* size of prefetch block */
360 0, /* number of parallel prefetches */
361 2, /* Branch cost */
362 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
363 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
364 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
365 COSTS_N_INSNS (1), /* cost of FABS instruction. */
366 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
367 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
368 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
369 pentium_memcpy,
370 pentium_memset,
371 1, /* scalar_stmt_cost. */
372 1, /* scalar_load_cost. */
373 1, /* scalar_store_cost. */
374 1, /* vec_stmt_cost. */
375 1, /* vec_to_scalar_cost. */
376 1, /* scalar_to_vec_cost. */
377 1, /* vec_align_load_cost. */
378 2, /* vec_unalign_load_cost. */
379 1, /* vec_store_cost. */
380 3, /* cond_taken_branch_cost. */
381 1, /* cond_not_taken_branch_cost. */
384 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
385 (we ensure the alignment). For small blocks an inline loop is still a
386 noticeable win; for bigger blocks either rep movsl or rep movsb is
387 the way to go. Rep movsb apparently has a more expensive startup time in
388 the CPU, but after 4K the difference is down in the noise. */
/* memcpy strategy table referenced by pentiumpro_cost.  The first element
   lists, in order: loop (limit 128), unrolled_loop (limit 1024),
   rep_prefix_4_byte (limit 8192) and rep_prefix_1_byte (limit -1).  The
   second element is the DUMMY_STRINGOP_ALGS placeholder.
   NOTE(review): the limits presumably are block sizes in bytes, with -1
   meaning "no upper limit" -- confirm against the stringop_algs
   declaration.  */
389 static stringop_algs pentiumpro_memcpy[2] = {
390 {rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
391 {8192, rep_prefix_4_byte, false},
392 {-1, rep_prefix_1_byte, false}}},
393 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by pentiumpro_cost.  The first element
   lists, in order: unrolled_loop (limit 1024), rep_prefix_4_byte (limit
   8192) and libcall (limit -1).  The second element is the
   DUMMY_STRINGOP_ALGS placeholder.  */
394 static stringop_algs pentiumpro_memset[2] = {
395 {rep_prefix_4_byte, {{1024, unrolled_loop, false},
396 {8192, rep_prefix_4_byte, false},
397 {-1, libcall, false}}},
398 DUMMY_STRINGOP_ALGS};
/* Cost table for the PentiumPro.  Instruction costs are expressed with
   COSTS_N_INSNS.  The initializers are positional: the meaning of each
   value is given only by its trailing comment, so the order must not
   change.
   NOTE(review): the closing brace of this initializer is not present in
   this copy of the file.  */
399 static const
400 struct processor_costs pentiumpro_cost = {
401 COSTS_N_INSNS (1), /* cost of an add instruction */
402 COSTS_N_INSNS (1), /* cost of a lea instruction */
403 COSTS_N_INSNS (1), /* variable shift costs */
404 COSTS_N_INSNS (1), /* constant shift costs */
405 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
406 COSTS_N_INSNS (4), /* HI */
407 COSTS_N_INSNS (4), /* SI */
408 COSTS_N_INSNS (4), /* DI */
409 COSTS_N_INSNS (4)}, /* other */
410 0, /* cost of multiply per each bit set */
411 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
412 COSTS_N_INSNS (17), /* HI */
413 COSTS_N_INSNS (17), /* SI */
414 COSTS_N_INSNS (17), /* DI */
415 COSTS_N_INSNS (17)}, /* other */
416 COSTS_N_INSNS (1), /* cost of movsx */
417 COSTS_N_INSNS (1), /* cost of movzx */
418 8, /* "large" insn */
419 6, /* MOVE_RATIO */
420 2, /* cost for loading QImode using movzbl */
421 {4, 4, 4}, /* cost of loading integer registers
422 in QImode, HImode and SImode.
423 Relative to reg-reg move (2). */
424 {2, 2, 2}, /* cost of storing integer registers */
425 2, /* cost of reg,reg fld/fst */
426 {2, 2, 6}, /* cost of loading fp registers
427 in SFmode, DFmode and XFmode */
428 {4, 4, 6}, /* cost of storing fp registers
429 in SFmode, DFmode and XFmode */
430 2, /* cost of moving MMX register */
431 {2, 2}, /* cost of loading MMX registers
432 in SImode and DImode */
433 {2, 2}, /* cost of storing MMX registers
434 in SImode and DImode */
435 2, /* cost of moving SSE register */
436 {2, 2, 8}, /* cost of loading SSE registers
437 in SImode, DImode and TImode */
438 {2, 2, 8}, /* cost of storing SSE registers
439 in SImode, DImode and TImode */
440 3, /* MMX or SSE register to integer */
441 8, /* size of l1 cache. */
442 256, /* size of l2 cache */
443 32, /* size of prefetch block */
444 6, /* number of parallel prefetches */
445 2, /* Branch cost */
446 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
447 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
448 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
449 COSTS_N_INSNS (2), /* cost of FABS instruction. */
450 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
451 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
452 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
453 pentiumpro_memcpy,
454 pentiumpro_memset,
455 1, /* scalar_stmt_cost. */
456 1, /* scalar_load_cost. */
457 1, /* scalar_store_cost. */
458 1, /* vec_stmt_cost. */
459 1, /* vec_to_scalar_cost. */
460 1, /* scalar_to_vec_cost. */
461 1, /* vec_align_load_cost. */
462 2, /* vec_unalign_load_cost. */
463 1, /* vec_store_cost. */
464 3, /* cond_taken_branch_cost. */
465 1, /* cond_not_taken_branch_cost. */
/* memcpy strategy table referenced by geode_cost.  The first element lists
   rep_prefix_4_byte with limit 256 followed by libcall with limit -1; the
   second is the DUMMY_STRINGOP_ALGS placeholder.  */
468 static stringop_algs geode_memcpy[2] = {
469 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
470 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by geode_cost.  The first element lists
   rep_prefix_4_byte with limit 256 followed by libcall with limit -1; the
   second is the DUMMY_STRINGOP_ALGS placeholder.  */
471 static stringop_algs geode_memset[2] = {
472 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
473 DUMMY_STRINGOP_ALGS};
/* Cost table for Geode.  Instruction costs are expressed with
   COSTS_N_INSNS.  The initializers are positional: the meaning of each
   value is given only by its trailing comment, so the order must not
   change.
   NOTE(review): the closing brace of this initializer is not present in
   this copy of the file.  */
474 static const
475 struct processor_costs geode_cost = {
476 COSTS_N_INSNS (1), /* cost of an add instruction */
477 COSTS_N_INSNS (1), /* cost of a lea instruction */
478 COSTS_N_INSNS (2), /* variable shift costs */
479 COSTS_N_INSNS (1), /* constant shift costs */
480 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
481 COSTS_N_INSNS (4), /* HI */
482 COSTS_N_INSNS (7), /* SI */
483 COSTS_N_INSNS (7), /* DI */
484 COSTS_N_INSNS (7)}, /* other */
485 0, /* cost of multiply per each bit set */
486 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
487 COSTS_N_INSNS (23), /* HI */
488 COSTS_N_INSNS (39), /* SI */
489 COSTS_N_INSNS (39), /* DI */
490 COSTS_N_INSNS (39)}, /* other */
491 COSTS_N_INSNS (1), /* cost of movsx */
492 COSTS_N_INSNS (1), /* cost of movzx */
493 8, /* "large" insn */
494 4, /* MOVE_RATIO */
495 1, /* cost for loading QImode using movzbl */
496 {1, 1, 1}, /* cost of loading integer registers
497 in QImode, HImode and SImode.
498 Relative to reg-reg move (2). */
499 {1, 1, 1}, /* cost of storing integer registers */
500 1, /* cost of reg,reg fld/fst */
501 {1, 1, 1}, /* cost of loading fp registers
502 in SFmode, DFmode and XFmode */
503 {4, 6, 6}, /* cost of storing fp registers
504 in SFmode, DFmode and XFmode */
506 2, /* cost of moving MMX register */
507 {2, 2}, /* cost of loading MMX registers
508 in SImode and DImode */
509 {2, 2}, /* cost of storing MMX registers
510 in SImode and DImode */
511 2, /* cost of moving SSE register */
512 {2, 2, 8}, /* cost of loading SSE registers
513 in SImode, DImode and TImode */
514 {2, 2, 8}, /* cost of storing SSE registers
515 in SImode, DImode and TImode */
516 3, /* MMX or SSE register to integer */
517 64, /* size of l1 cache. */
518 128, /* size of l2 cache. */
519 32, /* size of prefetch block */
520 1, /* number of parallel prefetches */
521 1, /* Branch cost */
522 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
523 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
524 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
525 COSTS_N_INSNS (1), /* cost of FABS instruction. */
526 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
527 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
528 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
529 geode_memcpy,
530 geode_memset,
531 1, /* scalar_stmt_cost. */
532 1, /* scalar_load_cost. */
533 1, /* scalar_store_cost. */
534 1, /* vec_stmt_cost. */
535 1, /* vec_to_scalar_cost. */
536 1, /* scalar_to_vec_cost. */
537 1, /* vec_align_load_cost. */
538 2, /* vec_unalign_load_cost. */
539 1, /* vec_store_cost. */
540 3, /* cond_taken_branch_cost. */
541 1, /* cond_not_taken_branch_cost. */
/* memcpy strategy table referenced by k6_cost.  The first element lists
   rep_prefix_4_byte with limit 256 followed by libcall with limit -1; the
   second is the DUMMY_STRINGOP_ALGS placeholder.  */
544 static stringop_algs k6_memcpy[2] = {
545 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
546 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by k6_cost.  The first element lists
   rep_prefix_4_byte with limit 256 followed by libcall with limit -1; the
   second is the DUMMY_STRINGOP_ALGS placeholder.  */
547 static stringop_algs k6_memset[2] = {
548 {libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
549 DUMMY_STRINGOP_ALGS};
/* Cost table for the K6.  Instruction costs are expressed with
   COSTS_N_INSNS.  The initializers are positional: the meaning of each
   value is given only by its trailing comment, so the order must not
   change.
   NOTE(review): the closing brace of this initializer is not present in
   this copy of the file.  */
550 static const
551 struct processor_costs k6_cost = {
552 COSTS_N_INSNS (1), /* cost of an add instruction */
553 COSTS_N_INSNS (2), /* cost of a lea instruction */
554 COSTS_N_INSNS (1), /* variable shift costs */
555 COSTS_N_INSNS (1), /* constant shift costs */
556 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
557 COSTS_N_INSNS (3), /* HI */
558 COSTS_N_INSNS (3), /* SI */
559 COSTS_N_INSNS (3), /* DI */
560 COSTS_N_INSNS (3)}, /* other */
561 0, /* cost of multiply per each bit set */
562 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
563 COSTS_N_INSNS (18), /* HI */
564 COSTS_N_INSNS (18), /* SI */
565 COSTS_N_INSNS (18), /* DI */
566 COSTS_N_INSNS (18)}, /* other */
567 COSTS_N_INSNS (2), /* cost of movsx */
568 COSTS_N_INSNS (2), /* cost of movzx */
569 8, /* "large" insn */
570 4, /* MOVE_RATIO */
571 3, /* cost for loading QImode using movzbl */
572 {4, 5, 4}, /* cost of loading integer registers
573 in QImode, HImode and SImode.
574 Relative to reg-reg move (2). */
575 {2, 3, 2}, /* cost of storing integer registers */
576 4, /* cost of reg,reg fld/fst */
577 {6, 6, 6}, /* cost of loading fp registers
578 in SFmode, DFmode and XFmode */
579 {4, 4, 4}, /* cost of storing fp registers
580 in SFmode, DFmode and XFmode */
581 2, /* cost of moving MMX register */
582 {2, 2}, /* cost of loading MMX registers
583 in SImode and DImode */
584 {2, 2}, /* cost of storing MMX registers
585 in SImode and DImode */
586 2, /* cost of moving SSE register */
587 {2, 2, 8}, /* cost of loading SSE registers
588 in SImode, DImode and TImode */
589 {2, 2, 8}, /* cost of storing SSE registers
590 in SImode, DImode and TImode */
591 6, /* MMX or SSE register to integer */
592 32, /* size of l1 cache. */
593 32, /* size of l2 cache. Some models
594 have integrated l2 cache, but
595 optimizing for k6 is not important
596 enough to worry about that. */
597 32, /* size of prefetch block */
598 1, /* number of parallel prefetches */
599 1, /* Branch cost */
600 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
601 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
602 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
603 COSTS_N_INSNS (2), /* cost of FABS instruction. */
604 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
605 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
606 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
607 k6_memcpy,
608 k6_memset,
609 1, /* scalar_stmt_cost. */
610 1, /* scalar_load_cost. */
611 1, /* scalar_store_cost. */
612 1, /* vec_stmt_cost. */
613 1, /* vec_to_scalar_cost. */
614 1, /* scalar_to_vec_cost. */
615 1, /* vec_align_load_cost. */
616 2, /* vec_unalign_load_cost. */
617 1, /* vec_store_cost. */
618 3, /* cond_taken_branch_cost. */
619 1, /* cond_not_taken_branch_cost. */
622 /* For some reason, Athlon deals better with REP prefix (relative to loops)
623 compared to K8. Alignment becomes important after 8 bytes for memcpy and
624 128 bytes for memset. */
/* memcpy strategy table referenced by athlon_cost.  The first element
   lists rep_prefix_4_byte with limit 2048 followed by libcall with limit
   -1; the second is the DUMMY_STRINGOP_ALGS placeholder.  */
625 static stringop_algs athlon_memcpy[2] = {
626 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
627 DUMMY_STRINGOP_ALGS};
/* memset strategy table referenced by athlon_cost.  The first element
   lists rep_prefix_4_byte with limit 2048 followed by libcall with limit
   -1; the second is the DUMMY_STRINGOP_ALGS placeholder.  */
628 static stringop_algs athlon_memset[2] = {
629 {libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
630 DUMMY_STRINGOP_ALGS};
/* Cost table for the Athlon.  Instruction costs are expressed with
   COSTS_N_INSNS.  The initializers are positional: the meaning of each
   value is given only by its trailing comment, so the order must not
   change.
   NOTE(review): the closing brace of this initializer is not present in
   this copy of the file.  */
631 static const
632 struct processor_costs athlon_cost = {
633 COSTS_N_INSNS (1), /* cost of an add instruction */
634 COSTS_N_INSNS (2), /* cost of a lea instruction */
635 COSTS_N_INSNS (1), /* variable shift costs */
636 COSTS_N_INSNS (1), /* constant shift costs */
637 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
638 COSTS_N_INSNS (5), /* HI */
639 COSTS_N_INSNS (5), /* SI */
640 COSTS_N_INSNS (5), /* DI */
641 COSTS_N_INSNS (5)}, /* other */
642 0, /* cost of multiply per each bit set */
643 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
644 COSTS_N_INSNS (26), /* HI */
645 COSTS_N_INSNS (42), /* SI */
646 COSTS_N_INSNS (74), /* DI */
647 COSTS_N_INSNS (74)}, /* other */
648 COSTS_N_INSNS (1), /* cost of movsx */
649 COSTS_N_INSNS (1), /* cost of movzx */
650 8, /* "large" insn */
651 9, /* MOVE_RATIO */
652 4, /* cost for loading QImode using movzbl */
653 {3, 4, 3}, /* cost of loading integer registers
654 in QImode, HImode and SImode.
655 Relative to reg-reg move (2). */
656 {3, 4, 3}, /* cost of storing integer registers */
657 4, /* cost of reg,reg fld/fst */
658 {4, 4, 12}, /* cost of loading fp registers
659 in SFmode, DFmode and XFmode */
660 {6, 6, 8}, /* cost of storing fp registers
661 in SFmode, DFmode and XFmode */
662 2, /* cost of moving MMX register */
663 {4, 4}, /* cost of loading MMX registers
664 in SImode and DImode */
665 {4, 4}, /* cost of storing MMX registers
666 in SImode and DImode */
667 2, /* cost of moving SSE register */
668 {4, 4, 6}, /* cost of loading SSE registers
669 in SImode, DImode and TImode */
670 {4, 4, 5}, /* cost of storing SSE registers
671 in SImode, DImode and TImode */
672 5, /* MMX or SSE register to integer */
673 64, /* size of l1 cache. */
674 256, /* size of l2 cache. */
675 64, /* size of prefetch block */
676 6, /* number of parallel prefetches */
677 5, /* Branch cost */
678 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
679 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
680 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
681 COSTS_N_INSNS (2), /* cost of FABS instruction. */
682 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
683 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
684 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
685 athlon_memcpy,
686 athlon_memset,
687 1, /* scalar_stmt_cost. */
688 1, /* scalar_load_cost. */
689 1, /* scalar_store_cost. */
690 1, /* vec_stmt_cost. */
691 1, /* vec_to_scalar_cost. */
692 1, /* scalar_to_vec_cost. */
693 1, /* vec_align_load_cost. */
694 2, /* vec_unalign_load_cost. */
695 1, /* vec_store_cost. */
696 3, /* cond_taken_branch_cost. */
697 1, /* cond_not_taken_branch_cost. */
700 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
701 small blocks it is better to use a loop. For large blocks, libcall can
702 do non-temporal accesses and beat inline code considerably. */
/* memcpy strategy table referenced by k8_cost.  Unlike the tables that use
   DUMMY_STRINGOP_ALGS, both elements are spelled out: the first ends in
   rep_prefix_4_byte, the second uses rep_prefix_8_byte up to limit 8192
   and libcall beyond.
   NOTE(review): the second element presumably is the 64-bit variant --
   confirm against the code that indexes this table.  */
703 static stringop_algs k8_memcpy[2] = {
704 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
705 {-1, rep_prefix_4_byte, false}}},
706 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
707 {-1, libcall, false}}}};
/* memset strategy table referenced by k8_cost.  Both elements are spelled
   out: the first uses rep_prefix_4_byte up to limit 2048, the second uses
   rep_prefix_8_byte up to limit 8192; both end with libcall at limit -1.
   NOTE(review): the second element presumably is the 64-bit variant --
   confirm against the code that indexes this table.  */
708 static stringop_algs k8_memset[2] = {
709 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
710 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
711 {libcall, {{48, unrolled_loop, false},
712 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
/* Cost table for the K8.  Instruction costs are expressed with
   COSTS_N_INSNS.  Unlike the earlier tables in this file, the trailing
   scalar/vector cost entries are not all 1 or 2.  The initializers are
   positional: the meaning of each value is given only by its trailing
   comment, so the order must not change.
   NOTE(review): the closing brace of this initializer is not present in
   this copy of the file.  */
713 static const
714 struct processor_costs k8_cost = {
715 COSTS_N_INSNS (1), /* cost of an add instruction */
716 COSTS_N_INSNS (2), /* cost of a lea instruction */
717 COSTS_N_INSNS (1), /* variable shift costs */
718 COSTS_N_INSNS (1), /* constant shift costs */
719 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
720 COSTS_N_INSNS (4), /* HI */
721 COSTS_N_INSNS (3), /* SI */
722 COSTS_N_INSNS (4), /* DI */
723 COSTS_N_INSNS (5)}, /* other */
724 0, /* cost of multiply per each bit set */
725 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
726 COSTS_N_INSNS (26), /* HI */
727 COSTS_N_INSNS (42), /* SI */
728 COSTS_N_INSNS (74), /* DI */
729 COSTS_N_INSNS (74)}, /* other */
730 COSTS_N_INSNS (1), /* cost of movsx */
731 COSTS_N_INSNS (1), /* cost of movzx */
732 8, /* "large" insn */
733 9, /* MOVE_RATIO */
734 4, /* cost for loading QImode using movzbl */
735 {3, 4, 3}, /* cost of loading integer registers
736 in QImode, HImode and SImode.
737 Relative to reg-reg move (2). */
738 {3, 4, 3}, /* cost of storing integer registers */
739 4, /* cost of reg,reg fld/fst */
740 {4, 4, 12}, /* cost of loading fp registers
741 in SFmode, DFmode and XFmode */
742 {6, 6, 8}, /* cost of storing fp registers
743 in SFmode, DFmode and XFmode */
744 2, /* cost of moving MMX register */
745 {3, 3}, /* cost of loading MMX registers
746 in SImode and DImode */
747 {4, 4}, /* cost of storing MMX registers
748 in SImode and DImode */
749 2, /* cost of moving SSE register */
750 {4, 3, 6}, /* cost of loading SSE registers
751 in SImode, DImode and TImode */
752 {4, 4, 5}, /* cost of storing SSE registers
753 in SImode, DImode and TImode */
754 5, /* MMX or SSE register to integer */
755 64, /* size of l1 cache. */
756 512, /* size of l2 cache. */
757 64, /* size of prefetch block */
758 /* New AMD processors never drop prefetches; if they cannot be performed
759 immediately, they are queued. We set number of simultaneous prefetches
760 to a large constant to reflect this (it probably is not a good idea not
761 to limit number of prefetches at all, as their execution also takes some
762 time). */
763 100, /* number of parallel prefetches */
764 3, /* Branch cost */
765 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
766 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
767 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
768 COSTS_N_INSNS (2), /* cost of FABS instruction. */
769 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
770 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
771 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
772 k8_memcpy,
773 k8_memset,
774 4, /* scalar_stmt_cost. */
775 2, /* scalar_load_cost. */
776 2, /* scalar_store_cost. */
777 5, /* vec_stmt_cost. */
778 0, /* vec_to_scalar_cost. */
779 2, /* scalar_to_vec_cost. */
780 2, /* vec_align_load_cost. */
781 3, /* vec_unalign_load_cost. */
782 3, /* vec_store_cost. */
783 3, /* cond_taken_branch_cost. */
784 2, /* cond_not_taken_branch_cost. */
787 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
788 very small blocks it is better to use a loop. For large blocks, libcall can
789 do non-temporal accesses and beat inline code considerably. */
/* memcpy strategy table for AMDFAM10.  Its entries are identical to those
   of k8_memcpy: the first element ends in rep_prefix_4_byte, the second
   uses rep_prefix_8_byte up to limit 8192 and libcall beyond.
   NOTE(review): the second element presumably is the 64-bit variant --
   confirm against the code that indexes this table.  */
790 static stringop_algs amdfam10_memcpy[2] = {
791 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
792 {-1, rep_prefix_4_byte, false}}},
793 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
794 {-1, libcall, false}}}};
/* memset strategy table for AMDFAM10.  Its entries are identical to those
   of k8_memset: the first element uses rep_prefix_4_byte up to limit 2048,
   the second uses rep_prefix_8_byte up to limit 8192; both end with
   libcall at limit -1.  */
795 static stringop_algs amdfam10_memset[2] = {
796 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
797 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
798 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
799 {-1, libcall, false}}}};
/* Cost table for amdfam10 tuning. The initializer is positional: the
comment after each value names the struct processor_costs field it fills,
so entries must not be reordered. COSTS_N_INSNS values are relative to an
add. */
800 struct processor_costs amdfam10_cost = {
801 COSTS_N_INSNS (1), /* cost of an add instruction */
802 COSTS_N_INSNS (2), /* cost of a lea instruction */
803 COSTS_N_INSNS (1), /* variable shift costs */
804 COSTS_N_INSNS (1), /* constant shift costs */
805 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
806 COSTS_N_INSNS (4), /* HI */
807 COSTS_N_INSNS (3), /* SI */
808 COSTS_N_INSNS (4), /* DI */
809 COSTS_N_INSNS (5)}, /* other */
810 0, /* cost of multiply per each bit set */
811 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
812 COSTS_N_INSNS (35), /* HI */
813 COSTS_N_INSNS (51), /* SI */
814 COSTS_N_INSNS (83), /* DI */
815 COSTS_N_INSNS (83)}, /* other */
816 COSTS_N_INSNS (1), /* cost of movsx */
817 COSTS_N_INSNS (1), /* cost of movzx */
818 8, /* "large" insn */
819 9, /* MOVE_RATIO */
820 4, /* cost for loading QImode using movzbl */
821 {3, 4, 3}, /* cost of loading integer registers
822 in QImode, HImode and SImode.
823 Relative to reg-reg move (2). */
824 {3, 4, 3}, /* cost of storing integer registers */
825 4, /* cost of reg,reg fld/fst */
826 {4, 4, 12}, /* cost of loading fp registers
827 in SFmode, DFmode and XFmode */
828 {6, 6, 8}, /* cost of storing fp registers
829 in SFmode, DFmode and XFmode */
830 2, /* cost of moving MMX register */
831 {3, 3}, /* cost of loading MMX registers
832 in SImode and DImode */
833 {4, 4}, /* cost of storing MMX registers
834 in SImode and DImode */
835 2, /* cost of moving SSE register */
836 {4, 4, 3}, /* cost of loading SSE registers
837 in SImode, DImode and TImode */
838 {4, 4, 5}, /* cost of storing SSE registers
839 in SImode, DImode and TImode */
840 3, /* MMX or SSE register to integer */
841 /* On K8:
842 MOVD reg64, xmmreg Double FSTORE 4
843 MOVD reg32, xmmreg Double FSTORE 4
844 On AMDFAM10:
845 MOVD reg64, xmmreg Double FADD 3
846 1/1 1/1
847 MOVD reg32, xmmreg Double FADD 3
848 1/1 1/1 */
849 64, /* size of l1 cache. */
850 512, /* size of l2 cache. */
851 64, /* size of prefetch block */
852 /* New AMD processors never drop prefetches; if they cannot be performed
853 immediately, they are queued. We set number of simultaneous prefetches
854 to a large constant to reflect this (it probably is not a good idea not
855 to limit number of prefetches at all, as their execution also takes some
856 time). */
857 100, /* number of parallel prefetches */
858 2, /* Branch cost */
859 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
860 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
861 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
862 COSTS_N_INSNS (2), /* cost of FABS instruction. */
863 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
864 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
865 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
866 amdfam10_memcpy,
867 amdfam10_memset,
868 4, /* scalar_stmt_cost. */
869 2, /* scalar load_cost. */
870 2, /* scalar_store_cost. */
871 6, /* vec_stmt_cost. */
872 0, /* vec_to_scalar_cost. */
873 2, /* scalar_to_vec_cost. */
874 2, /* vec_align_load_cost. */
875 2, /* vec_unalign_load_cost. */
876 2, /* vec_store_cost. */
877 2, /* cond_taken_branch_cost. */
878 1, /* cond_not_taken_branch_cost. */
881 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
882 very small blocks it is better to use a loop. For large blocks, libcall
883 can do non-temporal accesses and beat inline expansion considerably.
Layout per entry: a leading algorithm, then {size threshold, algorithm,
flag} steps ending with a -1 catch-all. NOTE(review): [0]/[1] presumably
are the 32-bit/64-bit variants (rep_prefix_4_byte vs rep_prefix_8_byte) --
confirm against the stringop_algs declaration. */
884 static stringop_algs bdver1_memcpy[2] = {
885 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
886 {-1, rep_prefix_4_byte, false}}},
887 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
888 {-1, libcall, false}}}};
/* memset expansion strategies for bdver1 tuning. Layout per entry: a
leading algorithm, then {size threshold, algorithm, flag} steps ending with
a -1 catch-all. NOTE(review): [0]/[1] presumably are the 32-bit/64-bit
variants -- confirm against the stringop_algs declaration. */
889 static stringop_algs bdver1_memset[2] = {
890 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
891 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
892 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
893 {-1, libcall, false}}}};
/* Cost table for bdver1 tuning. The initializer is positional: the comment
after each value names the struct processor_costs field it fills, so
entries must not be reordered. COSTS_N_INSNS values are relative to an
add. */
895 const struct processor_costs bdver1_cost = {
896 COSTS_N_INSNS (1), /* cost of an add instruction */
897 COSTS_N_INSNS (1), /* cost of a lea instruction */
898 COSTS_N_INSNS (1), /* variable shift costs */
899 COSTS_N_INSNS (1), /* constant shift costs */
900 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
901 COSTS_N_INSNS (4), /* HI */
902 COSTS_N_INSNS (4), /* SI */
903 COSTS_N_INSNS (6), /* DI */
904 COSTS_N_INSNS (6)}, /* other */
905 0, /* cost of multiply per each bit set */
906 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
907 COSTS_N_INSNS (35), /* HI */
908 COSTS_N_INSNS (51), /* SI */
909 COSTS_N_INSNS (83), /* DI */
910 COSTS_N_INSNS (83)}, /* other */
911 COSTS_N_INSNS (1), /* cost of movsx */
912 COSTS_N_INSNS (1), /* cost of movzx */
913 8, /* "large" insn */
914 9, /* MOVE_RATIO */
915 4, /* cost for loading QImode using movzbl */
916 {5, 5, 4}, /* cost of loading integer registers
917 in QImode, HImode and SImode.
918 Relative to reg-reg move (2). */
919 {4, 4, 4}, /* cost of storing integer registers */
920 2, /* cost of reg,reg fld/fst */
921 {5, 5, 12}, /* cost of loading fp registers
922 in SFmode, DFmode and XFmode */
923 {4, 4, 8}, /* cost of storing fp registers
924 in SFmode, DFmode and XFmode */
925 2, /* cost of moving MMX register */
926 {4, 4}, /* cost of loading MMX registers
927 in SImode and DImode */
928 {4, 4}, /* cost of storing MMX registers
929 in SImode and DImode */
930 2, /* cost of moving SSE register */
931 {4, 4, 4}, /* cost of loading SSE registers
932 in SImode, DImode and TImode */
933 {4, 4, 4}, /* cost of storing SSE registers
934 in SImode, DImode and TImode */
935 2, /* MMX or SSE register to integer */
936 /* On K8:
937 MOVD reg64, xmmreg Double FSTORE 4
938 MOVD reg32, xmmreg Double FSTORE 4
939 On AMDFAM10:
940 MOVD reg64, xmmreg Double FADD 3
941 1/1 1/1
942 MOVD reg32, xmmreg Double FADD 3
943 1/1 1/1 */
/* NOTE(review): the timing note above describes K8/AMDFAM10 only and does
not match the value 2 used here; presumably inherited from the amdfam10
table -- confirm its relevance for BDVER1. */
944 16, /* size of l1 cache. */
945 2048, /* size of l2 cache. */
946 64, /* size of prefetch block */
947 /* New AMD processors never drop prefetches; if they cannot be performed
948 immediately, they are queued. We set number of simultaneous prefetches
949 to a large constant to reflect this (it probably is not a good idea not
950 to limit number of prefetches at all, as their execution also takes some
951 time). */
952 100, /* number of parallel prefetches */
953 2, /* Branch cost */
954 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
955 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
956 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
957 COSTS_N_INSNS (2), /* cost of FABS instruction. */
958 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
959 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
960 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
961 bdver1_memcpy,
962 bdver1_memset,
963 6, /* scalar_stmt_cost. */
964 4, /* scalar load_cost. */
965 4, /* scalar_store_cost. */
966 6, /* vec_stmt_cost. */
967 0, /* vec_to_scalar_cost. */
968 2, /* scalar_to_vec_cost. */
969 4, /* vec_align_load_cost. */
970 4, /* vec_unalign_load_cost. */
971 4, /* vec_store_cost. */
972 4, /* cond_taken_branch_cost. */
973 2, /* cond_not_taken_branch_cost. */
976 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
977 very small blocks it is better to use a loop. For large blocks, libcall
978 can do non-temporal accesses and beat inline expansion considerably.
Layout per entry: a leading algorithm, then {size threshold, algorithm,
flag} steps ending with a -1 catch-all. NOTE(review): [0]/[1] presumably
are the 32-bit/64-bit variants (rep_prefix_4_byte vs rep_prefix_8_byte) --
confirm against the stringop_algs declaration. */
980 static stringop_algs bdver2_memcpy[2] = {
981 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
982 {-1, rep_prefix_4_byte, false}}},
983 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
984 {-1, libcall, false}}}};
/* memset expansion strategies for bdver2 tuning. Layout per entry: a
leading algorithm, then {size threshold, algorithm, flag} steps ending with
a -1 catch-all. NOTE(review): [0]/[1] presumably are the 32-bit/64-bit
variants -- confirm against the stringop_algs declaration. */
985 static stringop_algs bdver2_memset[2] = {
986 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
987 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
988 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
989 {-1, libcall, false}}}};
/* Cost table for bdver2 tuning. The initializer is positional: the comment
after each value names the struct processor_costs field it fills, so
entries must not be reordered. COSTS_N_INSNS values are relative to an
add. */
991 const struct processor_costs bdver2_cost = {
992 COSTS_N_INSNS (1), /* cost of an add instruction */
993 COSTS_N_INSNS (1), /* cost of a lea instruction */
994 COSTS_N_INSNS (1), /* variable shift costs */
995 COSTS_N_INSNS (1), /* constant shift costs */
996 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
997 COSTS_N_INSNS (4), /* HI */
998 COSTS_N_INSNS (4), /* SI */
999 COSTS_N_INSNS (6), /* DI */
1000 COSTS_N_INSNS (6)}, /* other */
1001 0, /* cost of multiply per each bit set */
1002 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1003 COSTS_N_INSNS (35), /* HI */
1004 COSTS_N_INSNS (51), /* SI */
1005 COSTS_N_INSNS (83), /* DI */
1006 COSTS_N_INSNS (83)}, /* other */
1007 COSTS_N_INSNS (1), /* cost of movsx */
1008 COSTS_N_INSNS (1), /* cost of movzx */
1009 8, /* "large" insn */
1010 9, /* MOVE_RATIO */
1011 4, /* cost for loading QImode using movzbl */
1012 {5, 5, 4}, /* cost of loading integer registers
1013 in QImode, HImode and SImode.
1014 Relative to reg-reg move (2). */
1015 {4, 4, 4}, /* cost of storing integer registers */
1016 2, /* cost of reg,reg fld/fst */
1017 {5, 5, 12}, /* cost of loading fp registers
1018 in SFmode, DFmode and XFmode */
1019 {4, 4, 8}, /* cost of storing fp registers
1020 in SFmode, DFmode and XFmode */
1021 2, /* cost of moving MMX register */
1022 {4, 4}, /* cost of loading MMX registers
1023 in SImode and DImode */
1024 {4, 4}, /* cost of storing MMX registers
1025 in SImode and DImode */
1026 2, /* cost of moving SSE register */
1027 {4, 4, 4}, /* cost of loading SSE registers
1028 in SImode, DImode and TImode */
1029 {4, 4, 4}, /* cost of storing SSE registers
1030 in SImode, DImode and TImode */
1031 2, /* MMX or SSE register to integer */
1032 /* On K8:
1033 MOVD reg64, xmmreg Double FSTORE 4
1034 MOVD reg32, xmmreg Double FSTORE 4
1035 On AMDFAM10:
1036 MOVD reg64, xmmreg Double FADD 3
1037 1/1 1/1
1038 MOVD reg32, xmmreg Double FADD 3
1039 1/1 1/1 */
/* NOTE(review): the timing note above describes K8/AMDFAM10 only and does
not match the value 2 used here; presumably inherited from the amdfam10
table -- confirm its relevance for BDVER2. */
1040 16, /* size of l1 cache. */
1041 2048, /* size of l2 cache. */
1042 64, /* size of prefetch block */
1043 /* New AMD processors never drop prefetches; if they cannot be performed
1044 immediately, they are queued. We set number of simultaneous prefetches
1045 to a large constant to reflect this (it probably is not a good idea not
1046 to limit number of prefetches at all, as their execution also takes some
1047 time). */
1048 100, /* number of parallel prefetches */
1049 2, /* Branch cost */
1050 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1051 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1052 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1053 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1054 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1055 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1056 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1057 bdver2_memcpy,
1058 bdver2_memset,
1059 6, /* scalar_stmt_cost. */
1060 4, /* scalar load_cost. */
1061 4, /* scalar_store_cost. */
1062 6, /* vec_stmt_cost. */
1063 0, /* vec_to_scalar_cost. */
1064 2, /* scalar_to_vec_cost. */
1065 4, /* vec_align_load_cost. */
1066 4, /* vec_unalign_load_cost. */
1067 4, /* vec_store_cost. */
1068 4, /* cond_taken_branch_cost. */
1069 2, /* cond_not_taken_branch_cost. */
1073 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1074 very small blocks it is better to use a loop. For large blocks, libcall
1075 can do non-temporal accesses and beat inline expansion considerably.
Layout per entry: a leading algorithm, then {size threshold, algorithm,
flag} steps ending with a -1 catch-all. NOTE(review): [0]/[1] presumably
are the 32-bit/64-bit variants (rep_prefix_4_byte vs rep_prefix_8_byte) --
confirm against the stringop_algs declaration. */
1076 static stringop_algs bdver3_memcpy[2] = {
1077 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1078 {-1, rep_prefix_4_byte, false}}},
1079 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1080 {-1, libcall, false}}}};
/* memset expansion strategies for bdver3 tuning. Layout per entry: a
leading algorithm, then {size threshold, algorithm, flag} steps ending with
a -1 catch-all. NOTE(review): [0]/[1] presumably are the 32-bit/64-bit
variants -- confirm against the stringop_algs declaration. */
1081 static stringop_algs bdver3_memset[2] = {
1082 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1083 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1084 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1085 {-1, libcall, false}}}};
/* Cost table for bdver3 tuning. The initializer is positional: the comment
after each value names the struct processor_costs field it fills, so
entries must not be reordered. COSTS_N_INSNS values are relative to an
add. */
1086 struct processor_costs bdver3_cost = {
1087 COSTS_N_INSNS (1), /* cost of an add instruction */
1088 COSTS_N_INSNS (1), /* cost of a lea instruction */
1089 COSTS_N_INSNS (1), /* variable shift costs */
1090 COSTS_N_INSNS (1), /* constant shift costs */
1091 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1092 COSTS_N_INSNS (4), /* HI */
1093 COSTS_N_INSNS (4), /* SI */
1094 COSTS_N_INSNS (6), /* DI */
1095 COSTS_N_INSNS (6)}, /* other */
1096 0, /* cost of multiply per each bit set */
1097 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1098 COSTS_N_INSNS (35), /* HI */
1099 COSTS_N_INSNS (51), /* SI */
1100 COSTS_N_INSNS (83), /* DI */
1101 COSTS_N_INSNS (83)}, /* other */
1102 COSTS_N_INSNS (1), /* cost of movsx */
1103 COSTS_N_INSNS (1), /* cost of movzx */
1104 8, /* "large" insn */
1105 9, /* MOVE_RATIO */
1106 4, /* cost for loading QImode using movzbl */
1107 {5, 5, 4}, /* cost of loading integer registers
1108 in QImode, HImode and SImode.
1109 Relative to reg-reg move (2). */
1110 {4, 4, 4}, /* cost of storing integer registers */
1111 2, /* cost of reg,reg fld/fst */
1112 {5, 5, 12}, /* cost of loading fp registers
1113 in SFmode, DFmode and XFmode */
1114 {4, 4, 8}, /* cost of storing fp registers
1115 in SFmode, DFmode and XFmode */
1116 2, /* cost of moving MMX register */
1117 {4, 4}, /* cost of loading MMX registers
1118 in SImode and DImode */
1119 {4, 4}, /* cost of storing MMX registers
1120 in SImode and DImode */
1121 2, /* cost of moving SSE register */
1122 {4, 4, 4}, /* cost of loading SSE registers
1123 in SImode, DImode and TImode */
1124 {4, 4, 4}, /* cost of storing SSE registers
1125 in SImode, DImode and TImode */
1126 2, /* MMX or SSE register to integer */
1127 16, /* size of l1 cache. */
1128 2048, /* size of l2 cache. */
1129 64, /* size of prefetch block */
1130 /* New AMD processors never drop prefetches; if they cannot be performed
1131 immediately, they are queued. We set number of simultaneous prefetches
1132 to a large constant to reflect this (it probably is not a good idea not
1133 to limit number of prefetches at all, as their execution also takes some
1134 time). */
1135 100, /* number of parallel prefetches */
1136 2, /* Branch cost */
1137 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1138 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1139 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1140 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1141 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1142 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1143 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1144 bdver3_memcpy,
1145 bdver3_memset,
1146 6, /* scalar_stmt_cost. */
1147 4, /* scalar load_cost. */
1148 4, /* scalar_store_cost. */
1149 6, /* vec_stmt_cost. */
1150 0, /* vec_to_scalar_cost. */
1151 2, /* scalar_to_vec_cost. */
1152 4, /* vec_align_load_cost. */
1153 4, /* vec_unalign_load_cost. */
1154 4, /* vec_store_cost. */
1155 4, /* cond_taken_branch_cost. */
1156 2, /* cond_not_taken_branch_cost. */
1159 /* BDVER4 has an optimized REP instruction for medium-sized blocks, but for
1160 very small blocks it is better to use a loop. For large blocks, libcall
1161 can do non-temporal accesses and beat inline expansion considerably.
Layout per entry: a leading algorithm, then {size threshold, algorithm,
flag} steps ending with a -1 catch-all. NOTE(review): [0]/[1] presumably
are the 32-bit/64-bit variants (rep_prefix_4_byte vs rep_prefix_8_byte) --
confirm against the stringop_algs declaration. */
1162 static stringop_algs bdver4_memcpy[2] = {
1163 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1164 {-1, rep_prefix_4_byte, false}}},
1165 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1166 {-1, libcall, false}}}};
/* memset expansion strategies for bdver4 tuning. Layout per entry: a
leading algorithm, then {size threshold, algorithm, flag} steps ending with
a -1 catch-all. NOTE(review): [0]/[1] presumably are the 32-bit/64-bit
variants -- confirm against the stringop_algs declaration. */
1167 static stringop_algs bdver4_memset[2] = {
1168 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1169 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1170 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1171 {-1, libcall, false}}}};
/* Cost table for bdver4 tuning. The initializer is positional: the comment
after each value names the struct processor_costs field it fills, so
entries must not be reordered. COSTS_N_INSNS values are relative to an
add. */
1172 struct processor_costs bdver4_cost = {
1173 COSTS_N_INSNS (1), /* cost of an add instruction */
1174 COSTS_N_INSNS (1), /* cost of a lea instruction */
1175 COSTS_N_INSNS (1), /* variable shift costs */
1176 COSTS_N_INSNS (1), /* constant shift costs */
1177 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1178 COSTS_N_INSNS (4), /* HI */
1179 COSTS_N_INSNS (4), /* SI */
1180 COSTS_N_INSNS (6), /* DI */
1181 COSTS_N_INSNS (6)}, /* other */
1182 0, /* cost of multiply per each bit set */
1183 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1184 COSTS_N_INSNS (35), /* HI */
1185 COSTS_N_INSNS (51), /* SI */
1186 COSTS_N_INSNS (83), /* DI */
1187 COSTS_N_INSNS (83)}, /* other */
1188 COSTS_N_INSNS (1), /* cost of movsx */
1189 COSTS_N_INSNS (1), /* cost of movzx */
1190 8, /* "large" insn */
1191 9, /* MOVE_RATIO */
1192 4, /* cost for loading QImode using movzbl */
1193 {5, 5, 4}, /* cost of loading integer registers
1194 in QImode, HImode and SImode.
1195 Relative to reg-reg move (2). */
1196 {4, 4, 4}, /* cost of storing integer registers */
1197 2, /* cost of reg,reg fld/fst */
1198 {5, 5, 12}, /* cost of loading fp registers
1199 in SFmode, DFmode and XFmode */
1200 {4, 4, 8}, /* cost of storing fp registers
1201 in SFmode, DFmode and XFmode */
1202 2, /* cost of moving MMX register */
1203 {4, 4}, /* cost of loading MMX registers
1204 in SImode and DImode */
1205 {4, 4}, /* cost of storing MMX registers
1206 in SImode and DImode */
1207 2, /* cost of moving SSE register */
1208 {4, 4, 4}, /* cost of loading SSE registers
1209 in SImode, DImode and TImode */
1210 {4, 4, 4}, /* cost of storing SSE registers
1211 in SImode, DImode and TImode */
1212 2, /* MMX or SSE register to integer */
1213 16, /* size of l1 cache. */
1214 2048, /* size of l2 cache. */
1215 64, /* size of prefetch block */
1216 /* New AMD processors never drop prefetches; if they cannot be performed
1217 immediately, they are queued. We set number of simultaneous prefetches
1218 to a large constant to reflect this (it probably is not a good idea not
1219 to limit number of prefetches at all, as their execution also takes some
1220 time). */
1221 100, /* number of parallel prefetches */
1222 2, /* Branch cost */
1223 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1224 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1225 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1226 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1227 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1228 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1229 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1230 bdver4_memcpy,
1231 bdver4_memset,
1232 6, /* scalar_stmt_cost. */
1233 4, /* scalar load_cost. */
1234 4, /* scalar_store_cost. */
1235 6, /* vec_stmt_cost. */
1236 0, /* vec_to_scalar_cost. */
1237 2, /* scalar_to_vec_cost. */
1238 4, /* vec_align_load_cost. */
1239 4, /* vec_unalign_load_cost. */
1240 4, /* vec_store_cost. */
1241 4, /* cond_taken_branch_cost. */
1242 2, /* cond_not_taken_branch_cost. */
1246 /* ZNVER1 has an optimized REP instruction for medium-sized blocks, but for
1247 very small blocks it is better to use a loop. For large blocks, libcall
1248 can do non-temporal accesses and beat inline expansion considerably.
Layout per entry: a leading algorithm, then {size threshold, algorithm,
flag} steps ending with a -1 catch-all. NOTE(review): [0]/[1] presumably
are the 32-bit/64-bit variants (rep_prefix_4_byte vs rep_prefix_8_byte) --
confirm against the stringop_algs declaration. The thresholds are
identical to bdver4_memcpy; whether they were re-measured for Zen cannot
be told from this file. */
1249 static stringop_algs znver1_memcpy[2] = {
1250 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1251 {-1, rep_prefix_4_byte, false}}},
1252 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1253 {-1, libcall, false}}}};
/* memset expansion strategies for znver1 tuning. Layout per entry: a
leading algorithm, then {size threshold, algorithm, flag} steps ending with
a -1 catch-all. NOTE(review): [0]/[1] presumably are the 32-bit/64-bit
variants -- confirm against the stringop_algs declaration. */
1254 static stringop_algs znver1_memset[2] = {
1255 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1256 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1257 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1258 {-1, libcall, false}}}};
/* Cost table for znver1 tuning. The initializer is positional: the comment
after each value names the struct processor_costs field it fills, so
entries must not be reordered. COSTS_N_INSNS values are relative to an
add. */
1259 struct processor_costs znver1_cost = {
1260 COSTS_N_INSNS (1), /* cost of an add instruction. */
1261 COSTS_N_INSNS (1), /* cost of a lea instruction. */
1262 COSTS_N_INSNS (1), /* variable shift costs. */
1263 COSTS_N_INSNS (1), /* constant shift costs. */
1264 {COSTS_N_INSNS (3), /* cost of starting multiply for QI. */
1265 COSTS_N_INSNS (3), /* HI. */
1266 COSTS_N_INSNS (3), /* SI. */
1267 COSTS_N_INSNS (4), /* DI. */
1268 COSTS_N_INSNS (4)}, /* other. */
1269 0, /* cost of multiply per each bit
1270 set. */
1271 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI. */
1272 COSTS_N_INSNS (35), /* HI. */
1273 COSTS_N_INSNS (51), /* SI. */
1274 COSTS_N_INSNS (83), /* DI. */
1275 COSTS_N_INSNS (83)}, /* other. */
1276 COSTS_N_INSNS (1), /* cost of movsx. */
1277 COSTS_N_INSNS (1), /* cost of movzx. */
1278 8, /* "large" insn. */
1279 9, /* MOVE_RATIO. */
1280 4, /* cost for loading QImode using
1281 movzbl. */
1282 {5, 5, 4}, /* cost of loading integer registers
1283 in QImode, HImode and SImode.
1284 Relative to reg-reg move (2). */
1285 {4, 4, 4}, /* cost of storing integer
1286 registers. */
1287 2, /* cost of reg,reg fld/fst. */
1288 {5, 5, 12}, /* cost of loading fp registers
1289 in SFmode, DFmode and XFmode. */
1290 {4, 4, 8}, /* cost of storing fp registers
1291 in SFmode, DFmode and XFmode. */
1292 2, /* cost of moving MMX register. */
1293 {4, 4}, /* cost of loading MMX registers
1294 in SImode and DImode. */
1295 {4, 4}, /* cost of storing MMX registers
1296 in SImode and DImode. */
1297 2, /* cost of moving SSE register. */
1298 {4, 4, 4}, /* cost of loading SSE registers
1299 in SImode, DImode and TImode. */
1300 {4, 4, 4}, /* cost of storing SSE registers
1301 in SImode, DImode and TImode. */
1302 2, /* MMX or SSE register to integer. */
1303 32, /* size of l1 cache. */
1304 512, /* size of l2 cache. */
1305 64, /* size of prefetch block. */
1306 /* New AMD processors never drop prefetches; if they cannot be performed
1307 immediately, they are queued. We set number of simultaneous prefetches
1308 to a large constant to reflect this (it probably is not a good idea not
1309 to limit number of prefetches at all, as their execution also takes some
1310 time). */
1311 100, /* number of parallel prefetches. */
1312 3, /* Branch cost. */
1313 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1319 /* Zen can execute 4 integer operations per cycle. FP operations take 3 cycles
1320 and it can execute 2 integer additions and 2 multiplications, thus
1321 reassociation may make sense up to a width of 6. SPEC2k6 benchmarks suggest
1322 that 4 works better than 6, probably due to register pressure.
1324 Integer vector operations are handled by the FP unit, which executes 3 vector
1325 plus/minus operations per cycle but only one multiply. This is adjusted
1326 in ix86_reassociation_width. */
1327 4, 4, 3, 6, /* reassoc int, fp, vec_int, vec_fp. */
1328 znver1_memcpy,
1329 znver1_memset,
1330 6, /* scalar_stmt_cost. */
1331 4, /* scalar load_cost. */
1332 4, /* scalar_store_cost. */
1333 6, /* vec_stmt_cost. */
1334 0, /* vec_to_scalar_cost. */
1335 2, /* scalar_to_vec_cost. */
1336 4, /* vec_align_load_cost. */
1337 4, /* vec_unalign_load_cost. */
1338 4, /* vec_store_cost. */
1339 4, /* cond_taken_branch_cost. */
1340 2, /* cond_not_taken_branch_cost. */
1343 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1344 very small blocks it is better to use a loop. For large blocks, libcall can
1345 do non-temporal accesses and beat inline expansion considerably.
Layout per entry: a leading algorithm, then {size threshold, algorithm,
flag} steps ending with a -1 catch-all. NOTE(review): [0]/[1] presumably
are the 32-bit/64-bit variants (rep_prefix_4_byte vs rep_prefix_8_byte) --
confirm against the stringop_algs declaration. */
1346 static stringop_algs btver1_memcpy[2] = {
1347 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1348 {-1, rep_prefix_4_byte, false}}},
1349 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1350 {-1, libcall, false}}}};
/* memset expansion strategies for btver1 tuning. Layout per entry: a
leading algorithm, then {size threshold, algorithm, flag} steps ending with
a -1 catch-all. NOTE(review): [0]/[1] presumably are the 32-bit/64-bit
variants -- confirm against the stringop_algs declaration. */
1351 static stringop_algs btver1_memset[2] = {
1352 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1353 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1354 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1355 {-1, libcall, false}}}};
/* Cost table for btver1 tuning. The initializer is positional: the comment
after each value names the struct processor_costs field it fills, so
entries must not be reordered. COSTS_N_INSNS values are relative to an
add. */
1356 const struct processor_costs btver1_cost = {
1357 COSTS_N_INSNS (1), /* cost of an add instruction */
1358 COSTS_N_INSNS (2), /* cost of a lea instruction */
1359 COSTS_N_INSNS (1), /* variable shift costs */
1360 COSTS_N_INSNS (1), /* constant shift costs */
1361 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1362 COSTS_N_INSNS (4), /* HI */
1363 COSTS_N_INSNS (3), /* SI */
1364 COSTS_N_INSNS (4), /* DI */
1365 COSTS_N_INSNS (5)}, /* other */
1366 0, /* cost of multiply per each bit set */
1367 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1368 COSTS_N_INSNS (35), /* HI */
1369 COSTS_N_INSNS (51), /* SI */
1370 COSTS_N_INSNS (83), /* DI */
1371 COSTS_N_INSNS (83)}, /* other */
1372 COSTS_N_INSNS (1), /* cost of movsx */
1373 COSTS_N_INSNS (1), /* cost of movzx */
1374 8, /* "large" insn */
1375 9, /* MOVE_RATIO */
1376 4, /* cost for loading QImode using movzbl */
1377 {3, 4, 3}, /* cost of loading integer registers
1378 in QImode, HImode and SImode.
1379 Relative to reg-reg move (2). */
1380 {3, 4, 3}, /* cost of storing integer registers */
1381 4, /* cost of reg,reg fld/fst */
1382 {4, 4, 12}, /* cost of loading fp registers
1383 in SFmode, DFmode and XFmode */
1384 {6, 6, 8}, /* cost of storing fp registers
1385 in SFmode, DFmode and XFmode */
1386 2, /* cost of moving MMX register */
1387 {3, 3}, /* cost of loading MMX registers
1388 in SImode and DImode */
1389 {4, 4}, /* cost of storing MMX registers
1390 in SImode and DImode */
1391 2, /* cost of moving SSE register */
1392 {4, 4, 3}, /* cost of loading SSE registers
1393 in SImode, DImode and TImode */
1394 {4, 4, 5}, /* cost of storing SSE registers
1395 in SImode, DImode and TImode */
1396 3, /* MMX or SSE register to integer */
1397 /* On K8:
1398 MOVD reg64, xmmreg Double FSTORE 4
1399 MOVD reg32, xmmreg Double FSTORE 4
1400 On AMDFAM10:
1401 MOVD reg64, xmmreg Double FADD 3
1402 1/1 1/1
1403 MOVD reg32, xmmreg Double FADD 3
1404 1/1 1/1 */
/* NOTE(review): the timing note above describes K8/AMDFAM10 only;
presumably inherited from the amdfam10 table -- confirm its relevance for
BTVER1. */
1405 32, /* size of l1 cache. */
1406 512, /* size of l2 cache. */
1407 64, /* size of prefetch block */
1408 100, /* number of parallel prefetches */
1409 2, /* Branch cost */
1410 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1411 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1412 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1413 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1414 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1415 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1416 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1417 btver1_memcpy,
1418 btver1_memset,
1419 4, /* scalar_stmt_cost. */
1420 2, /* scalar load_cost. */
1421 2, /* scalar_store_cost. */
1422 6, /* vec_stmt_cost. */
1423 0, /* vec_to_scalar_cost. */
1424 2, /* scalar_to_vec_cost. */
1425 2, /* vec_align_load_cost. */
1426 2, /* vec_unalign_load_cost. */
1427 2, /* vec_store_cost. */
1428 2, /* cond_taken_branch_cost. */
1429 1, /* cond_not_taken_branch_cost. */
/* memcpy expansion strategies for btver2 tuning. Layout per entry: a
leading algorithm, then {size threshold, algorithm, flag} steps ending with
a -1 catch-all. NOTE(review): [0]/[1] presumably are the 32-bit/64-bit
variants (rep_prefix_4_byte vs rep_prefix_8_byte) -- confirm against the
stringop_algs declaration. */
1432 static stringop_algs btver2_memcpy[2] = {
1433 {libcall, {{6, loop, false}, {14, unrolled_loop, false},
1434 {-1, rep_prefix_4_byte, false}}},
1435 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1436 {-1, libcall, false}}}};
/* memset expansion strategies for btver2 tuning. Layout per entry: a
leading algorithm, then {size threshold, algorithm, flag} steps ending with
a -1 catch-all. NOTE(review): [0]/[1] presumably are the 32-bit/64-bit
variants -- confirm against the stringop_algs declaration. */
1437 static stringop_algs btver2_memset[2] = {
1438 {libcall, {{8, loop, false}, {24, unrolled_loop, false},
1439 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1440 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1441 {-1, libcall, false}}}};
/* Cost table for btver2 tuning. The initializer is positional: the comment
after each value names the struct processor_costs field it fills, so
entries must not be reordered. COSTS_N_INSNS values are relative to an
add. */
1442 const struct processor_costs btver2_cost = {
1443 COSTS_N_INSNS (1), /* cost of an add instruction */
1444 COSTS_N_INSNS (2), /* cost of a lea instruction */
1445 COSTS_N_INSNS (1), /* variable shift costs */
1446 COSTS_N_INSNS (1), /* constant shift costs */
1447 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1448 COSTS_N_INSNS (4), /* HI */
1449 COSTS_N_INSNS (3), /* SI */
1450 COSTS_N_INSNS (4), /* DI */
1451 COSTS_N_INSNS (5)}, /* other */
1452 0, /* cost of multiply per each bit set */
1453 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1454 COSTS_N_INSNS (35), /* HI */
1455 COSTS_N_INSNS (51), /* SI */
1456 COSTS_N_INSNS (83), /* DI */
1457 COSTS_N_INSNS (83)}, /* other */
1458 COSTS_N_INSNS (1), /* cost of movsx */
1459 COSTS_N_INSNS (1), /* cost of movzx */
1460 8, /* "large" insn */
1461 9, /* MOVE_RATIO */
1462 4, /* cost for loading QImode using movzbl */
1463 {3, 4, 3}, /* cost of loading integer registers
1464 in QImode, HImode and SImode.
1465 Relative to reg-reg move (2). */
1466 {3, 4, 3}, /* cost of storing integer registers */
1467 4, /* cost of reg,reg fld/fst */
1468 {4, 4, 12}, /* cost of loading fp registers
1469 in SFmode, DFmode and XFmode */
1470 {6, 6, 8}, /* cost of storing fp registers
1471 in SFmode, DFmode and XFmode */
1472 2, /* cost of moving MMX register */
1473 {3, 3}, /* cost of loading MMX registers
1474 in SImode and DImode */
1475 {4, 4}, /* cost of storing MMX registers
1476 in SImode and DImode */
1477 2, /* cost of moving SSE register */
1478 {4, 4, 3}, /* cost of loading SSE registers
1479 in SImode, DImode and TImode */
1480 {4, 4, 5}, /* cost of storing SSE registers
1481 in SImode, DImode and TImode */
1482 3, /* MMX or SSE register to integer */
1483 /* On K8:
1484 MOVD reg64, xmmreg Double FSTORE 4
1485 MOVD reg32, xmmreg Double FSTORE 4
1486 On AMDFAM10:
1487 MOVD reg64, xmmreg Double FADD 3
1488 1/1 1/1
1489 MOVD reg32, xmmreg Double FADD 3
1490 1/1 1/1 */
/* NOTE(review): the timing note above describes K8/AMDFAM10 only;
presumably inherited from the amdfam10 table -- confirm its relevance for
BTVER2. */
1491 32, /* size of l1 cache. */
1492 2048, /* size of l2 cache. */
1493 64, /* size of prefetch block */
1494 100, /* number of parallel prefetches */
1495 2, /* Branch cost */
1496 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1497 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1498 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1499 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1500 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1501 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1502 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1503 btver2_memcpy,
1504 btver2_memset,
1505 4, /* scalar_stmt_cost. */
1506 2, /* scalar load_cost. */
1507 2, /* scalar_store_cost. */
1508 6, /* vec_stmt_cost. */
1509 0, /* vec_to_scalar_cost. */
1510 2, /* scalar_to_vec_cost. */
1511 2, /* vec_align_load_cost. */
1512 2, /* vec_unalign_load_cost. */
1513 2, /* vec_store_cost. */
1514 2, /* cond_taken_branch_cost. */
1515 1, /* cond_not_taken_branch_cost. */
/* memcpy expansion strategies for pentium4 tuning. Entry [0] lists a
leading algorithm and {size threshold, algorithm, flag} steps ending with a
-1 catch-all; entry [1] is DUMMY_STRINGOP_ALGS, i.e. libcall for every
size. NOTE(review): [0]/[1] presumably are the 32-bit/64-bit variants --
confirm against the stringop_algs declaration. */
1518 static stringop_algs pentium4_memcpy[2] = {
1519 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1520 DUMMY_STRINGOP_ALGS};
/* memset expansion strategies for pentium4 tuning. Entry [0] lists a
leading algorithm and {size threshold, algorithm, flag} steps ending with a
-1 catch-all; entry [1] is DUMMY_STRINGOP_ALGS, i.e. libcall for every
size. NOTE(review): [0]/[1] presumably are the 32-bit/64-bit variants --
confirm against the stringop_algs declaration. */
1521 static stringop_algs pentium4_memset[2] = {
1522 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1523 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1524 DUMMY_STRINGOP_ALGS};
/* Cost table for pentium4 tuning. The initializer is positional: the
comment after each value names the struct processor_costs field it fills,
so entries must not be reordered. COSTS_N_INSNS values are relative to an
add. */
1526 static const
1527 struct processor_costs pentium4_cost = {
1528 COSTS_N_INSNS (1), /* cost of an add instruction */
1529 COSTS_N_INSNS (3), /* cost of a lea instruction */
1530 COSTS_N_INSNS (4), /* variable shift costs */
1531 COSTS_N_INSNS (4), /* constant shift costs */
1532 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1533 COSTS_N_INSNS (15), /* HI */
1534 COSTS_N_INSNS (15), /* SI */
1535 COSTS_N_INSNS (15), /* DI */
1536 COSTS_N_INSNS (15)}, /* other */
1537 0, /* cost of multiply per each bit set */
1538 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1539 COSTS_N_INSNS (56), /* HI */
1540 COSTS_N_INSNS (56), /* SI */
1541 COSTS_N_INSNS (56), /* DI */
1542 COSTS_N_INSNS (56)}, /* other */
1543 COSTS_N_INSNS (1), /* cost of movsx */
1544 COSTS_N_INSNS (1), /* cost of movzx */
1545 16, /* "large" insn */
1546 6, /* MOVE_RATIO */
1547 2, /* cost for loading QImode using movzbl */
1548 {4, 5, 4}, /* cost of loading integer registers
1549 in QImode, HImode and SImode.
1550 Relative to reg-reg move (2). */
1551 {2, 3, 2}, /* cost of storing integer registers */
1552 2, /* cost of reg,reg fld/fst */
1553 {2, 2, 6}, /* cost of loading fp registers
1554 in SFmode, DFmode and XFmode */
1555 {4, 4, 6}, /* cost of storing fp registers
1556 in SFmode, DFmode and XFmode */
1557 2, /* cost of moving MMX register */
1558 {2, 2}, /* cost of loading MMX registers
1559 in SImode and DImode */
1560 {2, 2}, /* cost of storing MMX registers
1561 in SImode and DImode */
1562 12, /* cost of moving SSE register */
1563 {12, 12, 12}, /* cost of loading SSE registers
1564 in SImode, DImode and TImode */
1565 {2, 2, 8}, /* cost of storing SSE registers
1566 in SImode, DImode and TImode */
1567 10, /* MMX or SSE register to integer */
1568 8, /* size of l1 cache. */
1569 256, /* size of l2 cache. */
1570 64, /* size of prefetch block */
1571 6, /* number of parallel prefetches */
1572 2, /* Branch cost */
1573 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1574 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1575 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1576 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1577 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1578 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1579 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1580 pentium4_memcpy,
1581 pentium4_memset,
1582 1, /* scalar_stmt_cost. */
1583 1, /* scalar load_cost. */
1584 1, /* scalar_store_cost. */
1585 1, /* vec_stmt_cost. */
1586 1, /* vec_to_scalar_cost. */
1587 1, /* scalar_to_vec_cost. */
1588 1, /* vec_align_load_cost. */
1589 2, /* vec_unalign_load_cost. */
1590 1, /* vec_store_cost. */
1591 3, /* cond_taken_branch_cost. */
1592 1, /* cond_not_taken_branch_cost. */
1595 static stringop_algs nocona_memcpy[2] = {
1596 {libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1597 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1598 {100000, unrolled_loop, false}, {-1, libcall, false}}}};
1600 static stringop_algs nocona_memset[2] = {
1601 {libcall, {{6, loop_1_byte, false}, {48, loop, false},
1602 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1603 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1604 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1606 static const
1607 struct processor_costs nocona_cost = {
1608 COSTS_N_INSNS (1), /* cost of an add instruction */
1609 COSTS_N_INSNS (1), /* cost of a lea instruction */
1610 COSTS_N_INSNS (1), /* variable shift costs */
1611 COSTS_N_INSNS (1), /* constant shift costs */
1612 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1613 COSTS_N_INSNS (10), /* HI */
1614 COSTS_N_INSNS (10), /* SI */
1615 COSTS_N_INSNS (10), /* DI */
1616 COSTS_N_INSNS (10)}, /* other */
1617 0, /* cost of multiply per each bit set */
1618 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1619 COSTS_N_INSNS (66), /* HI */
1620 COSTS_N_INSNS (66), /* SI */
1621 COSTS_N_INSNS (66), /* DI */
1622 COSTS_N_INSNS (66)}, /* other */
1623 COSTS_N_INSNS (1), /* cost of movsx */
1624 COSTS_N_INSNS (1), /* cost of movzx */
1625 16, /* "large" insn */
1626 17, /* MOVE_RATIO */
1627 4, /* cost for loading QImode using movzbl */
1628 {4, 4, 4}, /* cost of loading integer registers
1629 in QImode, HImode and SImode.
1630 Relative to reg-reg move (2). */
1631 {4, 4, 4}, /* cost of storing integer registers */
1632 3, /* cost of reg,reg fld/fst */
1633 {12, 12, 12}, /* cost of loading fp registers
1634 in SFmode, DFmode and XFmode */
1635 {4, 4, 4}, /* cost of storing fp registers
1636 in SFmode, DFmode and XFmode */
1637 6, /* cost of moving MMX register */
1638 {12, 12}, /* cost of loading MMX registers
1639 in SImode and DImode */
1640 {12, 12}, /* cost of storing MMX registers
1641 in SImode and DImode */
1642 6, /* cost of moving SSE register */
1643 {12, 12, 12}, /* cost of loading SSE registers
1644 in SImode, DImode and TImode */
1645 {12, 12, 12}, /* cost of storing SSE registers
1646 in SImode, DImode and TImode */
1647 8, /* MMX or SSE register to integer */
1648 8, /* size of l1 cache. */
1649 1024, /* size of l2 cache. */
1650 64, /* size of prefetch block */
1651 8, /* number of parallel prefetches */
1652 1, /* Branch cost */
1653 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1654 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1655 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1656 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1657 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1658 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1659 1, 1, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1660 nocona_memcpy,
1661 nocona_memset,
1662 1, /* scalar_stmt_cost. */
1663 1, /* scalar load_cost. */
1664 1, /* scalar_store_cost. */
1665 1, /* vec_stmt_cost. */
1666 1, /* vec_to_scalar_cost. */
1667 1, /* scalar_to_vec_cost. */
1668 1, /* vec_align_load_cost. */
1669 2, /* vec_unalign_load_cost. */
1670 1, /* vec_store_cost. */
1671 3, /* cond_taken_branch_cost. */
1672 1, /* cond_not_taken_branch_cost. */
1675 static stringop_algs atom_memcpy[2] = {
1676 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1677 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1678 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1679 static stringop_algs atom_memset[2] = {
1680 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1681 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1682 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1683 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1684 static const
1685 struct processor_costs atom_cost = {
1686 COSTS_N_INSNS (1), /* cost of an add instruction */
1687 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1688 COSTS_N_INSNS (1), /* variable shift costs */
1689 COSTS_N_INSNS (1), /* constant shift costs */
1690 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1691 COSTS_N_INSNS (4), /* HI */
1692 COSTS_N_INSNS (3), /* SI */
1693 COSTS_N_INSNS (4), /* DI */
1694 COSTS_N_INSNS (2)}, /* other */
1695 0, /* cost of multiply per each bit set */
1696 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1697 COSTS_N_INSNS (26), /* HI */
1698 COSTS_N_INSNS (42), /* SI */
1699 COSTS_N_INSNS (74), /* DI */
1700 COSTS_N_INSNS (74)}, /* other */
1701 COSTS_N_INSNS (1), /* cost of movsx */
1702 COSTS_N_INSNS (1), /* cost of movzx */
1703 8, /* "large" insn */
1704 17, /* MOVE_RATIO */
1705 4, /* cost for loading QImode using movzbl */
1706 {4, 4, 4}, /* cost of loading integer registers
1707 in QImode, HImode and SImode.
1708 Relative to reg-reg move (2). */
1709 {4, 4, 4}, /* cost of storing integer registers */
1710 4, /* cost of reg,reg fld/fst */
1711 {12, 12, 12}, /* cost of loading fp registers
1712 in SFmode, DFmode and XFmode */
1713 {6, 6, 8}, /* cost of storing fp registers
1714 in SFmode, DFmode and XFmode */
1715 2, /* cost of moving MMX register */
1716 {8, 8}, /* cost of loading MMX registers
1717 in SImode and DImode */
1718 {8, 8}, /* cost of storing MMX registers
1719 in SImode and DImode */
1720 2, /* cost of moving SSE register */
1721 {8, 8, 8}, /* cost of loading SSE registers
1722 in SImode, DImode and TImode */
1723 {8, 8, 8}, /* cost of storing SSE registers
1724 in SImode, DImode and TImode */
1725 5, /* MMX or SSE register to integer */
1726 32, /* size of l1 cache. */
1727 256, /* size of l2 cache. */
1728 64, /* size of prefetch block */
1729 6, /* number of parallel prefetches */
1730 3, /* Branch cost */
1731 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1732 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1733 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1734 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1735 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1736 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1737 2, 2, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
1738 atom_memcpy,
1739 atom_memset,
1740 1, /* scalar_stmt_cost. */
1741 1, /* scalar load_cost. */
1742 1, /* scalar_store_cost. */
1743 1, /* vec_stmt_cost. */
1744 1, /* vec_to_scalar_cost. */
1745 1, /* scalar_to_vec_cost. */
1746 1, /* vec_align_load_cost. */
1747 2, /* vec_unalign_load_cost. */
1748 1, /* vec_store_cost. */
1749 3, /* cond_taken_branch_cost. */
1750 1, /* cond_not_taken_branch_cost. */
1753 static stringop_algs slm_memcpy[2] = {
1754 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1755 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1756 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1757 static stringop_algs slm_memset[2] = {
1758 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1759 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1760 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1761 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1762 static const
1763 struct processor_costs slm_cost = {
1764 COSTS_N_INSNS (1), /* cost of an add instruction */
1765 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1766 COSTS_N_INSNS (1), /* variable shift costs */
1767 COSTS_N_INSNS (1), /* constant shift costs */
1768 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1769 COSTS_N_INSNS (3), /* HI */
1770 COSTS_N_INSNS (3), /* SI */
1771 COSTS_N_INSNS (4), /* DI */
1772 COSTS_N_INSNS (2)}, /* other */
1773 0, /* cost of multiply per each bit set */
1774 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1775 COSTS_N_INSNS (26), /* HI */
1776 COSTS_N_INSNS (42), /* SI */
1777 COSTS_N_INSNS (74), /* DI */
1778 COSTS_N_INSNS (74)}, /* other */
1779 COSTS_N_INSNS (1), /* cost of movsx */
1780 COSTS_N_INSNS (1), /* cost of movzx */
1781 8, /* "large" insn */
1782 17, /* MOVE_RATIO */
1783 4, /* cost for loading QImode using movzbl */
1784 {4, 4, 4}, /* cost of loading integer registers
1785 in QImode, HImode and SImode.
1786 Relative to reg-reg move (2). */
1787 {4, 4, 4}, /* cost of storing integer registers */
1788 4, /* cost of reg,reg fld/fst */
1789 {12, 12, 12}, /* cost of loading fp registers
1790 in SFmode, DFmode and XFmode */
1791 {6, 6, 8}, /* cost of storing fp registers
1792 in SFmode, DFmode and XFmode */
1793 2, /* cost of moving MMX register */
1794 {8, 8}, /* cost of loading MMX registers
1795 in SImode and DImode */
1796 {8, 8}, /* cost of storing MMX registers
1797 in SImode and DImode */
1798 2, /* cost of moving SSE register */
1799 {8, 8, 8}, /* cost of loading SSE registers
1800 in SImode, DImode and TImode */
1801 {8, 8, 8}, /* cost of storing SSE registers
1802 in SImode, DImode and TImode */
1803 5, /* MMX or SSE register to integer */
1804 32, /* size of l1 cache. */
1805 256, /* size of l2 cache. */
1806 64, /* size of prefetch block */
1807 6, /* number of parallel prefetches */
1808 3, /* Branch cost */
1809 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1810 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1811 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1812 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1813 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1814 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1815 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1816 slm_memcpy,
1817 slm_memset,
1818 1, /* scalar_stmt_cost. */
1819 1, /* scalar load_cost. */
1820 1, /* scalar_store_cost. */
1821 1, /* vec_stmt_cost. */
1822 4, /* vec_to_scalar_cost. */
1823 1, /* scalar_to_vec_cost. */
1824 1, /* vec_align_load_cost. */
1825 2, /* vec_unalign_load_cost. */
1826 1, /* vec_store_cost. */
1827 3, /* cond_taken_branch_cost. */
1828 1, /* cond_not_taken_branch_cost. */
1831 static stringop_algs intel_memcpy[2] = {
1832 {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1833 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1834 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1835 static stringop_algs intel_memset[2] = {
1836 {libcall, {{8, loop, false}, {15, unrolled_loop, false},
1837 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1838 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1839 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
1840 static const
1841 struct processor_costs intel_cost = {
1842 COSTS_N_INSNS (1), /* cost of an add instruction */
1843 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1844 COSTS_N_INSNS (1), /* variable shift costs */
1845 COSTS_N_INSNS (1), /* constant shift costs */
1846 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1847 COSTS_N_INSNS (3), /* HI */
1848 COSTS_N_INSNS (3), /* SI */
1849 COSTS_N_INSNS (4), /* DI */
1850 COSTS_N_INSNS (2)}, /* other */
1851 0, /* cost of multiply per each bit set */
1852 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1853 COSTS_N_INSNS (26), /* HI */
1854 COSTS_N_INSNS (42), /* SI */
1855 COSTS_N_INSNS (74), /* DI */
1856 COSTS_N_INSNS (74)}, /* other */
1857 COSTS_N_INSNS (1), /* cost of movsx */
1858 COSTS_N_INSNS (1), /* cost of movzx */
1859 8, /* "large" insn */
1860 17, /* MOVE_RATIO */
1861 4, /* cost for loading QImode using movzbl */
1862 {4, 4, 4}, /* cost of loading integer registers
1863 in QImode, HImode and SImode.
1864 Relative to reg-reg move (2). */
1865 {4, 4, 4}, /* cost of storing integer registers */
1866 4, /* cost of reg,reg fld/fst */
1867 {12, 12, 12}, /* cost of loading fp registers
1868 in SFmode, DFmode and XFmode */
1869 {6, 6, 8}, /* cost of storing fp registers
1870 in SFmode, DFmode and XFmode */
1871 2, /* cost of moving MMX register */
1872 {8, 8}, /* cost of loading MMX registers
1873 in SImode and DImode */
1874 {8, 8}, /* cost of storing MMX registers
1875 in SImode and DImode */
1876 2, /* cost of moving SSE register */
1877 {8, 8, 8}, /* cost of loading SSE registers
1878 in SImode, DImode and TImode */
1879 {8, 8, 8}, /* cost of storing SSE registers
1880 in SImode, DImode and TImode */
1881 5, /* MMX or SSE register to integer */
1882 32, /* size of l1 cache. */
1883 256, /* size of l2 cache. */
1884 64, /* size of prefetch block */
1885 6, /* number of parallel prefetches */
1886 3, /* Branch cost */
1887 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1888 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1889 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1890 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1891 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1892 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1893 1, 4, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1894 intel_memcpy,
1895 intel_memset,
1896 1, /* scalar_stmt_cost. */
1897 1, /* scalar load_cost. */
1898 1, /* scalar_store_cost. */
1899 1, /* vec_stmt_cost. */
1900 4, /* vec_to_scalar_cost. */
1901 1, /* scalar_to_vec_cost. */
1902 1, /* vec_align_load_cost. */
1903 2, /* vec_unalign_load_cost. */
1904 1, /* vec_store_cost. */
1905 3, /* cond_taken_branch_cost. */
1906 1, /* cond_not_taken_branch_cost. */
/* Generic should produce code tuned for Core-i7 (and newer chips)
   and btver1 (and newer chips).  */
1912 static stringop_algs generic_memcpy[2] = {
1913 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1914 {-1, libcall, false}}},
1915 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1916 {-1, libcall, false}}}};
1917 static stringop_algs generic_memset[2] = {
1918 {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1919 {-1, libcall, false}}},
1920 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1921 {-1, libcall, false}}}};
1922 static const
1923 struct processor_costs generic_cost = {
1924 COSTS_N_INSNS (1), /* cost of an add instruction */
1925 /* On all chips taken into consideration lea is 2 cycles and more. With
1926 this cost however our current implementation of synth_mult results in
1927 use of unnecessary temporary registers causing regression on several
1928 SPECfp benchmarks. */
1929 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1930 COSTS_N_INSNS (1), /* variable shift costs */
1931 COSTS_N_INSNS (1), /* constant shift costs */
1932 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1933 COSTS_N_INSNS (4), /* HI */
1934 COSTS_N_INSNS (3), /* SI */
1935 COSTS_N_INSNS (4), /* DI */
1936 COSTS_N_INSNS (2)}, /* other */
1937 0, /* cost of multiply per each bit set */
1938 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1939 COSTS_N_INSNS (26), /* HI */
1940 COSTS_N_INSNS (42), /* SI */
1941 COSTS_N_INSNS (74), /* DI */
1942 COSTS_N_INSNS (74)}, /* other */
1943 COSTS_N_INSNS (1), /* cost of movsx */
1944 COSTS_N_INSNS (1), /* cost of movzx */
1945 8, /* "large" insn */
1946 17, /* MOVE_RATIO */
1947 4, /* cost for loading QImode using movzbl */
1948 {4, 4, 4}, /* cost of loading integer registers
1949 in QImode, HImode and SImode.
1950 Relative to reg-reg move (2). */
1951 {4, 4, 4}, /* cost of storing integer registers */
1952 4, /* cost of reg,reg fld/fst */
1953 {12, 12, 12}, /* cost of loading fp registers
1954 in SFmode, DFmode and XFmode */
1955 {6, 6, 8}, /* cost of storing fp registers
1956 in SFmode, DFmode and XFmode */
1957 2, /* cost of moving MMX register */
1958 {8, 8}, /* cost of loading MMX registers
1959 in SImode and DImode */
1960 {8, 8}, /* cost of storing MMX registers
1961 in SImode and DImode */
1962 2, /* cost of moving SSE register */
1963 {8, 8, 8}, /* cost of loading SSE registers
1964 in SImode, DImode and TImode */
1965 {8, 8, 8}, /* cost of storing SSE registers
1966 in SImode, DImode and TImode */
1967 5, /* MMX or SSE register to integer */
1968 32, /* size of l1 cache. */
1969 512, /* size of l2 cache. */
1970 64, /* size of prefetch block */
1971 6, /* number of parallel prefetches */
1972 /* Benchmarks shows large regressions on K8 sixtrack benchmark when this
1973 value is increased to perhaps more appropriate value of 5. */
1974 3, /* Branch cost */
1975 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1976 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1977 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1978 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1979 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1980 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1981 1, 2, 1, 1, /* reassoc int, fp, vec_int, vec_fp. */
1982 generic_memcpy,
1983 generic_memset,
1984 1, /* scalar_stmt_cost. */
1985 1, /* scalar load_cost. */
1986 1, /* scalar_store_cost. */
1987 1, /* vec_stmt_cost. */
1988 1, /* vec_to_scalar_cost. */
1989 1, /* scalar_to_vec_cost. */
1990 1, /* vec_align_load_cost. */
1991 2, /* vec_unalign_load_cost. */
1992 1, /* vec_store_cost. */
1993 3, /* cond_taken_branch_cost. */
1994 1, /* cond_not_taken_branch_cost. */
/* core_cost should produce code tuned for the Core family of CPUs.  */
1998 static stringop_algs core_memcpy[2] = {
1999 {libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
2000 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
2001 {-1, libcall, false}}}};
2002 static stringop_algs core_memset[2] = {
2003 {libcall, {{6, loop_1_byte, true},
2004 {24, loop, true},
2005 {8192, rep_prefix_4_byte, true},
2006 {-1, libcall, false}}},
2007 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
2008 {-1, libcall, false}}}};
2010 static const
2011 struct processor_costs core_cost = {
2012 COSTS_N_INSNS (1), /* cost of an add instruction */
2013 /* On all chips taken into consideration lea is 2 cycles and more. With
2014 this cost however our current implementation of synth_mult results in
2015 use of unnecessary temporary registers causing regression on several
2016 SPECfp benchmarks. */
2017 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
2018 COSTS_N_INSNS (1), /* variable shift costs */
2019 COSTS_N_INSNS (1), /* constant shift costs */
2020 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
2021 COSTS_N_INSNS (4), /* HI */
2022 COSTS_N_INSNS (3), /* SI */
2023 COSTS_N_INSNS (4), /* DI */
2024 COSTS_N_INSNS (2)}, /* other */
2025 0, /* cost of multiply per each bit set */
2026 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
2027 COSTS_N_INSNS (26), /* HI */
2028 COSTS_N_INSNS (42), /* SI */
2029 COSTS_N_INSNS (74), /* DI */
2030 COSTS_N_INSNS (74)}, /* other */
2031 COSTS_N_INSNS (1), /* cost of movsx */
2032 COSTS_N_INSNS (1), /* cost of movzx */
2033 8, /* "large" insn */
2034 17, /* MOVE_RATIO */
2035 4, /* cost for loading QImode using movzbl */
2036 {4, 4, 4}, /* cost of loading integer registers
2037 in QImode, HImode and SImode.
2038 Relative to reg-reg move (2). */
2039 {4, 4, 4}, /* cost of storing integer registers */
2040 4, /* cost of reg,reg fld/fst */
2041 {12, 12, 12}, /* cost of loading fp registers
2042 in SFmode, DFmode and XFmode */
2043 {6, 6, 8}, /* cost of storing fp registers
2044 in SFmode, DFmode and XFmode */
2045 2, /* cost of moving MMX register */
2046 {8, 8}, /* cost of loading MMX registers
2047 in SImode and DImode */
2048 {8, 8}, /* cost of storing MMX registers
2049 in SImode and DImode */
2050 2, /* cost of moving SSE register */
2051 {8, 8, 8}, /* cost of loading SSE registers
2052 in SImode, DImode and TImode */
2053 {8, 8, 8}, /* cost of storing SSE registers
2054 in SImode, DImode and TImode */
2055 5, /* MMX or SSE register to integer */
2056 64, /* size of l1 cache. */
2057 512, /* size of l2 cache. */
2058 64, /* size of prefetch block */
2059 6, /* number of parallel prefetches */
2060 /* FIXME perhaps more appropriate value is 5. */
2061 3, /* Branch cost */
2062 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
2063 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
2064 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
2065 COSTS_N_INSNS (8), /* cost of FABS instruction. */
2066 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
2067 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
2068 1, 4, 2, 2, /* reassoc int, fp, vec_int, vec_fp. */
2069 core_memcpy,
2070 core_memset,
2071 1, /* scalar_stmt_cost. */
2072 1, /* scalar load_cost. */
2073 1, /* scalar_store_cost. */
2074 1, /* vec_stmt_cost. */
2075 1, /* vec_to_scalar_cost. */
2076 1, /* scalar_to_vec_cost. */
2077 1, /* vec_align_load_cost. */
2078 2, /* vec_unalign_load_cost. */
2079 1, /* vec_store_cost. */
2080 3, /* cond_taken_branch_cost. */
2081 1, /* cond_not_taken_branch_cost. */