1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988-2013 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "rtl.h"
25 #include "tree.h"
26 #include "tm_p.h"
27 #include "regs.h"
28 #include "hard-reg-set.h"
29 #include "insn-config.h"
30 #include "conditions.h"
31 #include "output.h"
32 #include "insn-codes.h"
33 #include "insn-attr.h"
34 #include "flags.h"
35 #include "except.h"
36 #include "function.h"
37 #include "recog.h"
38 #include "expr.h"
39 #include "optabs.h"
40 #include "diagnostic-core.h"
41 #include "toplev.h"
42 #include "basic-block.h"
43 #include "ggc.h"
44 #include "target.h"
45 #include "target-def.h"
46 #include "common/common-target.h"
47 #include "langhooks.h"
48 #include "reload.h"
49 #include "cgraph.h"
50 #include "gimple.h"
51 #include "dwarf2.h"
52 #include "df.h"
53 #include "tm-constrs.h"
54 #include "params.h"
55 #include "cselib.h"
56 #include "debug.h"
57 #include "sched-int.h"
58 #include "sbitmap.h"
59 #include "fibheap.h"
60 #include "opts.h"
61 #include "diagnostic.h"
62 #include "dumpfile.h"
63 #include "tree-pass.h"
64 #include "tree-flow.h"
66 static rtx legitimize_dllimport_symbol (rtx, bool);
68 #ifndef CHECK_STACK_LIMIT
69 #define CHECK_STACK_LIMIT (-1)
70 #endif
72 /* Return index of given mode in mult and division cost tables. */
73 #define MODE_INDEX(mode) \
74 ((mode) == QImode ? 0 \
75 : (mode) == HImode ? 1 \
76 : (mode) == SImode ? 2 \
77 : (mode) == DImode ? 3 \
78 : 4)
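/* For example, MODE_INDEX (SImode) is 2, selecting the SImode entry of the
   five-element mult and divide cost arrays in the tables below; any mode not
   listed above falls through to the final "other" slot (index 4). */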
80 /* Processor costs (relative to an add) */
81 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
82 #define COSTS_N_BYTES(N) ((N) * 2)
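/* With the assumption above, COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1), so
   the byte counts in ix86_size_cost below stay on the same scale as the
   COSTS_N_INSNS values used by the cycle-based cost tables that follow. */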
84 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall, false}}}
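/* DUMMY_STRINGOP_ALGS is a memcpy/memset strategy that simply falls back to a
   libcall for every block size (a max size of -1 covers all remaining sizes).
   It fills table slots that are not expected to be consulted; in this file
   that appears to be the second entry of each pair, i.e. the one used when
   generating 64-bit code. */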
86 const
87 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
88 COSTS_N_BYTES (2), /* cost of an add instruction */
89 COSTS_N_BYTES (3), /* cost of a lea instruction */
90 COSTS_N_BYTES (2), /* variable shift costs */
91 COSTS_N_BYTES (3), /* constant shift costs */
92 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
93 COSTS_N_BYTES (3), /* HI */
94 COSTS_N_BYTES (3), /* SI */
95 COSTS_N_BYTES (3), /* DI */
96 COSTS_N_BYTES (5)}, /* other */
97 0, /* cost of multiply per each bit set */
98 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
99 COSTS_N_BYTES (3), /* HI */
100 COSTS_N_BYTES (3), /* SI */
101 COSTS_N_BYTES (3), /* DI */
102 COSTS_N_BYTES (5)}, /* other */
103 COSTS_N_BYTES (3), /* cost of movsx */
104 COSTS_N_BYTES (3), /* cost of movzx */
105 0, /* "large" insn */
106 2, /* MOVE_RATIO */
107 2, /* cost for loading QImode using movzbl */
108 {2, 2, 2}, /* cost of loading integer registers
109 in QImode, HImode and SImode.
110 Relative to reg-reg move (2). */
111 {2, 2, 2}, /* cost of storing integer registers */
112 2, /* cost of reg,reg fld/fst */
113 {2, 2, 2}, /* cost of loading fp registers
114 in SFmode, DFmode and XFmode */
115 {2, 2, 2}, /* cost of storing fp registers
116 in SFmode, DFmode and XFmode */
117 3, /* cost of moving MMX register */
118 {3, 3}, /* cost of loading MMX registers
119 in SImode and DImode */
120 {3, 3}, /* cost of storing MMX registers
121 in SImode and DImode */
122 3, /* cost of moving SSE register */
123 {3, 3, 3}, /* cost of loading SSE registers
124 in SImode, DImode and TImode */
125 {3, 3, 3}, /* cost of storing SSE registers
126 in SImode, DImode and TImode */
127 3, /* MMX or SSE register to integer */
128 0, /* size of l1 cache */
129 0, /* size of l2 cache */
130 0, /* size of prefetch block */
131 0, /* number of parallel prefetches */
132 2, /* Branch cost */
133 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
134 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
135 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
136 COSTS_N_BYTES (2), /* cost of FABS instruction. */
137 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
138 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
139 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
140 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
141 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
142 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}}},
143 1, /* scalar_stmt_cost. */
144 1, /* scalar load_cost. */
145 1, /* scalar_store_cost. */
146 1, /* vec_stmt_cost. */
147 1, /* vec_to_scalar_cost. */
148 1, /* scalar_to_vec_cost. */
149 1, /* vec_align_load_cost. */
150 1, /* vec_unalign_load_cost. */
151 1, /* vec_store_cost. */
152 1, /* cond_taken_branch_cost. */
153 1, /* cond_not_taken_branch_cost. */
154 };
156 /* Processor costs (relative to an add) */
157 static const
158 struct processor_costs i386_cost = { /* 386 specific costs */
159 COSTS_N_INSNS (1), /* cost of an add instruction */
160 COSTS_N_INSNS (1), /* cost of a lea instruction */
161 COSTS_N_INSNS (3), /* variable shift costs */
162 COSTS_N_INSNS (2), /* constant shift costs */
163 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
164 COSTS_N_INSNS (6), /* HI */
165 COSTS_N_INSNS (6), /* SI */
166 COSTS_N_INSNS (6), /* DI */
167 COSTS_N_INSNS (6)}, /* other */
168 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
169 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
170 COSTS_N_INSNS (23), /* HI */
171 COSTS_N_INSNS (23), /* SI */
172 COSTS_N_INSNS (23), /* DI */
173 COSTS_N_INSNS (23)}, /* other */
174 COSTS_N_INSNS (3), /* cost of movsx */
175 COSTS_N_INSNS (2), /* cost of movzx */
176 15, /* "large" insn */
177 3, /* MOVE_RATIO */
178 4, /* cost for loading QImode using movzbl */
179 {2, 4, 2}, /* cost of loading integer registers
180 in QImode, HImode and SImode.
181 Relative to reg-reg move (2). */
182 {2, 4, 2}, /* cost of storing integer registers */
183 2, /* cost of reg,reg fld/fst */
184 {8, 8, 8}, /* cost of loading fp registers
185 in SFmode, DFmode and XFmode */
186 {8, 8, 8}, /* cost of storing fp registers
187 in SFmode, DFmode and XFmode */
188 2, /* cost of moving MMX register */
189 {4, 8}, /* cost of loading MMX registers
190 in SImode and DImode */
191 {4, 8}, /* cost of storing MMX registers
192 in SImode and DImode */
193 2, /* cost of moving SSE register */
194 {4, 8, 16}, /* cost of loading SSE registers
195 in SImode, DImode and TImode */
196 {4, 8, 16}, /* cost of storing SSE registers
197 in SImode, DImode and TImode */
198 3, /* MMX or SSE register to integer */
199 0, /* size of l1 cache */
200 0, /* size of l2 cache */
201 0, /* size of prefetch block */
202 0, /* number of parallel prefetches */
203 1, /* Branch cost */
204 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
205 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
206 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
207 COSTS_N_INSNS (22), /* cost of FABS instruction. */
208 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
209 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
210 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
211 DUMMY_STRINGOP_ALGS},
212 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte, false}}},
213 DUMMY_STRINGOP_ALGS},
214 1, /* scalar_stmt_cost. */
215 1, /* scalar load_cost. */
216 1, /* scalar_store_cost. */
217 1, /* vec_stmt_cost. */
218 1, /* vec_to_scalar_cost. */
219 1, /* scalar_to_vec_cost. */
220 1, /* vec_align_load_cost. */
221 2, /* vec_unalign_load_cost. */
222 1, /* vec_store_cost. */
223 3, /* cond_taken_branch_cost. */
224 1, /* cond_not_taken_branch_cost. */
225 };
227 static const
228 struct processor_costs i486_cost = { /* 486 specific costs */
229 COSTS_N_INSNS (1), /* cost of an add instruction */
230 COSTS_N_INSNS (1), /* cost of a lea instruction */
231 COSTS_N_INSNS (3), /* variable shift costs */
232 COSTS_N_INSNS (2), /* constant shift costs */
233 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
234 COSTS_N_INSNS (12), /* HI */
235 COSTS_N_INSNS (12), /* SI */
236 COSTS_N_INSNS (12), /* DI */
237 COSTS_N_INSNS (12)}, /* other */
238 1, /* cost of multiply per each bit set */
239 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
240 COSTS_N_INSNS (40), /* HI */
241 COSTS_N_INSNS (40), /* SI */
242 COSTS_N_INSNS (40), /* DI */
243 COSTS_N_INSNS (40)}, /* other */
244 COSTS_N_INSNS (3), /* cost of movsx */
245 COSTS_N_INSNS (2), /* cost of movzx */
246 15, /* "large" insn */
247 3, /* MOVE_RATIO */
248 4, /* cost for loading QImode using movzbl */
249 {2, 4, 2}, /* cost of loading integer registers
250 in QImode, HImode and SImode.
251 Relative to reg-reg move (2). */
252 {2, 4, 2}, /* cost of storing integer registers */
253 2, /* cost of reg,reg fld/fst */
254 {8, 8, 8}, /* cost of loading fp registers
255 in SFmode, DFmode and XFmode */
256 {8, 8, 8}, /* cost of storing fp registers
257 in SFmode, DFmode and XFmode */
258 2, /* cost of moving MMX register */
259 {4, 8}, /* cost of loading MMX registers
260 in SImode and DImode */
261 {4, 8}, /* cost of storing MMX registers
262 in SImode and DImode */
263 2, /* cost of moving SSE register */
264 {4, 8, 16}, /* cost of loading SSE registers
265 in SImode, DImode and TImode */
266 {4, 8, 16}, /* cost of storing SSE registers
267 in SImode, DImode and TImode */
268 3, /* MMX or SSE register to integer */
269 4, /* size of l1 cache. 486 has 8kB cache
270 shared for code and data, so 4kB is
271 not really precise. */
272 4, /* size of l2 cache */
273 0, /* size of prefetch block */
274 0, /* number of parallel prefetches */
275 1, /* Branch cost */
276 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
277 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
278 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
279 COSTS_N_INSNS (3), /* cost of FABS instruction. */
280 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
281 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
282 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
283 DUMMY_STRINGOP_ALGS},
284 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte, false}}},
285 DUMMY_STRINGOP_ALGS},
286 1, /* scalar_stmt_cost. */
287 1, /* scalar load_cost. */
288 1, /* scalar_store_cost. */
289 1, /* vec_stmt_cost. */
290 1, /* vec_to_scalar_cost. */
291 1, /* scalar_to_vec_cost. */
292 1, /* vec_align_load_cost. */
293 2, /* vec_unalign_load_cost. */
294 1, /* vec_store_cost. */
295 3, /* cond_taken_branch_cost. */
296 1, /* cond_not_taken_branch_cost. */
297 };
299 static const
300 struct processor_costs pentium_cost = {
301 COSTS_N_INSNS (1), /* cost of an add instruction */
302 COSTS_N_INSNS (1), /* cost of a lea instruction */
303 COSTS_N_INSNS (4), /* variable shift costs */
304 COSTS_N_INSNS (1), /* constant shift costs */
305 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
306 COSTS_N_INSNS (11), /* HI */
307 COSTS_N_INSNS (11), /* SI */
308 COSTS_N_INSNS (11), /* DI */
309 COSTS_N_INSNS (11)}, /* other */
310 0, /* cost of multiply per each bit set */
311 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
312 COSTS_N_INSNS (25), /* HI */
313 COSTS_N_INSNS (25), /* SI */
314 COSTS_N_INSNS (25), /* DI */
315 COSTS_N_INSNS (25)}, /* other */
316 COSTS_N_INSNS (3), /* cost of movsx */
317 COSTS_N_INSNS (2), /* cost of movzx */
318 8, /* "large" insn */
319 6, /* MOVE_RATIO */
320 6, /* cost for loading QImode using movzbl */
321 {2, 4, 2}, /* cost of loading integer registers
322 in QImode, HImode and SImode.
323 Relative to reg-reg move (2). */
324 {2, 4, 2}, /* cost of storing integer registers */
325 2, /* cost of reg,reg fld/fst */
326 {2, 2, 6}, /* cost of loading fp registers
327 in SFmode, DFmode and XFmode */
328 {4, 4, 6}, /* cost of storing fp registers
329 in SFmode, DFmode and XFmode */
330 8, /* cost of moving MMX register */
331 {8, 8}, /* cost of loading MMX registers
332 in SImode and DImode */
333 {8, 8}, /* cost of storing MMX registers
334 in SImode and DImode */
335 2, /* cost of moving SSE register */
336 {4, 8, 16}, /* cost of loading SSE registers
337 in SImode, DImode and TImode */
338 {4, 8, 16}, /* cost of storing SSE registers
339 in SImode, DImode and TImode */
340 3, /* MMX or SSE register to integer */
341 8, /* size of l1 cache. */
342 8, /* size of l2 cache */
343 0, /* size of prefetch block */
344 0, /* number of parallel prefetches */
345 2, /* Branch cost */
346 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
347 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
348 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
349 COSTS_N_INSNS (1), /* cost of FABS instruction. */
350 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
351 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
352 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
353 DUMMY_STRINGOP_ALGS},
354 {{libcall, {{-1, rep_prefix_4_byte, false}}},
355 DUMMY_STRINGOP_ALGS},
356 1, /* scalar_stmt_cost. */
357 1, /* scalar load_cost. */
358 1, /* scalar_store_cost. */
359 1, /* vec_stmt_cost. */
360 1, /* vec_to_scalar_cost. */
361 1, /* scalar_to_vec_cost. */
362 1, /* vec_align_load_cost. */
363 2, /* vec_unalign_load_cost. */
364 1, /* vec_store_cost. */
365 3, /* cond_taken_branch_cost. */
366 1, /* cond_not_taken_branch_cost. */
367 };
369 static const
370 struct processor_costs pentiumpro_cost = {
371 COSTS_N_INSNS (1), /* cost of an add instruction */
372 COSTS_N_INSNS (1), /* cost of a lea instruction */
373 COSTS_N_INSNS (1), /* variable shift costs */
374 COSTS_N_INSNS (1), /* constant shift costs */
375 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
376 COSTS_N_INSNS (4), /* HI */
377 COSTS_N_INSNS (4), /* SI */
378 COSTS_N_INSNS (4), /* DI */
379 COSTS_N_INSNS (4)}, /* other */
380 0, /* cost of multiply per each bit set */
381 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
382 COSTS_N_INSNS (17), /* HI */
383 COSTS_N_INSNS (17), /* SI */
384 COSTS_N_INSNS (17), /* DI */
385 COSTS_N_INSNS (17)}, /* other */
386 COSTS_N_INSNS (1), /* cost of movsx */
387 COSTS_N_INSNS (1), /* cost of movzx */
388 8, /* "large" insn */
389 6, /* MOVE_RATIO */
390 2, /* cost for loading QImode using movzbl */
391 {4, 4, 4}, /* cost of loading integer registers
392 in QImode, HImode and SImode.
393 Relative to reg-reg move (2). */
394 {2, 2, 2}, /* cost of storing integer registers */
395 2, /* cost of reg,reg fld/fst */
396 {2, 2, 6}, /* cost of loading fp registers
397 in SFmode, DFmode and XFmode */
398 {4, 4, 6}, /* cost of storing fp registers
399 in SFmode, DFmode and XFmode */
400 2, /* cost of moving MMX register */
401 {2, 2}, /* cost of loading MMX registers
402 in SImode and DImode */
403 {2, 2}, /* cost of storing MMX registers
404 in SImode and DImode */
405 2, /* cost of moving SSE register */
406 {2, 2, 8}, /* cost of loading SSE registers
407 in SImode, DImode and TImode */
408 {2, 2, 8}, /* cost of storing SSE registers
409 in SImode, DImode and TImode */
410 3, /* MMX or SSE register to integer */
411 8, /* size of l1 cache. */
412 256, /* size of l2 cache */
413 32, /* size of prefetch block */
414 6, /* number of parallel prefetches */
415 2, /* Branch cost */
416 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
417 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
418 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
419 COSTS_N_INSNS (2), /* cost of FABS instruction. */
420 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
421 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
422 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
423 (we ensure the alignment). For small blocks an inline loop is still a
424 noticeable win; for bigger blocks either rep movsl or rep movsb is the
425 way to go. Rep movsb apparently has a more expensive startup time in the CPU,
426 but after 4K the difference is down in the noise. */
427 {{rep_prefix_4_byte, {{128, loop, false}, {1024, unrolled_loop, false},
428 {8192, rep_prefix_4_byte, false},
429 {-1, rep_prefix_1_byte, false}}},
430 DUMMY_STRINGOP_ALGS},
431 {{rep_prefix_4_byte, {{1024, unrolled_loop, false},
432 {8192, rep_prefix_4_byte, false},
433 {-1, libcall, false}}},
434 DUMMY_STRINGOP_ALGS},
435 1, /* scalar_stmt_cost. */
436 1, /* scalar load_cost. */
437 1, /* scalar_store_cost. */
438 1, /* vec_stmt_cost. */
439 1, /* vec_to_scalar_cost. */
440 1, /* scalar_to_vec_cost. */
441 1, /* vec_align_load_cost. */
442 2, /* vec_unalign_load_cost. */
443 1, /* vec_store_cost. */
444 3, /* cond_taken_branch_cost. */
445 1, /* cond_not_taken_branch_cost. */
446 };
448 static const
449 struct processor_costs geode_cost = {
450 COSTS_N_INSNS (1), /* cost of an add instruction */
451 COSTS_N_INSNS (1), /* cost of a lea instruction */
452 COSTS_N_INSNS (2), /* variable shift costs */
453 COSTS_N_INSNS (1), /* constant shift costs */
454 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
455 COSTS_N_INSNS (4), /* HI */
456 COSTS_N_INSNS (7), /* SI */
457 COSTS_N_INSNS (7), /* DI */
458 COSTS_N_INSNS (7)}, /* other */
459 0, /* cost of multiply per each bit set */
460 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
461 COSTS_N_INSNS (23), /* HI */
462 COSTS_N_INSNS (39), /* SI */
463 COSTS_N_INSNS (39), /* DI */
464 COSTS_N_INSNS (39)}, /* other */
465 COSTS_N_INSNS (1), /* cost of movsx */
466 COSTS_N_INSNS (1), /* cost of movzx */
467 8, /* "large" insn */
468 4, /* MOVE_RATIO */
469 1, /* cost for loading QImode using movzbl */
470 {1, 1, 1}, /* cost of loading integer registers
471 in QImode, HImode and SImode.
472 Relative to reg-reg move (2). */
473 {1, 1, 1}, /* cost of storing integer registers */
474 1, /* cost of reg,reg fld/fst */
475 {1, 1, 1}, /* cost of loading fp registers
476 in SFmode, DFmode and XFmode */
477 {4, 6, 6}, /* cost of storing fp registers
478 in SFmode, DFmode and XFmode */
480 1, /* cost of moving MMX register */
481 {1, 1}, /* cost of loading MMX registers
482 in SImode and DImode */
483 {1, 1}, /* cost of storing MMX registers
484 in SImode and DImode */
485 1, /* cost of moving SSE register */
486 {1, 1, 1}, /* cost of loading SSE registers
487 in SImode, DImode and TImode */
488 {1, 1, 1}, /* cost of storing SSE registers
489 in SImode, DImode and TImode */
490 1, /* MMX or SSE register to integer */
491 64, /* size of l1 cache. */
492 128, /* size of l2 cache. */
493 32, /* size of prefetch block */
494 1, /* number of parallel prefetches */
495 1, /* Branch cost */
496 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
497 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
498 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
499 COSTS_N_INSNS (1), /* cost of FABS instruction. */
500 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
501 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
502 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
503 DUMMY_STRINGOP_ALGS},
504 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
505 DUMMY_STRINGOP_ALGS},
506 1, /* scalar_stmt_cost. */
507 1, /* scalar load_cost. */
508 1, /* scalar_store_cost. */
509 1, /* vec_stmt_cost. */
510 1, /* vec_to_scalar_cost. */
511 1, /* scalar_to_vec_cost. */
512 1, /* vec_align_load_cost. */
513 2, /* vec_unalign_load_cost. */
514 1, /* vec_store_cost. */
515 3, /* cond_taken_branch_cost. */
516 1, /* cond_not_taken_branch_cost. */
517 };
519 static const
520 struct processor_costs k6_cost = {
521 COSTS_N_INSNS (1), /* cost of an add instruction */
522 COSTS_N_INSNS (2), /* cost of a lea instruction */
523 COSTS_N_INSNS (1), /* variable shift costs */
524 COSTS_N_INSNS (1), /* constant shift costs */
525 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
526 COSTS_N_INSNS (3), /* HI */
527 COSTS_N_INSNS (3), /* SI */
528 COSTS_N_INSNS (3), /* DI */
529 COSTS_N_INSNS (3)}, /* other */
530 0, /* cost of multiply per each bit set */
531 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
532 COSTS_N_INSNS (18), /* HI */
533 COSTS_N_INSNS (18), /* SI */
534 COSTS_N_INSNS (18), /* DI */
535 COSTS_N_INSNS (18)}, /* other */
536 COSTS_N_INSNS (2), /* cost of movsx */
537 COSTS_N_INSNS (2), /* cost of movzx */
538 8, /* "large" insn */
539 4, /* MOVE_RATIO */
540 3, /* cost for loading QImode using movzbl */
541 {4, 5, 4}, /* cost of loading integer registers
542 in QImode, HImode and SImode.
543 Relative to reg-reg move (2). */
544 {2, 3, 2}, /* cost of storing integer registers */
545 4, /* cost of reg,reg fld/fst */
546 {6, 6, 6}, /* cost of loading fp registers
547 in SFmode, DFmode and XFmode */
548 {4, 4, 4}, /* cost of storing fp registers
549 in SFmode, DFmode and XFmode */
550 2, /* cost of moving MMX register */
551 {2, 2}, /* cost of loading MMX registers
552 in SImode and DImode */
553 {2, 2}, /* cost of storing MMX registers
554 in SImode and DImode */
555 2, /* cost of moving SSE register */
556 {2, 2, 8}, /* cost of loading SSE registers
557 in SImode, DImode and TImode */
558 {2, 2, 8}, /* cost of storing SSE registers
559 in SImode, DImode and TImode */
560 6, /* MMX or SSE register to integer */
561 32, /* size of l1 cache. */
562 32, /* size of l2 cache. Some models
563 have integrated l2 cache, but
564 optimizing for k6 is not important
565 enough to worry about that. */
566 32, /* size of prefetch block */
567 1, /* number of parallel prefetches */
568 1, /* Branch cost */
569 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
570 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
571 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
572 COSTS_N_INSNS (2), /* cost of FABS instruction. */
573 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
574 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
575 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
576 DUMMY_STRINGOP_ALGS},
577 {{libcall, {{256, rep_prefix_4_byte, false}, {-1, libcall, false}}},
578 DUMMY_STRINGOP_ALGS},
579 1, /* scalar_stmt_cost. */
580 1, /* scalar load_cost. */
581 1, /* scalar_store_cost. */
582 1, /* vec_stmt_cost. */
583 1, /* vec_to_scalar_cost. */
584 1, /* scalar_to_vec_cost. */
585 1, /* vec_align_load_cost. */
586 2, /* vec_unalign_load_cost. */
587 1, /* vec_store_cost. */
588 3, /* cond_taken_branch_cost. */
589 1, /* cond_not_taken_branch_cost. */
590 };
592 static const
593 struct processor_costs athlon_cost = {
594 COSTS_N_INSNS (1), /* cost of an add instruction */
595 COSTS_N_INSNS (2), /* cost of a lea instruction */
596 COSTS_N_INSNS (1), /* variable shift costs */
597 COSTS_N_INSNS (1), /* constant shift costs */
598 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
599 COSTS_N_INSNS (5), /* HI */
600 COSTS_N_INSNS (5), /* SI */
601 COSTS_N_INSNS (5), /* DI */
602 COSTS_N_INSNS (5)}, /* other */
603 0, /* cost of multiply per each bit set */
604 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
605 COSTS_N_INSNS (26), /* HI */
606 COSTS_N_INSNS (42), /* SI */
607 COSTS_N_INSNS (74), /* DI */
608 COSTS_N_INSNS (74)}, /* other */
609 COSTS_N_INSNS (1), /* cost of movsx */
610 COSTS_N_INSNS (1), /* cost of movzx */
611 8, /* "large" insn */
612 9, /* MOVE_RATIO */
613 4, /* cost for loading QImode using movzbl */
614 {3, 4, 3}, /* cost of loading integer registers
615 in QImode, HImode and SImode.
616 Relative to reg-reg move (2). */
617 {3, 4, 3}, /* cost of storing integer registers */
618 4, /* cost of reg,reg fld/fst */
619 {4, 4, 12}, /* cost of loading fp registers
620 in SFmode, DFmode and XFmode */
621 {6, 6, 8}, /* cost of storing fp registers
622 in SFmode, DFmode and XFmode */
623 2, /* cost of moving MMX register */
624 {4, 4}, /* cost of loading MMX registers
625 in SImode and DImode */
626 {4, 4}, /* cost of storing MMX registers
627 in SImode and DImode */
628 2, /* cost of moving SSE register */
629 {4, 4, 6}, /* cost of loading SSE registers
630 in SImode, DImode and TImode */
631 {4, 4, 5}, /* cost of storing SSE registers
632 in SImode, DImode and TImode */
633 5, /* MMX or SSE register to integer */
634 64, /* size of l1 cache. */
635 256, /* size of l2 cache. */
636 64, /* size of prefetch block */
637 6, /* number of parallel prefetches */
638 5, /* Branch cost */
639 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
640 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
641 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
642 COSTS_N_INSNS (2), /* cost of FABS instruction. */
643 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
644 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
645 /* For some reason, Athlon deals better with the REP prefix (relative to loops)
646 than K8 does. Alignment becomes important after 8 bytes for memcpy and
647 128 bytes for memset. */
648 {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
649 DUMMY_STRINGOP_ALGS},
650 {{libcall, {{2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
651 DUMMY_STRINGOP_ALGS},
652 1, /* scalar_stmt_cost. */
653 1, /* scalar load_cost. */
654 1, /* scalar_store_cost. */
655 1, /* vec_stmt_cost. */
656 1, /* vec_to_scalar_cost. */
657 1, /* scalar_to_vec_cost. */
658 1, /* vec_align_load_cost. */
659 2, /* vec_unalign_load_cost. */
660 1, /* vec_store_cost. */
661 3, /* cond_taken_branch_cost. */
662 1, /* cond_not_taken_branch_cost. */
663 };
665 static const
666 struct processor_costs k8_cost = {
667 COSTS_N_INSNS (1), /* cost of an add instruction */
668 COSTS_N_INSNS (2), /* cost of a lea instruction */
669 COSTS_N_INSNS (1), /* variable shift costs */
670 COSTS_N_INSNS (1), /* constant shift costs */
671 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
672 COSTS_N_INSNS (4), /* HI */
673 COSTS_N_INSNS (3), /* SI */
674 COSTS_N_INSNS (4), /* DI */
675 COSTS_N_INSNS (5)}, /* other */
676 0, /* cost of multiply per each bit set */
677 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
678 COSTS_N_INSNS (26), /* HI */
679 COSTS_N_INSNS (42), /* SI */
680 COSTS_N_INSNS (74), /* DI */
681 COSTS_N_INSNS (74)}, /* other */
682 COSTS_N_INSNS (1), /* cost of movsx */
683 COSTS_N_INSNS (1), /* cost of movzx */
684 8, /* "large" insn */
685 9, /* MOVE_RATIO */
686 4, /* cost for loading QImode using movzbl */
687 {3, 4, 3}, /* cost of loading integer registers
688 in QImode, HImode and SImode.
689 Relative to reg-reg move (2). */
690 {3, 4, 3}, /* cost of storing integer registers */
691 4, /* cost of reg,reg fld/fst */
692 {4, 4, 12}, /* cost of loading fp registers
693 in SFmode, DFmode and XFmode */
694 {6, 6, 8}, /* cost of storing fp registers
695 in SFmode, DFmode and XFmode */
696 2, /* cost of moving MMX register */
697 {3, 3}, /* cost of loading MMX registers
698 in SImode and DImode */
699 {4, 4}, /* cost of storing MMX registers
700 in SImode and DImode */
701 2, /* cost of moving SSE register */
702 {4, 3, 6}, /* cost of loading SSE registers
703 in SImode, DImode and TImode */
704 {4, 4, 5}, /* cost of storing SSE registers
705 in SImode, DImode and TImode */
706 5, /* MMX or SSE register to integer */
707 64, /* size of l1 cache. */
708 512, /* size of l2 cache. */
709 64, /* size of prefetch block */
710 /* New AMD processors never drop prefetches; if they cannot be performed
711 immediately, they are queued. We set the number of simultaneous prefetches
712 to a large constant to reflect this (it is probably not a good idea to leave
713 the number of prefetches completely unlimited, as their execution also takes
714 some time). */
715 100, /* number of parallel prefetches */
716 3, /* Branch cost */
717 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
718 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
719 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
720 COSTS_N_INSNS (2), /* cost of FABS instruction. */
721 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
722 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
723 /* K8 has an optimized REP instruction for medium-sized blocks, but for very
724 small blocks it is better to use a loop. For large blocks, a libcall can
725 do nontemporal accesses and beat inline code considerably. */
726 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
727 {-1, rep_prefix_4_byte, false}}},
728 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
729 {-1, libcall, false}}}},
730 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
731 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
732 {libcall, {{48, unrolled_loop, false},
733 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
734 4, /* scalar_stmt_cost. */
735 2, /* scalar load_cost. */
736 2, /* scalar_store_cost. */
737 5, /* vec_stmt_cost. */
738 0, /* vec_to_scalar_cost. */
739 2, /* scalar_to_vec_cost. */
740 2, /* vec_align_load_cost. */
741 3, /* vec_unalign_load_cost. */
742 3, /* vec_store_cost. */
743 3, /* cond_taken_branch_cost. */
744 2, /* cond_not_taken_branch_cost. */
745 };
747 struct processor_costs amdfam10_cost = {
748 COSTS_N_INSNS (1), /* cost of an add instruction */
749 COSTS_N_INSNS (2), /* cost of a lea instruction */
750 COSTS_N_INSNS (1), /* variable shift costs */
751 COSTS_N_INSNS (1), /* constant shift costs */
752 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
753 COSTS_N_INSNS (4), /* HI */
754 COSTS_N_INSNS (3), /* SI */
755 COSTS_N_INSNS (4), /* DI */
756 COSTS_N_INSNS (5)}, /* other */
757 0, /* cost of multiply per each bit set */
758 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
759 COSTS_N_INSNS (35), /* HI */
760 COSTS_N_INSNS (51), /* SI */
761 COSTS_N_INSNS (83), /* DI */
762 COSTS_N_INSNS (83)}, /* other */
763 COSTS_N_INSNS (1), /* cost of movsx */
764 COSTS_N_INSNS (1), /* cost of movzx */
765 8, /* "large" insn */
766 9, /* MOVE_RATIO */
767 4, /* cost for loading QImode using movzbl */
768 {3, 4, 3}, /* cost of loading integer registers
769 in QImode, HImode and SImode.
770 Relative to reg-reg move (2). */
771 {3, 4, 3}, /* cost of storing integer registers */
772 4, /* cost of reg,reg fld/fst */
773 {4, 4, 12}, /* cost of loading fp registers
774 in SFmode, DFmode and XFmode */
775 {6, 6, 8}, /* cost of storing fp registers
776 in SFmode, DFmode and XFmode */
777 2, /* cost of moving MMX register */
778 {3, 3}, /* cost of loading MMX registers
779 in SImode and DImode */
780 {4, 4}, /* cost of storing MMX registers
781 in SImode and DImode */
782 2, /* cost of moving SSE register */
783 {4, 4, 3}, /* cost of loading SSE registers
784 in SImode, DImode and TImode */
785 {4, 4, 5}, /* cost of storing SSE registers
786 in SImode, DImode and TImode */
787 3, /* MMX or SSE register to integer */
788 /* On K8:
789 MOVD reg64, xmmreg Double FSTORE 4
790 MOVD reg32, xmmreg Double FSTORE 4
791 On AMDFAM10:
792 MOVD reg64, xmmreg Double FADD 3
793 1/1 1/1
794 MOVD reg32, xmmreg Double FADD 3
795 1/1 1/1 */
796 64, /* size of l1 cache. */
797 512, /* size of l2 cache. */
798 64, /* size of prefetch block */
799 /* New AMD processors never drop prefetches; if they cannot be performed
800 immediately, they are queued. We set the number of simultaneous prefetches
801 to a large constant to reflect this (it is probably not a good idea to leave
802 the number of prefetches completely unlimited, as their execution also takes
803 some time). */
804 100, /* number of parallel prefetches */
805 2, /* Branch cost */
806 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
807 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
808 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
809 COSTS_N_INSNS (2), /* cost of FABS instruction. */
810 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
811 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
813 /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but for
814 very small blocks it is better to use a loop. For large blocks, a libcall can
815 do nontemporal accesses and beat inline code considerably. */
816 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
817 {-1, rep_prefix_4_byte, false}}},
818 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
819 {-1, libcall, false}}}},
820 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
821 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
822 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
823 {-1, libcall, false}}}},
824 4, /* scalar_stmt_cost. */
825 2, /* scalar load_cost. */
826 2, /* scalar_store_cost. */
827 6, /* vec_stmt_cost. */
828 0, /* vec_to_scalar_cost. */
829 2, /* scalar_to_vec_cost. */
830 2, /* vec_align_load_cost. */
831 2, /* vec_unalign_load_cost. */
832 2, /* vec_store_cost. */
833 2, /* cond_taken_branch_cost. */
834 1, /* cond_not_taken_branch_cost. */
835 };
837 struct processor_costs bdver1_cost = {
838 COSTS_N_INSNS (1), /* cost of an add instruction */
839 COSTS_N_INSNS (1), /* cost of a lea instruction */
840 COSTS_N_INSNS (1), /* variable shift costs */
841 COSTS_N_INSNS (1), /* constant shift costs */
842 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
843 COSTS_N_INSNS (4), /* HI */
844 COSTS_N_INSNS (4), /* SI */
845 COSTS_N_INSNS (6), /* DI */
846 COSTS_N_INSNS (6)}, /* other */
847 0, /* cost of multiply per each bit set */
848 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
849 COSTS_N_INSNS (35), /* HI */
850 COSTS_N_INSNS (51), /* SI */
851 COSTS_N_INSNS (83), /* DI */
852 COSTS_N_INSNS (83)}, /* other */
853 COSTS_N_INSNS (1), /* cost of movsx */
854 COSTS_N_INSNS (1), /* cost of movzx */
855 8, /* "large" insn */
856 9, /* MOVE_RATIO */
857 4, /* cost for loading QImode using movzbl */
858 {5, 5, 4}, /* cost of loading integer registers
859 in QImode, HImode and SImode.
860 Relative to reg-reg move (2). */
861 {4, 4, 4}, /* cost of storing integer registers */
862 2, /* cost of reg,reg fld/fst */
863 {5, 5, 12}, /* cost of loading fp registers
864 in SFmode, DFmode and XFmode */
865 {4, 4, 8}, /* cost of storing fp registers
866 in SFmode, DFmode and XFmode */
867 2, /* cost of moving MMX register */
868 {4, 4}, /* cost of loading MMX registers
869 in SImode and DImode */
870 {4, 4}, /* cost of storing MMX registers
871 in SImode and DImode */
872 2, /* cost of moving SSE register */
873 {4, 4, 4}, /* cost of loading SSE registers
874 in SImode, DImode and TImode */
875 {4, 4, 4}, /* cost of storing SSE registers
876 in SImode, DImode and TImode */
877 2, /* MMX or SSE register to integer */
878 /* On K8:
879 MOVD reg64, xmmreg Double FSTORE 4
880 MOVD reg32, xmmreg Double FSTORE 4
881 On AMDFAM10:
882 MOVD reg64, xmmreg Double FADD 3
883 1/1 1/1
884 MOVD reg32, xmmreg Double FADD 3
885 1/1 1/1 */
886 16, /* size of l1 cache. */
887 2048, /* size of l2 cache. */
888 64, /* size of prefetch block */
889 /* New AMD processors never drop prefetches; if they cannot be performed
890 immediately, they are queued. We set the number of simultaneous prefetches
891 to a large constant to reflect this (it is probably not a good idea to leave
892 the number of prefetches completely unlimited, as their execution also takes
893 some time). */
894 100, /* number of parallel prefetches */
895 2, /* Branch cost */
896 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
897 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
898 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
899 COSTS_N_INSNS (2), /* cost of FABS instruction. */
900 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
901 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
903 /* BDVER1 has an optimized REP instruction for medium-sized blocks, but for
904 very small blocks it is better to use a loop. For large blocks, a libcall
905 can do nontemporal accesses and beat inline code considerably. */
906 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
907 {-1, rep_prefix_4_byte, false}}},
908 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
909 {-1, libcall, false}}}},
910 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
911 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
912 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
913 {-1, libcall, false}}}},
914 6, /* scalar_stmt_cost. */
915 4, /* scalar load_cost. */
916 4, /* scalar_store_cost. */
917 6, /* vec_stmt_cost. */
918 0, /* vec_to_scalar_cost. */
919 2, /* scalar_to_vec_cost. */
920 4, /* vec_align_load_cost. */
921 4, /* vec_unalign_load_cost. */
922 4, /* vec_store_cost. */
923 2, /* cond_taken_branch_cost. */
924 1, /* cond_not_taken_branch_cost. */
925 };
927 struct processor_costs bdver2_cost = {
928 COSTS_N_INSNS (1), /* cost of an add instruction */
929 COSTS_N_INSNS (1), /* cost of a lea instruction */
930 COSTS_N_INSNS (1), /* variable shift costs */
931 COSTS_N_INSNS (1), /* constant shift costs */
932 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
933 COSTS_N_INSNS (4), /* HI */
934 COSTS_N_INSNS (4), /* SI */
935 COSTS_N_INSNS (6), /* DI */
936 COSTS_N_INSNS (6)}, /* other */
937 0, /* cost of multiply per each bit set */
938 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
939 COSTS_N_INSNS (35), /* HI */
940 COSTS_N_INSNS (51), /* SI */
941 COSTS_N_INSNS (83), /* DI */
942 COSTS_N_INSNS (83)}, /* other */
943 COSTS_N_INSNS (1), /* cost of movsx */
944 COSTS_N_INSNS (1), /* cost of movzx */
945 8, /* "large" insn */
946 9, /* MOVE_RATIO */
947 4, /* cost for loading QImode using movzbl */
948 {5, 5, 4}, /* cost of loading integer registers
949 in QImode, HImode and SImode.
950 Relative to reg-reg move (2). */
951 {4, 4, 4}, /* cost of storing integer registers */
952 2, /* cost of reg,reg fld/fst */
953 {5, 5, 12}, /* cost of loading fp registers
954 in SFmode, DFmode and XFmode */
955 {4, 4, 8}, /* cost of storing fp registers
956 in SFmode, DFmode and XFmode */
957 2, /* cost of moving MMX register */
958 {4, 4}, /* cost of loading MMX registers
959 in SImode and DImode */
960 {4, 4}, /* cost of storing MMX registers
961 in SImode and DImode */
962 2, /* cost of moving SSE register */
963 {4, 4, 4}, /* cost of loading SSE registers
964 in SImode, DImode and TImode */
965 {4, 4, 4}, /* cost of storing SSE registers
966 in SImode, DImode and TImode */
967 2, /* MMX or SSE register to integer */
968 /* On K8:
969 MOVD reg64, xmmreg Double FSTORE 4
970 MOVD reg32, xmmreg Double FSTORE 4
971 On AMDFAM10:
972 MOVD reg64, xmmreg Double FADD 3
973 1/1 1/1
974 MOVD reg32, xmmreg Double FADD 3
975 1/1 1/1 */
976 16, /* size of l1 cache. */
977 2048, /* size of l2 cache. */
978 64, /* size of prefetch block */
979 /* New AMD processors never drop prefetches; if they cannot be performed
980 immediately, they are queued. We set the number of simultaneous prefetches
981 to a large constant to reflect this (it is probably not a good idea to leave
982 the number of prefetches completely unlimited, as their execution also takes
983 some time). */
984 100, /* number of parallel prefetches */
985 2, /* Branch cost */
986 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
987 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
988 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
989 COSTS_N_INSNS (2), /* cost of FABS instruction. */
990 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
991 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
993 /* BDVER2 has an optimized REP instruction for medium-sized blocks, but for
994 very small blocks it is better to use a loop. For large blocks, a libcall
995 can do nontemporal accesses and beat inline code considerably. */
996 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
997 {-1, rep_prefix_4_byte, false}}},
998 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
999 {-1, libcall, false}}}},
1000 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1001 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1002 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1003 {-1, libcall, false}}}},
1004 6, /* scalar_stmt_cost. */
1005 4, /* scalar load_cost. */
1006 4, /* scalar_store_cost. */
1007 6, /* vec_stmt_cost. */
1008 0, /* vec_to_scalar_cost. */
1009 2, /* scalar_to_vec_cost. */
1010 4, /* vec_align_load_cost. */
1011 4, /* vec_unalign_load_cost. */
1012 4, /* vec_store_cost. */
1013 2, /* cond_taken_branch_cost. */
1014 1, /* cond_not_taken_branch_cost. */
1015 };
1017 struct processor_costs bdver3_cost = {
1018 COSTS_N_INSNS (1), /* cost of an add instruction */
1019 COSTS_N_INSNS (1), /* cost of a lea instruction */
1020 COSTS_N_INSNS (1), /* variable shift costs */
1021 COSTS_N_INSNS (1), /* constant shift costs */
1022 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1023 COSTS_N_INSNS (4), /* HI */
1024 COSTS_N_INSNS (4), /* SI */
1025 COSTS_N_INSNS (6), /* DI */
1026 COSTS_N_INSNS (6)}, /* other */
1027 0, /* cost of multiply per each bit set */
1028 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1029 COSTS_N_INSNS (35), /* HI */
1030 COSTS_N_INSNS (51), /* SI */
1031 COSTS_N_INSNS (83), /* DI */
1032 COSTS_N_INSNS (83)}, /* other */
1033 COSTS_N_INSNS (1), /* cost of movsx */
1034 COSTS_N_INSNS (1), /* cost of movzx */
1035 8, /* "large" insn */
1036 9, /* MOVE_RATIO */
1037 4, /* cost for loading QImode using movzbl */
1038 {5, 5, 4}, /* cost of loading integer registers
1039 in QImode, HImode and SImode.
1040 Relative to reg-reg move (2). */
1041 {4, 4, 4}, /* cost of storing integer registers */
1042 2, /* cost of reg,reg fld/fst */
1043 {5, 5, 12}, /* cost of loading fp registers
1044 in SFmode, DFmode and XFmode */
1045 {4, 4, 8}, /* cost of storing fp registers
1046 in SFmode, DFmode and XFmode */
1047 2, /* cost of moving MMX register */
1048 {4, 4}, /* cost of loading MMX registers
1049 in SImode and DImode */
1050 {4, 4}, /* cost of storing MMX registers
1051 in SImode and DImode */
1052 2, /* cost of moving SSE register */
1053 {4, 4, 4}, /* cost of loading SSE registers
1054 in SImode, DImode and TImode */
1055 {4, 4, 4}, /* cost of storing SSE registers
1056 in SImode, DImode and TImode */
1057 2, /* MMX or SSE register to integer */
1058 16, /* size of l1 cache. */
1059 2048, /* size of l2 cache. */
1060 64, /* size of prefetch block */
1061 /* New AMD processors never drop prefetches; if they cannot be performed
1062 immediately, they are queued. We set the number of simultaneous prefetches
1063 to a large constant to reflect this (it is probably not a good idea to leave
1064 the number of prefetches completely unlimited, as their execution also takes
1065 some time). */
1066 100, /* number of parallel prefetches */
1067 2, /* Branch cost */
1068 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1069 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1070 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1071 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1072 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1073 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1075 /* BDVER3 has an optimized REP instruction for medium-sized blocks, but for
1076 very small blocks it is better to use a loop. For large blocks, a libcall
1077 can do nontemporal accesses and beat inline code considerably. */
1078 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1079 {-1, rep_prefix_4_byte, false}}},
1080 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1081 {-1, libcall, false}}}},
1082 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1083 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1084 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1085 {-1, libcall, false}}}},
1086 6, /* scalar_stmt_cost. */
1087 4, /* scalar load_cost. */
1088 4, /* scalar_store_cost. */
1089 6, /* vec_stmt_cost. */
1090 0, /* vec_to_scalar_cost. */
1091 2, /* scalar_to_vec_cost. */
1092 4, /* vec_align_load_cost. */
1093 4, /* vec_unalign_load_cost. */
1094 4, /* vec_store_cost. */
1095 2, /* cond_taken_branch_cost. */
1096 1, /* cond_not_taken_branch_cost. */
1097 };
1099 struct processor_costs btver1_cost = {
1100 COSTS_N_INSNS (1), /* cost of an add instruction */
1101 COSTS_N_INSNS (2), /* cost of a lea instruction */
1102 COSTS_N_INSNS (1), /* variable shift costs */
1103 COSTS_N_INSNS (1), /* constant shift costs */
1104 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1105 COSTS_N_INSNS (4), /* HI */
1106 COSTS_N_INSNS (3), /* SI */
1107 COSTS_N_INSNS (4), /* DI */
1108 COSTS_N_INSNS (5)}, /* other */
1109 0, /* cost of multiply per each bit set */
1110 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1111 COSTS_N_INSNS (35), /* HI */
1112 COSTS_N_INSNS (51), /* SI */
1113 COSTS_N_INSNS (83), /* DI */
1114 COSTS_N_INSNS (83)}, /* other */
1115 COSTS_N_INSNS (1), /* cost of movsx */
1116 COSTS_N_INSNS (1), /* cost of movzx */
1117 8, /* "large" insn */
1118 9, /* MOVE_RATIO */
1119 4, /* cost for loading QImode using movzbl */
1120 {3, 4, 3}, /* cost of loading integer registers
1121 in QImode, HImode and SImode.
1122 Relative to reg-reg move (2). */
1123 {3, 4, 3}, /* cost of storing integer registers */
1124 4, /* cost of reg,reg fld/fst */
1125 {4, 4, 12}, /* cost of loading fp registers
1126 in SFmode, DFmode and XFmode */
1127 {6, 6, 8}, /* cost of storing fp registers
1128 in SFmode, DFmode and XFmode */
1129 2, /* cost of moving MMX register */
1130 {3, 3}, /* cost of loading MMX registers
1131 in SImode and DImode */
1132 {4, 4}, /* cost of storing MMX registers
1133 in SImode and DImode */
1134 2, /* cost of moving SSE register */
1135 {4, 4, 3}, /* cost of loading SSE registers
1136 in SImode, DImode and TImode */
1137 {4, 4, 5}, /* cost of storing SSE registers
1138 in SImode, DImode and TImode */
1139 3, /* MMX or SSE register to integer */
1140 /* On K8:
1141 MOVD reg64, xmmreg Double FSTORE 4
1142 MOVD reg32, xmmreg Double FSTORE 4
1143 On AMDFAM10:
1144 MOVD reg64, xmmreg Double FADD 3
1145 1/1 1/1
1146 MOVD reg32, xmmreg Double FADD 3
1147 1/1 1/1 */
1148 32, /* size of l1 cache. */
1149 512, /* size of l2 cache. */
1150 64, /* size of prefetch block */
1151 100, /* number of parallel prefetches */
1152 2, /* Branch cost */
1153 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1154 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1155 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1156 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1157 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1158 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1160 /* BTVER1 has an optimized REP instruction for medium-sized blocks, but for
1161 very small blocks it is better to use a loop. For large blocks, a libcall can
1162 do nontemporal accesses and beat inline code considerably. */
1163 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1164 {-1, rep_prefix_4_byte, false}}},
1165 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1166 {-1, libcall, false}}}},
1167 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1168 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1169 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1170 {-1, libcall, false}}}},
1171 4, /* scalar_stmt_cost. */
1172 2, /* scalar load_cost. */
1173 2, /* scalar_store_cost. */
1174 6, /* vec_stmt_cost. */
1175 0, /* vec_to_scalar_cost. */
1176 2, /* scalar_to_vec_cost. */
1177 2, /* vec_align_load_cost. */
1178 2, /* vec_unalign_load_cost. */
1179 2, /* vec_store_cost. */
1180 2, /* cond_taken_branch_cost. */
1181 1, /* cond_not_taken_branch_cost. */
1182 };
1184 struct processor_costs btver2_cost = {
1185 COSTS_N_INSNS (1), /* cost of an add instruction */
1186 COSTS_N_INSNS (2), /* cost of a lea instruction */
1187 COSTS_N_INSNS (1), /* variable shift costs */
1188 COSTS_N_INSNS (1), /* constant shift costs */
1189 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1190 COSTS_N_INSNS (4), /* HI */
1191 COSTS_N_INSNS (3), /* SI */
1192 COSTS_N_INSNS (4), /* DI */
1193 COSTS_N_INSNS (5)}, /* other */
1194 0, /* cost of multiply per each bit set */
1195 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1196 COSTS_N_INSNS (35), /* HI */
1197 COSTS_N_INSNS (51), /* SI */
1198 COSTS_N_INSNS (83), /* DI */
1199 COSTS_N_INSNS (83)}, /* other */
1200 COSTS_N_INSNS (1), /* cost of movsx */
1201 COSTS_N_INSNS (1), /* cost of movzx */
1202 8, /* "large" insn */
1203 9, /* MOVE_RATIO */
1204 4, /* cost for loading QImode using movzbl */
1205 {3, 4, 3}, /* cost of loading integer registers
1206 in QImode, HImode and SImode.
1207 Relative to reg-reg move (2). */
1208 {3, 4, 3}, /* cost of storing integer registers */
1209 4, /* cost of reg,reg fld/fst */
1210 {4, 4, 12}, /* cost of loading fp registers
1211 in SFmode, DFmode and XFmode */
1212 {6, 6, 8}, /* cost of storing fp registers
1213 in SFmode, DFmode and XFmode */
1214 2, /* cost of moving MMX register */
1215 {3, 3}, /* cost of loading MMX registers
1216 in SImode and DImode */
1217 {4, 4}, /* cost of storing MMX registers
1218 in SImode and DImode */
1219 2, /* cost of moving SSE register */
1220 {4, 4, 3}, /* cost of loading SSE registers
1221 in SImode, DImode and TImode */
1222 {4, 4, 5}, /* cost of storing SSE registers
1223 in SImode, DImode and TImode */
1224 3, /* MMX or SSE register to integer */
1225 /* On K8:
1226 MOVD reg64, xmmreg Double FSTORE 4
1227 MOVD reg32, xmmreg Double FSTORE 4
1228 On AMDFAM10:
1229 MOVD reg64, xmmreg Double FADD 3
1230 1/1 1/1
1231 MOVD reg32, xmmreg Double FADD 3
1232 1/1 1/1 */
1233 32, /* size of l1 cache. */
1234 2048, /* size of l2 cache. */
1235 64, /* size of prefetch block */
1236 100, /* number of parallel prefetches */
1237 2, /* Branch cost */
1238 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1239 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1240 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1241 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1242 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1243 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1245 {{libcall, {{6, loop, false}, {14, unrolled_loop, false},
1246 {-1, rep_prefix_4_byte, false}}},
1247 {libcall, {{16, loop, false}, {8192, rep_prefix_8_byte, false},
1248 {-1, libcall, false}}}},
1249 {{libcall, {{8, loop, false}, {24, unrolled_loop, false},
1250 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1251 {libcall, {{48, unrolled_loop, false}, {8192, rep_prefix_8_byte, false},
1252 {-1, libcall, false}}}},
1253 4, /* scalar_stmt_cost. */
1254 2, /* scalar load_cost. */
1255 2, /* scalar_store_cost. */
1256 6, /* vec_stmt_cost. */
1257 0, /* vec_to_scalar_cost. */
1258 2, /* scalar_to_vec_cost. */
1259 2, /* vec_align_load_cost. */
1260 2, /* vec_unalign_load_cost. */
1261 2, /* vec_store_cost. */
1262 2, /* cond_taken_branch_cost. */
1263 1, /* cond_not_taken_branch_cost. */
1264 };
1266 static const
1267 struct processor_costs pentium4_cost = {
1268 COSTS_N_INSNS (1), /* cost of an add instruction */
1269 COSTS_N_INSNS (3), /* cost of a lea instruction */
1270 COSTS_N_INSNS (4), /* variable shift costs */
1271 COSTS_N_INSNS (4), /* constant shift costs */
1272 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1273 COSTS_N_INSNS (15), /* HI */
1274 COSTS_N_INSNS (15), /* SI */
1275 COSTS_N_INSNS (15), /* DI */
1276 COSTS_N_INSNS (15)}, /* other */
1277 0, /* cost of multiply per each bit set */
1278 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1279 COSTS_N_INSNS (56), /* HI */
1280 COSTS_N_INSNS (56), /* SI */
1281 COSTS_N_INSNS (56), /* DI */
1282 COSTS_N_INSNS (56)}, /* other */
1283 COSTS_N_INSNS (1), /* cost of movsx */
1284 COSTS_N_INSNS (1), /* cost of movzx */
1285 16, /* "large" insn */
1286 6, /* MOVE_RATIO */
1287 2, /* cost for loading QImode using movzbl */
1288 {4, 5, 4}, /* cost of loading integer registers
1289 in QImode, HImode and SImode.
1290 Relative to reg-reg move (2). */
1291 {2, 3, 2}, /* cost of storing integer registers */
1292 2, /* cost of reg,reg fld/fst */
1293 {2, 2, 6}, /* cost of loading fp registers
1294 in SFmode, DFmode and XFmode */
1295 {4, 4, 6}, /* cost of storing fp registers
1296 in SFmode, DFmode and XFmode */
1297 2, /* cost of moving MMX register */
1298 {2, 2}, /* cost of loading MMX registers
1299 in SImode and DImode */
1300 {2, 2}, /* cost of storing MMX registers
1301 in SImode and DImode */
1302 12, /* cost of moving SSE register */
1303 {12, 12, 12}, /* cost of loading SSE registers
1304 in SImode, DImode and TImode */
1305 {2, 2, 8}, /* cost of storing SSE registers
1306 in SImode, DImode and TImode */
1307 10, /* MMX or SSE register to integer */
1308 8, /* size of l1 cache. */
1309 256, /* size of l2 cache. */
1310 64, /* size of prefetch block */
1311 6, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1319 {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1320 DUMMY_STRINGOP_ALGS},
1321 {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
1322 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1323 DUMMY_STRINGOP_ALGS},
1324 1, /* scalar_stmt_cost. */
1325 1, /* scalar load_cost. */
1326 1, /* scalar_store_cost. */
1327 1, /* vec_stmt_cost. */
1328 1, /* vec_to_scalar_cost. */
1329 1, /* scalar_to_vec_cost. */
1330 1, /* vec_align_load_cost. */
1331 2, /* vec_unalign_load_cost. */
1332 1, /* vec_store_cost. */
1333 3, /* cond_taken_branch_cost. */
1334 1, /* cond_not_taken_branch_cost. */
1335 };
1337 static const
1338 struct processor_costs nocona_cost = {
1339 COSTS_N_INSNS (1), /* cost of an add instruction */
1340 COSTS_N_INSNS (1), /* cost of a lea instruction */
1341 COSTS_N_INSNS (1), /* variable shift costs */
1342 COSTS_N_INSNS (1), /* constant shift costs */
1343 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1344 COSTS_N_INSNS (10), /* HI */
1345 COSTS_N_INSNS (10), /* SI */
1346 COSTS_N_INSNS (10), /* DI */
1347 COSTS_N_INSNS (10)}, /* other */
1348 0, /* cost of multiply per each bit set */
1349 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1350 COSTS_N_INSNS (66), /* HI */
1351 COSTS_N_INSNS (66), /* SI */
1352 COSTS_N_INSNS (66), /* DI */
1353 COSTS_N_INSNS (66)}, /* other */
1354 COSTS_N_INSNS (1), /* cost of movsx */
1355 COSTS_N_INSNS (1), /* cost of movzx */
1356 16, /* "large" insn */
1357 17, /* MOVE_RATIO */
1358 4, /* cost for loading QImode using movzbl */
1359 {4, 4, 4}, /* cost of loading integer registers
1360 in QImode, HImode and SImode.
1361 Relative to reg-reg move (2). */
1362 {4, 4, 4}, /* cost of storing integer registers */
1363 3, /* cost of reg,reg fld/fst */
1364 {12, 12, 12}, /* cost of loading fp registers
1365 in SFmode, DFmode and XFmode */
1366 {4, 4, 4}, /* cost of storing fp registers
1367 in SFmode, DFmode and XFmode */
1368 6, /* cost of moving MMX register */
1369 {12, 12}, /* cost of loading MMX registers
1370 in SImode and DImode */
1371 {12, 12}, /* cost of storing MMX registers
1372 in SImode and DImode */
1373 6, /* cost of moving SSE register */
1374 {12, 12, 12}, /* cost of loading SSE registers
1375 in SImode, DImode and TImode */
1376 {12, 12, 12}, /* cost of storing SSE registers
1377 in SImode, DImode and TImode */
1378 8, /* MMX or SSE register to integer */
1379 8, /* size of l1 cache. */
1380 1024, /* size of l2 cache. */
1381 64, /* size of prefetch block */
1382 8, /* number of parallel prefetches */
1383 1, /* Branch cost */
1384 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1385 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1386 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1387 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1388 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1389 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1390 {{libcall, {{12, loop_1_byte, false}, {-1, rep_prefix_4_byte, false}}},
1391 {libcall, {{32, loop, false}, {20000, rep_prefix_8_byte, false},
1392 {100000, unrolled_loop, false}, {-1, libcall, false}}}},
1393 {{libcall, {{6, loop_1_byte, false}, {48, loop, false},
1394 {20480, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1395 {libcall, {{24, loop, false}, {64, unrolled_loop, false},
1396 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1397 1, /* scalar_stmt_cost. */
1398 1, /* scalar load_cost. */
1399 1, /* scalar_store_cost. */
1400 1, /* vec_stmt_cost. */
1401 1, /* vec_to_scalar_cost. */
1402 1, /* scalar_to_vec_cost. */
1403 1, /* vec_align_load_cost. */
1404 2, /* vec_unalign_load_cost. */
1405 1, /* vec_store_cost. */
1406 3, /* cond_taken_branch_cost. */
1407 1, /* cond_not_taken_branch_cost. */
1408 };
1410 static const
1411 struct processor_costs atom_cost = {
1412 COSTS_N_INSNS (1), /* cost of an add instruction */
1413 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1414 COSTS_N_INSNS (1), /* variable shift costs */
1415 COSTS_N_INSNS (1), /* constant shift costs */
1416 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1417 COSTS_N_INSNS (4), /* HI */
1418 COSTS_N_INSNS (3), /* SI */
1419 COSTS_N_INSNS (4), /* DI */
1420 COSTS_N_INSNS (2)}, /* other */
1421 0, /* cost of multiply per each bit set */
1422 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1423 COSTS_N_INSNS (26), /* HI */
1424 COSTS_N_INSNS (42), /* SI */
1425 COSTS_N_INSNS (74), /* DI */
1426 COSTS_N_INSNS (74)}, /* other */
1427 COSTS_N_INSNS (1), /* cost of movsx */
1428 COSTS_N_INSNS (1), /* cost of movzx */
1429 8, /* "large" insn */
1430 17, /* MOVE_RATIO */
1431 4, /* cost for loading QImode using movzbl */
1432 {4, 4, 4}, /* cost of loading integer registers
1433 in QImode, HImode and SImode.
1434 Relative to reg-reg move (2). */
1435 {4, 4, 4}, /* cost of storing integer registers */
1436 4, /* cost of reg,reg fld/fst */
1437 {12, 12, 12}, /* cost of loading fp registers
1438 in SFmode, DFmode and XFmode */
1439 {6, 6, 8}, /* cost of storing fp registers
1440 in SFmode, DFmode and XFmode */
1441 2, /* cost of moving MMX register */
1442 {8, 8}, /* cost of loading MMX registers
1443 in SImode and DImode */
1444 {8, 8}, /* cost of storing MMX registers
1445 in SImode and DImode */
1446 2, /* cost of moving SSE register */
1447 {8, 8, 8}, /* cost of loading SSE registers
1448 in SImode, DImode and TImode */
1449 {8, 8, 8}, /* cost of storing SSE registers
1450 in SImode, DImode and TImode */
1451 5, /* MMX or SSE register to integer */
1452 32, /* size of l1 cache. */
1453 256, /* size of l2 cache. */
1454 64, /* size of prefetch block */
1455 6, /* number of parallel prefetches */
1456 3, /* Branch cost */
1457 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1458 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1459 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1460 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1461 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1462 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1463 {{libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
1464 {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
1465 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1466 {{libcall, {{8, loop, false}, {15, unrolled_loop, false},
1467 {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
1468 {libcall, {{24, loop, false}, {32, unrolled_loop, false},
1469 {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}},
1470 1, /* scalar_stmt_cost. */
1471 1, /* scalar load_cost. */
1472 1, /* scalar_store_cost. */
1473 1, /* vec_stmt_cost. */
1474 1, /* vec_to_scalar_cost. */
1475 1, /* scalar_to_vec_cost. */
1476 1, /* vec_align_load_cost. */
1477 2, /* vec_unalign_load_cost. */
1478 1, /* vec_store_cost. */
1479 3, /* cond_taken_branch_cost. */
1480 1, /* cond_not_taken_branch_cost. */
1481 };
1483 /* Generic64 should produce code tuned for Nocona and K8. */
1484 static const
1485 struct processor_costs generic64_cost = {
1486 COSTS_N_INSNS (1), /* cost of an add instruction */
1487 /* On all chips taken into consideration, lea takes 2 cycles or more. With
1488 this cost, however, our current implementation of synth_mult results in
1489 the use of unnecessary temporary registers, causing a regression on several
1490 SPECfp benchmarks. */
1491 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1492 COSTS_N_INSNS (1), /* variable shift costs */
1493 COSTS_N_INSNS (1), /* constant shift costs */
1494 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1495 COSTS_N_INSNS (4), /* HI */
1496 COSTS_N_INSNS (3), /* SI */
1497 COSTS_N_INSNS (4), /* DI */
1498 COSTS_N_INSNS (2)}, /* other */
1499 0, /* cost of multiply per each bit set */
1500 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1501 COSTS_N_INSNS (26), /* HI */
1502 COSTS_N_INSNS (42), /* SI */
1503 COSTS_N_INSNS (74), /* DI */
1504 COSTS_N_INSNS (74)}, /* other */
1505 COSTS_N_INSNS (1), /* cost of movsx */
1506 COSTS_N_INSNS (1), /* cost of movzx */
1507 8, /* "large" insn */
1508 17, /* MOVE_RATIO */
1509 4, /* cost for loading QImode using movzbl */
1510 {4, 4, 4}, /* cost of loading integer registers
1511 in QImode, HImode and SImode.
1512 Relative to reg-reg move (2). */
1513 {4, 4, 4}, /* cost of storing integer registers */
1514 4, /* cost of reg,reg fld/fst */
1515 {12, 12, 12}, /* cost of loading fp registers
1516 in SFmode, DFmode and XFmode */
1517 {6, 6, 8}, /* cost of storing fp registers
1518 in SFmode, DFmode and XFmode */
1519 2, /* cost of moving MMX register */
1520 {8, 8}, /* cost of loading MMX registers
1521 in SImode and DImode */
1522 {8, 8}, /* cost of storing MMX registers
1523 in SImode and DImode */
1524 2, /* cost of moving SSE register */
1525 {8, 8, 8}, /* cost of loading SSE registers
1526 in SImode, DImode and TImode */
1527 {8, 8, 8}, /* cost of storing SSE registers
1528 in SImode, DImode and TImode */
1529 5, /* MMX or SSE register to integer */
1530 32, /* size of l1 cache. */
1531 512, /* size of l2 cache. */
1532 64, /* size of prefetch block */
1533 6, /* number of parallel prefetches */
1534 /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
1535 value is increased to the perhaps more appropriate value of 5. */
1536 3, /* Branch cost */
1537 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1538 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1539 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1540 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1541 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1542 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1543 {DUMMY_STRINGOP_ALGS,
1544 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1545 {-1, libcall, false}}}},
1546 {DUMMY_STRINGOP_ALGS,
1547 {libcall, {{32, loop, false}, {8192, rep_prefix_8_byte, false},
1548 {-1, libcall, false}}}},
1549 1, /* scalar_stmt_cost. */
1550 1, /* scalar load_cost. */
1551 1, /* scalar_store_cost. */
1552 1, /* vec_stmt_cost. */
1553 1, /* vec_to_scalar_cost. */
1554 1, /* scalar_to_vec_cost. */
1555 1, /* vec_align_load_cost. */
1556 2, /* vec_unalign_load_cost. */
1557 1, /* vec_store_cost. */
1558 3, /* cond_taken_branch_cost. */
1559 1, /* cond_not_taken_branch_cost. */
1560 };
1562 /* core_cost should produce code tuned for the Core family of CPUs. */
1563 static const
1564 struct processor_costs core_cost = {
1565 COSTS_N_INSNS (1), /* cost of an add instruction */
1566 /* On all chips taken into consideration, lea takes 2 cycles or more. With
1567 this cost, however, our current implementation of synth_mult results in
1568 the use of unnecessary temporary registers, causing a regression on several
1569 SPECfp benchmarks. */
1570 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1571 COSTS_N_INSNS (1), /* variable shift costs */
1572 COSTS_N_INSNS (1), /* constant shift costs */
1573 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1574 COSTS_N_INSNS (4), /* HI */
1575 COSTS_N_INSNS (3), /* SI */
1576 COSTS_N_INSNS (4), /* DI */
1577 COSTS_N_INSNS (2)}, /* other */
1578 0, /* cost of multiply per each bit set */
1579 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1580 COSTS_N_INSNS (26), /* HI */
1581 COSTS_N_INSNS (42), /* SI */
1582 COSTS_N_INSNS (74), /* DI */
1583 COSTS_N_INSNS (74)}, /* other */
1584 COSTS_N_INSNS (1), /* cost of movsx */
1585 COSTS_N_INSNS (1), /* cost of movzx */
1586 8, /* "large" insn */
1587 17, /* MOVE_RATIO */
1588 4, /* cost for loading QImode using movzbl */
1589 {4, 4, 4}, /* cost of loading integer registers
1590 in QImode, HImode and SImode.
1591 Relative to reg-reg move (2). */
1592 {4, 4, 4}, /* cost of storing integer registers */
1593 4, /* cost of reg,reg fld/fst */
1594 {12, 12, 12}, /* cost of loading fp registers
1595 in SFmode, DFmode and XFmode */
1596 {6, 6, 8}, /* cost of storing fp registers
1597 in SFmode, DFmode and XFmode */
1598 2, /* cost of moving MMX register */
1599 {8, 8}, /* cost of loading MMX registers
1600 in SImode and DImode */
1601 {8, 8}, /* cost of storing MMX registers
1602 in SImode and DImode */
1603 2, /* cost of moving SSE register */
1604 {8, 8, 8}, /* cost of loading SSE registers
1605 in SImode, DImode and TImode */
1606 {8, 8, 8}, /* cost of storing SSE registers
1607 in SImode, DImode and TImode */
1608 5, /* MMX or SSE register to integer */
1609 64, /* size of l1 cache. */
1610 512, /* size of l2 cache. */
1611 64, /* size of prefetch block */
1612 6, /* number of parallel prefetches */
1613 /* FIXME perhaps more appropriate value is 5. */
1614 3, /* Branch cost */
1615 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1616 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1617 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1618 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1619 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1620 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1621 {{libcall, {{1024, rep_prefix_4_byte, true}, {-1, libcall, false}}},
1622 {libcall, {{24, loop, true}, {128, rep_prefix_8_byte, true},
1623 {-1, libcall, false}}}},
1624 {{libcall, {{6, loop_1_byte, true},
1625 {24, loop, true},
1626 {8192, rep_prefix_4_byte, true},
1627 {-1, libcall, false}}},
1628 {libcall, {{24, loop, true}, {512, rep_prefix_8_byte, true},
1629 {-1, libcall, false}}}},
1630 1, /* scalar_stmt_cost. */
1631 1, /* scalar load_cost. */
1632 1, /* scalar_store_cost. */
1633 1, /* vec_stmt_cost. */
1634 1, /* vec_to_scalar_cost. */
1635 1, /* scalar_to_vec_cost. */
1636 1, /* vec_align_load_cost. */
1637 2, /* vec_unalign_load_cost. */
1638 1, /* vec_store_cost. */
1639 3, /* cond_taken_branch_cost. */
1640 1, /* cond_not_taken_branch_cost. */
1641 };
1643 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1644 Athlon and K8. */
1645 static const
1646 struct processor_costs generic32_cost = {
1647 COSTS_N_INSNS (1), /* cost of an add instruction */
1648 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1649 COSTS_N_INSNS (1), /* variable shift costs */
1650 COSTS_N_INSNS (1), /* constant shift costs */
1651 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1652 COSTS_N_INSNS (4), /* HI */
1653 COSTS_N_INSNS (3), /* SI */
1654 COSTS_N_INSNS (4), /* DI */
1655 COSTS_N_INSNS (2)}, /* other */
1656 0, /* cost of multiply per each bit set */
1657 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1658 COSTS_N_INSNS (26), /* HI */
1659 COSTS_N_INSNS (42), /* SI */
1660 COSTS_N_INSNS (74), /* DI */
1661 COSTS_N_INSNS (74)}, /* other */
1662 COSTS_N_INSNS (1), /* cost of movsx */
1663 COSTS_N_INSNS (1), /* cost of movzx */
1664 8, /* "large" insn */
1665 17, /* MOVE_RATIO */
1666 4, /* cost for loading QImode using movzbl */
1667 {4, 4, 4}, /* cost of loading integer registers
1668 in QImode, HImode and SImode.
1669 Relative to reg-reg move (2). */
1670 {4, 4, 4}, /* cost of storing integer registers */
1671 4, /* cost of reg,reg fld/fst */
1672 {12, 12, 12}, /* cost of loading fp registers
1673 in SFmode, DFmode and XFmode */
1674 {6, 6, 8}, /* cost of storing fp registers
1675 in SFmode, DFmode and XFmode */
1676 2, /* cost of moving MMX register */
1677 {8, 8}, /* cost of loading MMX registers
1678 in SImode and DImode */
1679 {8, 8}, /* cost of storing MMX registers
1680 in SImode and DImode */
1681 2, /* cost of moving SSE register */
1682 {8, 8, 8}, /* cost of loading SSE registers
1683 in SImode, DImode and TImode */
1684 {8, 8, 8}, /* cost of storing SSE registers
1685 in SImode, DImode and TImode */
1686 5, /* MMX or SSE register to integer */
1687 32, /* size of l1 cache. */
1688 256, /* size of l2 cache. */
1689 64, /* size of prefetch block */
1690 6, /* number of parallel prefetches */
1691 3, /* Branch cost */
1692 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1693 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1694 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1695 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1696 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1697 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1698 {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1699 {-1, libcall, false}}},
1700 DUMMY_STRINGOP_ALGS},
1701 {{libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
1702 {-1, libcall, false}}},
1703 DUMMY_STRINGOP_ALGS},
1704 1, /* scalar_stmt_cost. */
1705 1, /* scalar load_cost. */
1706 1, /* scalar_store_cost. */
1707 1, /* vec_stmt_cost. */
1708 1, /* vec_to_scalar_cost. */
1709 1, /* scalar_to_vec_cost. */
1710 1, /* vec_align_load_cost. */
1711 2, /* vec_unalign_load_cost. */
1712 1, /* vec_store_cost. */
1713 3, /* cond_taken_branch_cost. */
1714 1, /* cond_not_taken_branch_cost. */
1715 };
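/* Illustrative sketch (not part of the GCC sources): how the memcpy/memset
   descriptors in the cost tables above are read, assuming the stringop_algs
   layout declared in i386.h (a leading algorithm for unknown sizes followed
   by an array of {max, alg, noalign} entries).  Entries are scanned in
   order; the first one whose MAX is not exceeded supplies the algorithm,
   and a MAX of -1 covers every larger size.  The real chooser (decide_alg)
   also weighs -minline-all-stringops and run-time size checks.  */
static enum stringop_alg
sketch_pick_stringop_alg (const struct stringop_algs *algs,
                          HOST_WIDE_INT expected_size)
{
  unsigned int i;

  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    if (algs->size[i].max == -1 || expected_size <= algs->size[i].max)
      return algs->size[i].alg;
  return libcall;
}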
1717 /* Set by -mtune. */
1718 const struct processor_costs *ix86_tune_cost = &pentium_cost;
1720 /* Set by -mtune or -Os. */
1721 const struct processor_costs *ix86_cost = &pentium_cost;
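/* Illustrative sketch (not part of the GCC sources): the pointers above are
   what the cost hooks dereference, so a query for, say, the cost of
   starting an SImode multiply under the current tuning is just a table
   lookup (field name as assumed from the processor_costs declaration in
   i386.h, MODE_INDEX as defined near the top of this file).  */
static int
sketch_simode_mult_cost (void)
{
  return ix86_cost->mult_init[MODE_INDEX (SImode)];
}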
1723 /* Processor feature/optimization bitmasks. */
1724 #define m_386 (1<<PROCESSOR_I386)
1725 #define m_486 (1<<PROCESSOR_I486)
1726 #define m_PENT (1<<PROCESSOR_PENTIUM)
1727 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1728 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1729 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1730 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1731 #define m_CORE2 (1<<PROCESSOR_CORE2)
1732 #define m_COREI7 (1<<PROCESSOR_COREI7)
1733 #define m_HASWELL (1<<PROCESSOR_HASWELL)
1734 #define m_CORE_ALL (m_CORE2 | m_COREI7 | m_HASWELL)
1735 #define m_ATOM (1<<PROCESSOR_ATOM)
1737 #define m_GEODE (1<<PROCESSOR_GEODE)
1738 #define m_K6 (1<<PROCESSOR_K6)
1739 #define m_K6_GEODE (m_K6 | m_GEODE)
1740 #define m_K8 (1<<PROCESSOR_K8)
1741 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1742 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1743 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1744 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1745 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1746 #define m_BDVER3 (1<<PROCESSOR_BDVER3)
1747 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1748 #define m_BTVER2 (1<<PROCESSOR_BTVER2)
1749 #define m_BDVER (m_BDVER1 | m_BDVER2 | m_BDVER3)
1750 #define m_BTVER (m_BTVER1 | m_BTVER2)
1751 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER)
1753 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1754 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1756 /* Generic instruction choice should be a common subset of the supported CPUs
1757 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1758 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1760 /* Feature tests against the various tunings. */
1761 unsigned char ix86_tune_features[X86_TUNE_LAST];
1763 /* Feature tests against the various tunings used to create ix86_tune_features
1764 based on the processor mask. */
1765 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1766 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1767 negatively, so enabling it for Generic64 seems like a good code-size
1768 tradeoff. We can't enable it for 32bit generic because it does not
1769 work well with PPro based chips. */
1770 m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1772 /* X86_TUNE_PUSH_MEMORY */
1773 m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1775 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1776 m_486 | m_PENT,
1778 /* X86_TUNE_UNROLL_STRLEN */
1779 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE_ALL | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1781 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1782 on simulation results. But after P4 was made, no performance benefit
1783 was observed with branch hints; they also increase the code size.
1784 As a result, icc never generates branch hints. */
1785 0,
1787 /* X86_TUNE_DOUBLE_WITH_ADD */
1788 ~m_386,
1790 /* X86_TUNE_USE_SAHF */
1791 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC,
1793 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1794 partial dependencies. */
1795 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1797 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1798 register stalls on Generic32 compilation setting as well. However
1799 in current implementation the partial register stalls are not eliminated
1800 very well - they can be introduced via subregs synthesized by combine
1801 and can happen in caller/callee saving sequences. Because this option
1802 pays back little on PPro based chips and is in conflict with partial reg
1803 dependencies used by Athlon/P4 based chips, it is better to leave it off
1804 for generic32 for now. */
1805 m_PPRO,
1807 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1808 m_CORE_ALL | m_GENERIC,
1810 /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
1811 * on 16-bit immediate moves into memory on Core2 and Corei7. */
1812 m_CORE_ALL | m_GENERIC,
1814 /* X86_TUNE_USE_HIMODE_FIOP */
1815 m_386 | m_486 | m_K6_GEODE,
1817 /* X86_TUNE_USE_SIMODE_FIOP */
1818 ~(m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1820 /* X86_TUNE_USE_MOV0 */
1821 m_K6,
1823 /* X86_TUNE_USE_CLTD */
1824 ~(m_PENT | m_ATOM | m_K6),
1826 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1827 m_PENT4,
1829 /* X86_TUNE_SPLIT_LONG_MOVES */
1830 m_PPRO,
1832 /* X86_TUNE_READ_MODIFY_WRITE */
1833 ~m_PENT,
1835 /* X86_TUNE_READ_MODIFY */
1836 ~(m_PENT | m_PPRO),
1838 /* X86_TUNE_PROMOTE_QIMODE */
1839 m_386 | m_486 | m_PENT | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1841 /* X86_TUNE_FAST_PREFIX */
1842 ~(m_386 | m_486 | m_PENT),
1844 /* X86_TUNE_SINGLE_STRINGOP */
1845 m_386 | m_P4_NOCONA,
1847 /* X86_TUNE_QIMODE_MATH */
1848 ~0,
1850 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
1851 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
1852 might be considered for Generic32 if our scheme for avoiding partial
1853 stalls was more effective. */
1854 ~m_PPRO,
1856 /* X86_TUNE_PROMOTE_QI_REGS */
1857 0,
1859 /* X86_TUNE_PROMOTE_HI_REGS */
1860 m_PPRO,
1862 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
1863 over esp addition. */
1864 m_386 | m_486 | m_PENT | m_PPRO,
1866 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
1867 over esp addition. */
1868 m_PENT,
1870 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
1871 over esp subtraction. */
1872 m_386 | m_486 | m_PENT | m_K6_GEODE,
1874 /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
1875 over esp subtraction. */
1876 m_PENT | m_K6_GEODE,
1878 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
1879 for DFmode copies */
1880 ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
1882 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
1883 m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1885 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
1886 conflict here between PPro/Pentium4 based chips that treat 128bit
1887 SSE registers as single units versus K8 based chips that divide SSE
1888 registers into two 64bit halves. This knob promotes all store destinations
1889 to be 128bit to allow register renaming on 128bit SSE units, but usually
1890 results in one extra microop on 64bit SSE units. Experimental results
1891 show that disabling this option on P4 brings over a 20% SPECfp regression,
1892 while enabling it on K8 brings roughly 2.4% regression that can be partly
1893 masked by careful scheduling of moves. */
1894 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
1896 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
1897 m_COREI7 | m_HASWELL | m_AMDFAM10 | m_BDVER | m_BTVER,
1899 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
1900 m_COREI7 | m_HASWELL| m_BDVER,
1902 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
1903 m_BDVER ,
1905 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
1906 are resolved on SSE register parts instead of whole registers, so we may
1907 maintain just lower part of scalar values in proper format leaving the
1908 upper part undefined. */
1909 m_ATHLON_K8,
1911 /* X86_TUNE_SSE_TYPELESS_STORES */
1912 m_AMD_MULTIPLE,
1914 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
1915 m_PPRO | m_P4_NOCONA,
1917 /* X86_TUNE_MEMORY_MISMATCH_STALL */
1918 m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1920 /* X86_TUNE_PROLOGUE_USING_MOVE */
1921 m_PPRO | m_ATHLON_K8,
1923 /* X86_TUNE_EPILOGUE_USING_MOVE */
1924 m_PPRO | m_ATHLON_K8,
1926 /* X86_TUNE_SHIFT1 */
1927 ~m_486,
1929 /* X86_TUNE_USE_FFREEP */
1930 m_AMD_MULTIPLE,
1932 /* X86_TUNE_INTER_UNIT_MOVES */
1933 ~(m_AMD_MULTIPLE | m_GENERIC),
1935 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
1936 ~(m_AMDFAM10 | m_BDVER ),
1938 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
1939 than 4 branch instructions in the 16 byte window. */
1940 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1942 /* X86_TUNE_SCHEDULE */
1943 m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1945 /* X86_TUNE_USE_BT */
1946 m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
1948 /* X86_TUNE_USE_INCDEC */
1949 ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GENERIC),
1951 /* X86_TUNE_PAD_RETURNS */
1952 m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC,
1954 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions. */
1955 m_ATOM,
1957 /* X86_TUNE_EXT_80387_CONSTANTS */
1958 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
1960 /* X86_TUNE_AVOID_VECTOR_DECODE */
1961 m_CORE_ALL | m_K8 | m_GENERIC64,
1963 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
1964 and SImode multiply, but 386 and 486 do HImode multiply faster. */
1965 ~(m_386 | m_486),
1967 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
1968 vector path on AMD machines. */
1969 m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
1971 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
1972 machines. */
1973 m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64,
1975 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
1976 than a MOV. */
1977 m_PENT,
1979 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
1980 but one byte longer. */
1981 m_PENT,
1983 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
1984 operand that cannot be represented using a modRM byte. The XOR
1985 replacement is long decoded, so this split helps here as well. */
1986 m_K6,
1988 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
1989 from FP to FP. */
1990 m_CORE_ALL | m_AMDFAM10 | m_GENERIC,
1992 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
1993 from integer to FP. */
1994 m_AMDFAM10,
1996 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
1997 with a subsequent conditional jump instruction into a single
1998 compare-and-branch uop. */
1999 m_BDVER,
2001 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2002 will impact LEA instruction selection. */
2003 m_ATOM,
2005 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2006 instructions. */
2007 ~m_ATOM,
2009 /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2010 at -O3. For the moment, the prefetching seems badly tuned for Intel
2011 chips. */
2012 m_K6_GEODE | m_AMD_MULTIPLE,
2014 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2015 the auto-vectorizer. */
2016 m_BDVER | m_BTVER2,
2018 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2019 during reassociation of integer computation. */
2020 m_ATOM,
2022 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2023 during reassociation of fp computation. */
2024 m_ATOM | m_HASWELL,
2026 /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE
2027 regs instead of memory. */
2028 m_CORE_ALL,
2030 /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for
2031 a conditional move. */
2032 m_ATOM
2033 };
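/* Illustrative sketch (not part of the GCC sources): how the per-processor
   masks above become the boolean ix86_tune_features[] array.  Option
   processing (ix86_option_override_internal, later in this file) does
   essentially this once the tuning target is known.  */
static void
sketch_expand_tune_features (enum processor_type tune)
{
  unsigned int i;
  unsigned int mask = 1u << tune;

  for (i = 0; i < X86_TUNE_LAST; i++)
    ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & mask);
}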
2035 /* Feature tests against the various architecture variations. */
2036 unsigned char ix86_arch_features[X86_ARCH_LAST];
2038 /* Feature tests against the various architecture variations, used to create
2039 ix86_arch_features based on the processor mask. */
2040 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2041 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2042 ~(m_386 | m_486 | m_PENT | m_K6),
2044 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2045 ~m_386,
2047 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2048 ~(m_386 | m_486),
2050 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2051 ~m_386,
2053 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2054 ~m_386,
2055 };
2057 static const unsigned int x86_accumulate_outgoing_args
2058 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC;
2060 static const unsigned int x86_arch_always_fancy_math_387
2061 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2063 static const unsigned int x86_avx256_split_unaligned_load
2064 = m_COREI7 | m_GENERIC;
2066 static const unsigned int x86_avx256_split_unaligned_store
2067 = m_COREI7 | m_BDVER | m_GENERIC;
2069 /* In case the average insn count for single function invocation is
2070 lower than this constant, emit fast (but longer) prologue and
2071 epilogue code. */
2072 #define FAST_PROLOGUE_INSN_COUNT 20
2074 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2075 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2076 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2077 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2079 /* Array of the smallest class containing reg number REGNO, indexed by
2080 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2082 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2083 {
2084 /* ax, dx, cx, bx */
2085 AREG, DREG, CREG, BREG,
2086 /* si, di, bp, sp */
2087 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2088 /* FP registers */
2089 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2090 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2091 /* arg pointer */
2092 NON_Q_REGS,
2093 /* flags, fpsr, fpcr, frame */
2094 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2095 /* SSE registers */
2096 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2097 SSE_REGS, SSE_REGS,
2098 /* MMX registers */
2099 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2100 MMX_REGS, MMX_REGS,
2101 /* REX registers */
2102 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2103 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2104 /* SSE REX registers */
2105 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2106 SSE_REGS, SSE_REGS,
2107 };
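/* Illustrative example (not part of the GCC sources): i386.h wraps the
   table above roughly as
     #define REGNO_REG_CLASS(REGNO) (regclass_map[(REGNO)])
   so the smallest class containing a hard register is a plain lookup,
   e.g. AREG for %eax (regno AX_REG, assumed to be 0 here).  */
static enum reg_class
sketch_class_of_eax (void)
{
  return regclass_map[AX_REG];  /* AREG, per the table above.  */
}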
2109 /* The "default" register map used in 32bit mode. */
2111 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2112 {
2113 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2114 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2115 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2116 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2117 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2118 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2119 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2120 };
2122 /* The "default" register map used in 64bit mode. */
2124 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2125 {
2126 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2127 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2128 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2129 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2130 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2131 8,9,10,11,12,13,14,15, /* extended integer registers */
2132 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2133 };
2135 /* Define the register numbers to be used in Dwarf debugging information.
2136 The SVR4 reference port C compiler uses the following register numbers
2137 in its Dwarf output code:
2138 0 for %eax (gcc regno = 0)
2139 1 for %ecx (gcc regno = 2)
2140 2 for %edx (gcc regno = 1)
2141 3 for %ebx (gcc regno = 3)
2142 4 for %esp (gcc regno = 7)
2143 5 for %ebp (gcc regno = 6)
2144 6 for %esi (gcc regno = 4)
2145 7 for %edi (gcc regno = 5)
2146 The following three DWARF register numbers are never generated by
2147 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2148 believes these numbers have these meanings.
2149 8 for %eip (no gcc equivalent)
2150 9 for %eflags (gcc regno = 17)
2151 10 for %trapno (no gcc equivalent)
2152 It is not at all clear how we should number the FP stack registers
2153 for the x86 architecture. If the version of SDB on x86/svr4 were
2154 a bit less brain dead with respect to floating-point then we would
2155 have a precedent to follow with respect to DWARF register numbers
2156 for x86 FP registers, but the SDB on x86/svr4 is so completely
2157 broken with respect to FP registers that it is hardly worth thinking
2158 of it as something to strive for compatibility with.
2159 The version of x86/svr4 SDB I have at the moment does (partially)
2160 seem to believe that DWARF register number 11 is associated with
2161 the x86 register %st(0), but that's about all. Higher DWARF
2162 register numbers don't seem to be associated with anything in
2163 particular, and even for DWARF regno 11, SDB only seems to under-
2164 stand that it should say that a variable lives in %st(0) (when
2165 asked via an `=' command) if we said it was in DWARF regno 11,
2166 but SDB still prints garbage when asked for the value of the
2167 variable in question (via a `/' command).
2168 (Also note that the labels SDB prints for various FP stack regs
2169 when doing an `x' command are all wrong.)
2170 Note that these problems generally don't affect the native SVR4
2171 C compiler because it doesn't allow the use of -O with -g and
2172 because when it is *not* optimizing, it allocates a memory
2173 location for each floating-point variable, and the memory
2174 location is what gets described in the DWARF AT_location
2175 attribute for the variable in question.
2176 Regardless of the severe mental illness of the x86/svr4 SDB, we
2177 do something sensible here and we use the following DWARF
2178 register numbers. Note that these are all stack-top-relative
2179 numbers.
2180 11 for %st(0) (gcc regno = 8)
2181 12 for %st(1) (gcc regno = 9)
2182 13 for %st(2) (gcc regno = 10)
2183 14 for %st(3) (gcc regno = 11)
2184 15 for %st(4) (gcc regno = 12)
2185 16 for %st(5) (gcc regno = 13)
2186 17 for %st(6) (gcc regno = 14)
2187 18 for %st(7) (gcc regno = 15)
2189 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2190 {
2191 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2192 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2193 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2194 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2195 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2196 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2197 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2198 };
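/* Illustrative example (not part of the GCC sources): the table above is
   indexed by the gcc hard register number and yields the SVR4 DWARF
   register number listed in the comment, e.g. %esi is gcc regno 4
   (SI_REG, per i386.h) and maps to DWARF regno 6.  */
static int
sketch_svr4_dwarf_regno_of_esi (void)
{
  return svr4_dbx_register_map[SI_REG];  /* 6, per the table above.  */
}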
2200 /* Define parameter passing and return registers. */
2202 static int const x86_64_int_parameter_registers[6] =
2203 {
2204 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2205 };
2207 static int const x86_64_ms_abi_int_parameter_registers[4] =
2208 {
2209 CX_REG, DX_REG, R8_REG, R9_REG
2210 };
2212 static int const x86_64_int_return_registers[4] =
2213 {
2214 AX_REG, DX_REG, DI_REG, SI_REG
2215 };
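/* Illustrative example (not part of the GCC sources): the tables above
   encode the two 64-bit calling conventions.  The SysV psABI passes the
   first six integer arguments in %rdi, %rsi, %rdx, %rcx, %r8, %r9; the
   Microsoft ABI passes the first four in %rcx, %rdx, %r8, %r9; and
   integer results come back in %rax (with %rdx for a high half).  */
static int
sketch_first_int_arg_regno (enum calling_abi abi)
{
  return (abi == MS_ABI
          ? x86_64_ms_abi_int_parameter_registers[0]   /* %rcx */
          : x86_64_int_parameter_registers[0]);        /* %rdi */
}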
2217 /* Define the structure for the machine field in struct function. */
2219 struct GTY(()) stack_local_entry {
2220 unsigned short mode;
2221 unsigned short n;
2222 rtx rtl;
2223 struct stack_local_entry *next;
2226 /* Structure describing stack frame layout.
2227 Stack grows downward:
2229 [arguments]
2230 <- ARG_POINTER
2231 saved pc
2233 saved static chain if ix86_static_chain_on_stack
2235 saved frame pointer if frame_pointer_needed
2236 <- HARD_FRAME_POINTER
2237 [saved regs]
2238 <- regs_save_offset
2239 [padding0]
2241 [saved SSE regs]
2242 <- sse_regs_save_offset
2243 [padding1] |
2244 | <- FRAME_POINTER
2245 [va_arg registers] |
2247 [frame] |
2249 [padding2] | = to_allocate
2250 <- STACK_POINTER
2252 struct ix86_frame
2254 int nsseregs;
2255 int nregs;
2256 int va_arg_size;
2257 int red_zone_size;
2258 int outgoing_arguments_size;
2260 /* The offsets relative to ARG_POINTER. */
2261 HOST_WIDE_INT frame_pointer_offset;
2262 HOST_WIDE_INT hard_frame_pointer_offset;
2263 HOST_WIDE_INT stack_pointer_offset;
2264 HOST_WIDE_INT hfp_save_offset;
2265 HOST_WIDE_INT reg_save_offset;
2266 HOST_WIDE_INT sse_reg_save_offset;
2268 /* When save_regs_using_mov is set, emit prologue using
2269 move instead of push instructions. */
2270 bool save_regs_using_mov;
2273 /* Which cpu are we scheduling for. */
2274 enum attr_cpu ix86_schedule;
2276 /* Which cpu are we optimizing for. */
2277 enum processor_type ix86_tune;
2279 /* Which instruction set architecture to use. */
2280 enum processor_type ix86_arch;
2282 /* True if processor has SSE prefetch instruction. */
2283 unsigned char x86_prefetch_sse;
2285 /* -mstackrealign option */
2286 static const char ix86_force_align_arg_pointer_string[]
2287 = "force_align_arg_pointer";
2289 static rtx (*ix86_gen_leave) (void);
2290 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2291 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2292 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2293 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2294 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2295 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2296 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2297 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2298 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2299 static rtx (*ix86_gen_tls_global_dynamic_64) (rtx, rtx, rtx);
2300 static rtx (*ix86_gen_tls_local_dynamic_base_64) (rtx, rtx);
2302 /* Preferred alignment for stack boundary in bits. */
2303 unsigned int ix86_preferred_stack_boundary;
2305 /* Alignment for incoming stack boundary in bits specified at
2306 command line. */
2307 static unsigned int ix86_user_incoming_stack_boundary;
2309 /* Default alignment for incoming stack boundary in bits. */
2310 static unsigned int ix86_default_incoming_stack_boundary;
2312 /* Alignment for incoming stack boundary in bits. */
2313 unsigned int ix86_incoming_stack_boundary;
2315 /* Calling abi specific va_list type nodes. */
2316 static GTY(()) tree sysv_va_list_type_node;
2317 static GTY(()) tree ms_va_list_type_node;
2319 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2320 char internal_label_prefix[16];
2321 int internal_label_prefix_len;
2323 /* Fence to use after loop using movnt. */
2324 tree x86_mfence;
2326 /* Register class used for passing given 64bit part of the argument.
2327 These represent classes as documented by the PS ABI, with the exception
2328 of SSESF, SSEDF classes, that are basically SSE class, just gcc will
2329 use SF or DFmode move instead of DImode to avoid reformatting penalties.
2331 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2332 whenever possible (upper half does contain padding). */
2333 enum x86_64_reg_class
2335 X86_64_NO_CLASS,
2336 X86_64_INTEGER_CLASS,
2337 X86_64_INTEGERSI_CLASS,
2338 X86_64_SSE_CLASS,
2339 X86_64_SSESF_CLASS,
2340 X86_64_SSEDF_CLASS,
2341 X86_64_SSEUP_CLASS,
2342 X86_64_X87_CLASS,
2343 X86_64_X87UP_CLASS,
2344 X86_64_COMPLEX_X87_CLASS,
2345 X86_64_MEMORY_CLASS
2348 #define MAX_CLASSES 4
2350 /* Table of constants used by fldpi, fldln2, etc.... */
2351 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2352 static bool ext_80387_constants_init = 0;
2355 static struct machine_function * ix86_init_machine_status (void);
2356 static rtx ix86_function_value (const_tree, const_tree, bool);
2357 static bool ix86_function_value_regno_p (const unsigned int);
2358 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2359 const_tree);
2360 static rtx ix86_static_chain (const_tree, bool);
2361 static int ix86_function_regparm (const_tree, const_tree);
2362 static void ix86_compute_frame_layout (struct ix86_frame *);
2363 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2364 rtx, rtx, int);
2365 static void ix86_add_new_builtins (HOST_WIDE_INT);
2366 static tree ix86_canonical_va_list_type (tree);
2367 static void predict_jump (int);
2368 static unsigned int split_stack_prologue_scratch_regno (void);
2369 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2371 enum ix86_function_specific_strings
2373 IX86_FUNCTION_SPECIFIC_ARCH,
2374 IX86_FUNCTION_SPECIFIC_TUNE,
2375 IX86_FUNCTION_SPECIFIC_MAX
2378 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2379 const char *, enum fpmath_unit, bool);
2380 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2381 static void ix86_function_specific_save (struct cl_target_option *);
2382 static void ix86_function_specific_restore (struct cl_target_option *);
2383 static void ix86_function_specific_print (FILE *, int,
2384 struct cl_target_option *);
2385 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2386 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2387 struct gcc_options *);
2388 static bool ix86_can_inline_p (tree, tree);
2389 static void ix86_set_current_function (tree);
2390 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2392 static enum calling_abi ix86_function_abi (const_tree);
2395 #ifndef SUBTARGET32_DEFAULT_CPU
2396 #define SUBTARGET32_DEFAULT_CPU "i386"
2397 #endif
2399 /* Whether -mtune= or -march= were specified */
2400 static int ix86_tune_defaulted;
2401 static int ix86_arch_specified;
2403 /* Vectorization library interface and handlers. */
2404 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2406 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2407 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2409 /* Processor target table, indexed by processor number */
2410 struct ptt
2411 {
2412 const char *const name; /* processor name */
2413 const struct processor_costs *cost; /* Processor costs */
2414 const int align_loop; /* Default alignments. */
2415 const int align_loop_max_skip;
2416 const int align_jump;
2417 const int align_jump_max_skip;
2418 const int align_func;
2419 };
2421 /* This table must be in sync with enum processor_type in i386.h. */
2422 static const struct ptt processor_target_table[PROCESSOR_max] =
2423 {
2424 {"generic", &generic32_cost, 16, 7, 16, 7, 16},
2425 {"generic", &generic64_cost, 16, 10, 16, 10, 16},
2426 {"i386", &i386_cost, 4, 3, 4, 3, 4},
2427 {"i486", &i486_cost, 16, 15, 16, 15, 16},
2428 {"pentium", &pentium_cost, 16, 7, 16, 7, 16},
2429 {"pentiumpro", &pentiumpro_cost, 16, 15, 16, 10, 16},
2430 {"pentium4", &pentium4_cost, 0, 0, 0, 0, 0},
2431 {"nocona", &nocona_cost, 0, 0, 0, 0, 0},
2432 {"core2", &core_cost, 16, 10, 16, 10, 16},
2433 {"corei7", &core_cost, 16, 10, 16, 10, 16},
2434 {"core-avx2", &core_cost, 16, 10, 16, 10, 16},
2435 {"atom", &atom_cost, 16, 15, 16, 7, 16},
2436 {"geode", &geode_cost, 0, 0, 0, 0, 0},
2437 {"k6", &k6_cost, 32, 7, 32, 7, 32},
2438 {"athlon", &athlon_cost, 16, 7, 16, 7, 16},
2439 {"k8", &k8_cost, 16, 7, 16, 7, 16},
2440 {"amdfam10", &amdfam10_cost, 32, 24, 32, 7, 32},
2441 {"bdver1", &bdver1_cost, 16, 10, 16, 7, 11},
2442 {"bdver2", &bdver2_cost, 16, 10, 16, 7, 11},
2443 {"bdver3", &bdver3_cost, 16, 10, 16, 7, 11},
2444 {"btver1", &btver1_cost, 16, 10, 16, 7, 11},
2445 {"btver2", &btver2_cost, 16, 10, 16, 7, 11}
2448 static bool
2449 gate_insert_vzeroupper (void)
2451 return TARGET_VZEROUPPER;
2454 static unsigned int
2455 rest_of_handle_insert_vzeroupper (void)
2457 int i;
2459 /* vzeroupper instructions are inserted immediately after reload to
2460 account for possible spills from 256bit registers. The pass
2461 reuses mode switching infrastructure by re-running mode insertion
2462 pass, so disable entities that have already been processed. */
2463 for (i = 0; i < MAX_386_ENTITIES; i++)
2464 ix86_optimize_mode_switching[i] = 0;
2466 ix86_optimize_mode_switching[AVX_U128] = 1;
2468 /* Call optimize_mode_switching. */
2469 pass_mode_switching.pass.execute ();
2470 return 0;
2473 struct rtl_opt_pass pass_insert_vzeroupper =
2476 RTL_PASS,
2477 "vzeroupper", /* name */
2478 OPTGROUP_NONE, /* optinfo_flags */
2479 gate_insert_vzeroupper, /* gate */
2480 rest_of_handle_insert_vzeroupper, /* execute */
2481 NULL, /* sub */
2482 NULL, /* next */
2483 0, /* static_pass_number */
2484 TV_NONE, /* tv_id */
2485 0, /* properties_required */
2486 0, /* properties_provided */
2487 0, /* properties_destroyed */
2488 0, /* todo_flags_start */
2489 TODO_df_finish | TODO_verify_rtl_sharing |
2490 0, /* todo_flags_finish */
2494 /* Return true if a red-zone is in use. */
2496 static inline bool
2497 ix86_using_red_zone (void)
2499 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2502 /* Return a string that documents the current -m options. The caller is
2503 responsible for freeing the string. */
2505 static char *
2506 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2507 const char *tune, enum fpmath_unit fpmath,
2508 bool add_nl_p)
2510 struct ix86_target_opts
2512 const char *option; /* option string */
2513 HOST_WIDE_INT mask; /* isa mask options */
2516 /* This table is ordered so that options like -msse4.2 that imply
2517 other options are matched first. */
2518 static struct ix86_target_opts isa_opts[] =
2520 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2521 { "-mfma", OPTION_MASK_ISA_FMA },
2522 { "-mxop", OPTION_MASK_ISA_XOP },
2523 { "-mlwp", OPTION_MASK_ISA_LWP },
2524 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2525 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2526 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2527 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2528 { "-msse3", OPTION_MASK_ISA_SSE3 },
2529 { "-msse2", OPTION_MASK_ISA_SSE2 },
2530 { "-msse", OPTION_MASK_ISA_SSE },
2531 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2532 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2533 { "-mmmx", OPTION_MASK_ISA_MMX },
2534 { "-mabm", OPTION_MASK_ISA_ABM },
2535 { "-mbmi", OPTION_MASK_ISA_BMI },
2536 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2537 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2538 { "-mhle", OPTION_MASK_ISA_HLE },
2539 { "-mfxsr", OPTION_MASK_ISA_FXSR },
2540 { "-mrdseed", OPTION_MASK_ISA_RDSEED },
2541 { "-mprfchw", OPTION_MASK_ISA_PRFCHW },
2542 { "-madx", OPTION_MASK_ISA_ADX },
2543 { "-mtbm", OPTION_MASK_ISA_TBM },
2544 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2545 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2546 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2547 { "-maes", OPTION_MASK_ISA_AES },
2548 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2549 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2550 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2551 { "-mf16c", OPTION_MASK_ISA_F16C },
2552 { "-mrtm", OPTION_MASK_ISA_RTM },
2553 { "-mxsave", OPTION_MASK_ISA_XSAVE },
2554 { "-mxsaveopt", OPTION_MASK_ISA_XSAVEOPT },
2557 /* Flag options. */
2558 static struct ix86_target_opts flag_opts[] =
2560 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2561 { "-mlong-double-64", MASK_LONG_DOUBLE_64 },
2562 { "-m80387", MASK_80387 },
2563 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2564 { "-malign-double", MASK_ALIGN_DOUBLE },
2565 { "-mcld", MASK_CLD },
2566 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2567 { "-mieee-fp", MASK_IEEE_FP },
2568 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2569 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2570 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2571 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2572 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2573 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2574 { "-mno-red-zone", MASK_NO_RED_ZONE },
2575 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2576 { "-mrecip", MASK_RECIP },
2577 { "-mrtd", MASK_RTD },
2578 { "-msseregparm", MASK_SSEREGPARM },
2579 { "-mstack-arg-probe", MASK_STACK_PROBE },
2580 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2581 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2582 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2583 { "-mvzeroupper", MASK_VZEROUPPER },
2584 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2585 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2586 { "-mprefer-avx128", MASK_PREFER_AVX128},
2589 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2591 char isa_other[40];
2592 char target_other[40];
2593 unsigned num = 0;
2594 unsigned i, j;
2595 char *ret;
2596 char *ptr;
2597 size_t len;
2598 size_t line_len;
2599 size_t sep_len;
2600 const char *abi;
2602 memset (opts, '\0', sizeof (opts));
2604 /* Add -march= option. */
2605 if (arch)
2607 opts[num][0] = "-march=";
2608 opts[num++][1] = arch;
2611 /* Add -mtune= option. */
2612 if (tune)
2614 opts[num][0] = "-mtune=";
2615 opts[num++][1] = tune;
2618 /* Add -m32/-m64/-mx32. */
2619 if ((isa & OPTION_MASK_ISA_64BIT) != 0)
2621 if ((isa & OPTION_MASK_ABI_64) != 0)
2622 abi = "-m64";
2623 else
2624 abi = "-mx32";
2625 isa &= ~ (OPTION_MASK_ISA_64BIT
2626 | OPTION_MASK_ABI_64
2627 | OPTION_MASK_ABI_X32);
2629 else
2630 abi = "-m32";
2631 opts[num++][0] = abi;
2633 /* Pick out the options in isa options. */
2634 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2636 if ((isa & isa_opts[i].mask) != 0)
2638 opts[num++][0] = isa_opts[i].option;
2639 isa &= ~ isa_opts[i].mask;
2643 if (isa && add_nl_p)
2645 opts[num++][0] = isa_other;
2646 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2647 isa);
2650 /* Add flag options. */
2651 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2653 if ((flags & flag_opts[i].mask) != 0)
2655 opts[num++][0] = flag_opts[i].option;
2656 flags &= ~ flag_opts[i].mask;
2660 if (flags && add_nl_p)
2662 opts[num++][0] = target_other;
2663 sprintf (target_other, "(other flags: %#x)", flags);
2666 /* Add -fpmath= option. */
2667 if (fpmath)
2669 opts[num][0] = "-mfpmath=";
2670 switch ((int) fpmath)
2672 case FPMATH_387:
2673 opts[num++][1] = "387";
2674 break;
2676 case FPMATH_SSE:
2677 opts[num++][1] = "sse";
2678 break;
2680 case FPMATH_387 | FPMATH_SSE:
2681 opts[num++][1] = "sse+387";
2682 break;
2684 default:
2685 gcc_unreachable ();
2689 /* Any options? */
2690 if (num == 0)
2691 return NULL;
2693 gcc_assert (num < ARRAY_SIZE (opts));
2695 /* Size the string. */
2696 len = 0;
2697 sep_len = (add_nl_p) ? 3 : 1;
2698 for (i = 0; i < num; i++)
2700 len += sep_len;
2701 for (j = 0; j < 2; j++)
2702 if (opts[i][j])
2703 len += strlen (opts[i][j]);
2706 /* Build the string. */
2707 ret = ptr = (char *) xmalloc (len);
2708 line_len = 0;
2710 for (i = 0; i < num; i++)
2712 size_t len2[2];
2714 for (j = 0; j < 2; j++)
2715 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2717 if (i != 0)
2719 *ptr++ = ' ';
2720 line_len++;
2722 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2724 *ptr++ = '\\';
2725 *ptr++ = '\n';
2726 line_len = 0;
2730 for (j = 0; j < 2; j++)
2731 if (opts[i][j])
2733 memcpy (ptr, opts[i][j], len2[j]);
2734 ptr += len2[j];
2735 line_len += len2[j];
2739 *ptr = '\0';
2740 gcc_assert (ret + len >= ptr);
2742 return ret;
2745 /* Return true if profiling code should be emitted before the
2746 prologue; otherwise return false.
2747 Note: for x86 this is unfortunate when the "hotfix" prologue is used. */
2748 static bool
2749 ix86_profile_before_prologue (void)
2751 return flag_fentry != 0;
2754 /* Function that is callable from the debugger to print the current
2755 options. */
2756 void
2757 ix86_debug_options (void)
2759 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2760 ix86_arch_string, ix86_tune_string,
2761 ix86_fpmath, true);
2763 if (opts)
2765 fprintf (stderr, "%s\n\n", opts);
2766 free (opts);
2768 else
2769 fputs ("<no options>\n\n", stderr);
2771 return;
2774 /* Override various settings based on options. If MAIN_ARGS_P, the
2775 options are from the command line, otherwise they are from
2776 attributes. */
2778 static void
2779 ix86_option_override_internal (bool main_args_p)
2781 int i;
2782 unsigned int ix86_arch_mask, ix86_tune_mask;
2783 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2784 const char *prefix;
2785 const char *suffix;
2786 const char *sw;
2788 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2789 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2790 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2791 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2792 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2793 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2794 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2795 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2796 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2797 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2798 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2799 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2800 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2801 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2802 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2803 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2804 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2805 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2806 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2807 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2808 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2809 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2810 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2811 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2812 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2813 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2814 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2815 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2816 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2817 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2818 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2819 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2820 #define PTA_RTM (HOST_WIDE_INT_1 << 32)
2821 #define PTA_HLE (HOST_WIDE_INT_1 << 33)
2822 #define PTA_PRFCHW (HOST_WIDE_INT_1 << 34)
2823 #define PTA_RDSEED (HOST_WIDE_INT_1 << 35)
2824 #define PTA_ADX (HOST_WIDE_INT_1 << 36)
2825 #define PTA_FXSR (HOST_WIDE_INT_1 << 37)
2826 #define PTA_XSAVE (HOST_WIDE_INT_1 << 38)
2827 #define PTA_XSAVEOPT (HOST_WIDE_INT_1 << 39)
2829 /* if this reaches 64, need to widen struct pta flags below */
2831 static struct pta
2832 {
2833 const char *const name; /* processor name or nickname. */
2834 const enum processor_type processor;
2835 const enum attr_cpu schedule;
2836 const unsigned HOST_WIDE_INT flags;
2837 }
2838 const processor_alias_table[] =
2839 {
2840 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2841 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2842 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2843 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2844 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2845 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2846 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
2847 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
2848 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2849 PTA_MMX | PTA_SSE | PTA_FXSR},
2850 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2851 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2852 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_FXSR},
2853 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2854 PTA_MMX | PTA_SSE | PTA_FXSR},
2855 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2856 PTA_MMX | PTA_SSE | PTA_FXSR},
2857 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2858 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2859 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2860 PTA_MMX |PTA_SSE | PTA_SSE2 | PTA_FXSR},
2861 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2862 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_FXSR},
2863 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2864 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_FXSR},
2865 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2866 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2867 | PTA_CX16 | PTA_NO_SAHF | PTA_FXSR},
2868 {"core2", PROCESSOR_CORE2, CPU_CORE2,
2869 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2870 | PTA_SSSE3 | PTA_CX16 | PTA_FXSR},
2871 {"corei7", PROCESSOR_COREI7, CPU_COREI7,
2872 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
2873 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT | PTA_FXSR},
2874 {"corei7-avx", PROCESSOR_COREI7, CPU_COREI7,
2875 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2876 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2877 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL
2878 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2879 {"core-avx-i", PROCESSOR_COREI7, CPU_COREI7,
2880 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2881 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2882 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2883 | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2884 {"core-avx2", PROCESSOR_HASWELL, CPU_COREI7,
2885 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2886 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2887 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2888 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2889 | PTA_FMA | PTA_MOVBE | PTA_HLE | PTA_FXSR | PTA_XSAVE
2890 | PTA_XSAVEOPT},
2891 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2892 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2893 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE | PTA_FXSR},
2894 {"geode", PROCESSOR_GEODE, CPU_GEODE,
2895 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
2896 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
2897 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
2898 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW | PTA_PRFCHW},
2899 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
2900 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
2901 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
2902 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE | PTA_PRFCHW},
2903 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
2904 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
2905 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
2906 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
2907 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
2908 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_PRFCHW | PTA_FXSR},
2909 {"x86-64", PROCESSOR_K8, CPU_K8,
2910 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF | PTA_FXSR},
2911 {"k8", PROCESSOR_K8, CPU_K8,
2912 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2913 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
2914 {"k8-sse3", PROCESSOR_K8, CPU_K8,
2915 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2916 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
2917 {"opteron", PROCESSOR_K8, CPU_K8,
2918 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2919 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
2920 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
2921 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2922 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
2923 {"athlon64", PROCESSOR_K8, CPU_K8,
2924 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2925 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
2926 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
2927 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2928 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
2929 {"athlon-fx", PROCESSOR_K8, CPU_K8,
2930 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2931 | PTA_SSE2 | PTA_NO_SAHF | PTA_PRFCHW | PTA_FXSR},
2932 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2933 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
2934 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
2935 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
2936 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
2937 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_PRFCHW | PTA_FXSR},
2938 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
2939 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2940 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2941 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2942 | PTA_XOP | PTA_LWP | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
2943 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
2944 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2945 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2946 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2947 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
2948 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE},
2949 {"bdver3", PROCESSOR_BDVER3, CPU_BDVER3,
2950 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2951 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
2952 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
2953 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
2954 | PTA_FMA | PTA_PRFCHW | PTA_FXSR | PTA_XSAVE
2955 | PTA_XSAVEOPT},
2956 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
2957 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2958 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_PRFCHW
2959 | PTA_FXSR | PTA_XSAVE},
2960 {"btver2", PROCESSOR_BTVER2, CPU_BTVER2,
2961 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2962 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16 | PTA_SSE4_1
2963 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
2964 | PTA_BMI | PTA_F16C | PTA_MOVBE | PTA_PRFCHW
2965 | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT},
2967 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
2968 PTA_HLE /* flags are only used for -march switch. */ },
2969 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
2970 PTA_64BIT
2971 | PTA_HLE /* flags are only used for -march switch. */ },
2974 /* -mrecip options. */
2975 static struct
2977 const char *string; /* option name */
2978 unsigned int mask; /* mask bits to set */
2980 const recip_options[] =
2982 { "all", RECIP_MASK_ALL },
2983 { "none", RECIP_MASK_NONE },
2984 { "div", RECIP_MASK_DIV },
2985 { "sqrt", RECIP_MASK_SQRT },
2986 { "vec-div", RECIP_MASK_VEC_DIV },
2987 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
2990 int const pta_size = ARRAY_SIZE (processor_alias_table);
2992 /* Set up prefix/suffix so the error messages refer to either the command
2993 line argument, or the attribute(target). */
2994 if (main_args_p)
2996 prefix = "-m";
2997 suffix = "";
2998 sw = "switch";
3000 else
3002 prefix = "option(\"";
3003 suffix = "\")";
3004 sw = "attribute";
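/* Thus a diagnostic written as "%stune=k8%s" renders as "-mtune=k8" when
   triggered from the command line and as option("tune=k8") when triggered
   from attribute(target).  */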
3007 /* Turn off both OPTION_MASK_ABI_64 and OPTION_MASK_ABI_X32 if
3008 TARGET_64BIT_DEFAULT is true and TARGET_64BIT is false. */
3009 if (TARGET_64BIT_DEFAULT && !TARGET_64BIT)
3010 ix86_isa_flags &= ~(OPTION_MASK_ABI_64 | OPTION_MASK_ABI_X32);
3011 #ifdef TARGET_BI_ARCH
3012 else
3014 #if TARGET_BI_ARCH == 1
3015 /* When TARGET_BI_ARCH == 1, by default, OPTION_MASK_ABI_64
3016 is on and OPTION_MASK_ABI_X32 is off. We turn off
3017 OPTION_MASK_ABI_64 if OPTION_MASK_ABI_X32 is turned on by
3018 -mx32. */
3019 if (TARGET_X32)
3020 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3021 #else
3022 /* When TARGET_BI_ARCH == 2, by default, OPTION_MASK_ABI_X32 is
3023 on and OPTION_MASK_ABI_64 is off. We turn off
3024 OPTION_MASK_ABI_X32 if OPTION_MASK_ABI_64 is turned on by
3025 -m64. */
3026 if (TARGET_LP64)
3027 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3028 #endif
3030 #endif
3032 if (TARGET_X32)
3034 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3035 OPTION_MASK_ABI_64 for TARGET_X32. */
3036 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3037 ix86_isa_flags &= ~OPTION_MASK_ABI_64;
3039 else if (TARGET_LP64)
3041 /* Always turn on OPTION_MASK_ISA_64BIT and turn off
3042 OPTION_MASK_ABI_X32 for TARGET_LP64. */
3043 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3044 ix86_isa_flags &= ~OPTION_MASK_ABI_X32;
3047 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3048 SUBTARGET_OVERRIDE_OPTIONS;
3049 #endif
3051 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3052 SUBSUBTARGET_OVERRIDE_OPTIONS;
3053 #endif
3055 /* -fPIC is the default for x86_64. */
3056 if (TARGET_MACHO && TARGET_64BIT)
3057 flag_pic = 2;
3059 /* Need to check -mtune=generic first. */
3060 if (ix86_tune_string)
3062 if (!strcmp (ix86_tune_string, "generic")
3063 || !strcmp (ix86_tune_string, "i686")
3064 /* As special support for cross compilers we read -mtune=native
3065 as -mtune=generic. With native compilers we won't see the
3066 -mtune=native, as it was changed by the driver. */
3067 || !strcmp (ix86_tune_string, "native"))
3069 if (TARGET_64BIT)
3070 ix86_tune_string = "generic64";
3071 else
3072 ix86_tune_string = "generic32";
3074 /* If this call is for setting the option attribute, allow the
3075 generic32/generic64 that was previously set. */
3076 else if (!main_args_p
3077 && (!strcmp (ix86_tune_string, "generic32")
3078 || !strcmp (ix86_tune_string, "generic64")))
3080 else if (!strncmp (ix86_tune_string, "generic", 7))
3081 error ("bad value (%s) for %stune=%s %s",
3082 ix86_tune_string, prefix, suffix, sw);
3083 else if (!strcmp (ix86_tune_string, "x86-64"))
3084 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3085 "%stune=k8%s or %stune=generic%s instead as appropriate",
3086 prefix, suffix, prefix, suffix, prefix, suffix);
3088 else
3090 if (ix86_arch_string)
3091 ix86_tune_string = ix86_arch_string;
3092 if (!ix86_tune_string)
3094 ix86_tune_string
3095 = processor_target_table[TARGET_CPU_DEFAULT].name;
3096 ix86_tune_defaulted = 1;
3099 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3100 need to use a sensible tune option. */
3101 if (!strcmp (ix86_tune_string, "generic")
3102 || !strcmp (ix86_tune_string, "x86-64")
3103 || !strcmp (ix86_tune_string, "i686"))
3105 if (TARGET_64BIT)
3106 ix86_tune_string = "generic64";
3107 else
3108 ix86_tune_string = "generic32";
3112 if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
3114 /* rep; movq isn't available in 32-bit code. */
3115 error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
3116 ix86_stringop_alg = no_stringop;
3119 if (!ix86_arch_string)
3120 ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
3121 else
3122 ix86_arch_specified = 1;
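/* Validate -maddress-mode=: "short" (PMODE_SI) is only meaningful for 32-bit
   and x32 code, and "long" (PMODE_DI) requires a 64-bit target; otherwise the
   pointer mode is defaulted from the ABI below.  */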
3124 if (global_options_set.x_ix86_pmode)
3126 if ((TARGET_LP64 && ix86_pmode == PMODE_SI)
3127 || (!TARGET_64BIT && ix86_pmode == PMODE_DI))
3128 error ("address mode %qs not supported in the %s bit mode",
3129 TARGET_64BIT ? "short" : "long",
3130 TARGET_64BIT ? "64" : "32");
3132 else
3133 ix86_pmode = TARGET_LP64 ? PMODE_DI : PMODE_SI;
3135 if (!global_options_set.x_ix86_abi)
3136 ix86_abi = DEFAULT_ABI;
3138 if (global_options_set.x_ix86_cmodel)
3140 switch (ix86_cmodel)
3142 case CM_SMALL:
3143 case CM_SMALL_PIC:
3144 if (flag_pic)
3145 ix86_cmodel = CM_SMALL_PIC;
3146 if (!TARGET_64BIT)
3147 error ("code model %qs not supported in the %s bit mode",
3148 "small", "32");
3149 break;
3151 case CM_MEDIUM:
3152 case CM_MEDIUM_PIC:
3153 if (flag_pic)
3154 ix86_cmodel = CM_MEDIUM_PIC;
3155 if (!TARGET_64BIT)
3156 error ("code model %qs not supported in the %s bit mode",
3157 "medium", "32");
3158 else if (TARGET_X32)
3159 error ("code model %qs not supported in x32 mode",
3160 "medium");
3161 break;
3163 case CM_LARGE:
3164 case CM_LARGE_PIC:
3165 if (flag_pic)
3166 ix86_cmodel = CM_LARGE_PIC;
3167 if (!TARGET_64BIT)
3168 error ("code model %qs not supported in the %s bit mode",
3169 "large", "32");
3170 else if (TARGET_X32)
3171 error ("code model %qs not supported in x32 mode",
3172 "large");
3173 break;
3175 case CM_32:
3176 if (flag_pic)
3177 error ("code model %s does not support PIC mode", "32");
3178 if (TARGET_64BIT)
3179 error ("code model %qs not supported in the %s bit mode",
3180 "32", "64");
3181 break;
3183 case CM_KERNEL:
3184 if (flag_pic)
3186 error ("code model %s does not support PIC mode", "kernel");
3187 ix86_cmodel = CM_32;
3189 if (!TARGET_64BIT)
3190 error ("code model %qs not supported in the %s bit mode",
3191 "kernel", "32");
3192 break;
3194 default:
3195 gcc_unreachable ();
3198 else
3200 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3201 use of rip-relative addressing. This eliminates fixups that
3202 would otherwise be needed if this object is to be placed in a
3203 DLL, and is essentially just as efficient as direct addressing. */
3204 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3205 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3206 else if (TARGET_64BIT && TARGET_RDOS)
3207 ix86_cmodel = CM_MEDIUM_PIC, flag_pic = 1;
3208 else if (TARGET_64BIT)
3209 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3210 else
3211 ix86_cmodel = CM_32;
3213 if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
3215 error ("-masm=intel not supported in this configuration");
3216 ix86_asm_dialect = ASM_ATT;
3218 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3219 sorry ("%i-bit mode not compiled in",
3220 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3222 for (i = 0; i < pta_size; i++)
3223 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
3225 ix86_schedule = processor_alias_table[i].schedule;
3226 ix86_arch = processor_alias_table[i].processor;
3227 /* Default cpu tuning to the architecture. */
3228 ix86_tune = ix86_arch;
3230 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
3231 error ("CPU you selected does not support x86-64 "
3232 "instruction set");
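/* Enable each ISA extension implied by the chosen -march= entry, but only
   when the corresponding -m<isa>/-mno-<isa> option was not given explicitly
   on the command line (ix86_isa_flags_explicit records the explicit ones).  */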
3234 if (processor_alias_table[i].flags & PTA_MMX
3235 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3236 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3237 if (processor_alias_table[i].flags & PTA_3DNOW
3238 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3239 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3240 if (processor_alias_table[i].flags & PTA_3DNOW_A
3241 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3242 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3243 if (processor_alias_table[i].flags & PTA_SSE
3244 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3245 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3246 if (processor_alias_table[i].flags & PTA_SSE2
3247 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3248 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3249 if (processor_alias_table[i].flags & PTA_SSE3
3250 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3251 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3252 if (processor_alias_table[i].flags & PTA_SSSE3
3253 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3254 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3255 if (processor_alias_table[i].flags & PTA_SSE4_1
3256 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3257 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3258 if (processor_alias_table[i].flags & PTA_SSE4_2
3259 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3260 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3261 if (processor_alias_table[i].flags & PTA_AVX
3262 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3263 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3264 if (processor_alias_table[i].flags & PTA_AVX2
3265 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX2))
3266 ix86_isa_flags |= OPTION_MASK_ISA_AVX2;
3267 if (processor_alias_table[i].flags & PTA_FMA
3268 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3269 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3270 if (processor_alias_table[i].flags & PTA_SSE4A
3271 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3272 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3273 if (processor_alias_table[i].flags & PTA_FMA4
3274 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3275 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3276 if (processor_alias_table[i].flags & PTA_XOP
3277 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3278 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3279 if (processor_alias_table[i].flags & PTA_LWP
3280 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3281 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3282 if (processor_alias_table[i].flags & PTA_ABM
3283 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3284 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3285 if (processor_alias_table[i].flags & PTA_BMI
3286 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3287 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3288 if (processor_alias_table[i].flags & (PTA_LZCNT | PTA_ABM)
3289 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LZCNT))
3290 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT;
3291 if (processor_alias_table[i].flags & PTA_TBM
3292 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3293 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3294 if (processor_alias_table[i].flags & PTA_BMI2
3295 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI2))
3296 ix86_isa_flags |= OPTION_MASK_ISA_BMI2;
3297 if (processor_alias_table[i].flags & PTA_CX16
3298 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3299 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3300 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3301 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3302 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3303 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3304 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3305 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3306 if (processor_alias_table[i].flags & PTA_MOVBE
3307 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3308 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3309 if (processor_alias_table[i].flags & PTA_AES
3310 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3311 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3312 if (processor_alias_table[i].flags & PTA_PCLMUL
3313 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3314 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3315 if (processor_alias_table[i].flags & PTA_FSGSBASE
3316 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3317 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3318 if (processor_alias_table[i].flags & PTA_RDRND
3319 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3320 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3321 if (processor_alias_table[i].flags & PTA_F16C
3322 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3323 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3324 if (processor_alias_table[i].flags & PTA_RTM
3325 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RTM))
3326 ix86_isa_flags |= OPTION_MASK_ISA_RTM;
3327 if (processor_alias_table[i].flags & PTA_HLE
3328 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_HLE))
3329 ix86_isa_flags |= OPTION_MASK_ISA_HLE;
3330 if (processor_alias_table[i].flags & PTA_PRFCHW
3331 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PRFCHW))
3332 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW;
3333 if (processor_alias_table[i].flags & PTA_RDSEED
3334 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDSEED))
3335 ix86_isa_flags |= OPTION_MASK_ISA_RDSEED;
3336 if (processor_alias_table[i].flags & PTA_ADX
3337 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ADX))
3338 ix86_isa_flags |= OPTION_MASK_ISA_ADX;
3339 if (processor_alias_table[i].flags & PTA_FXSR
3340 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FXSR))
3341 ix86_isa_flags |= OPTION_MASK_ISA_FXSR;
3342 if (processor_alias_table[i].flags & PTA_XSAVE
3343 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVE))
3344 ix86_isa_flags |= OPTION_MASK_ISA_XSAVE;
3345 if (processor_alias_table[i].flags & PTA_XSAVEOPT
3346 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XSAVEOPT))
3347 ix86_isa_flags |= OPTION_MASK_ISA_XSAVEOPT;
3348 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
3349 x86_prefetch_sse = true;
3351 break;
3354 if (!strcmp (ix86_arch_string, "generic"))
3355 error ("generic CPU can be used only for %stune=%s %s",
3356 prefix, suffix, sw);
3357 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3358 error ("bad value (%s) for %sarch=%s %s",
3359 ix86_arch_string, prefix, suffix, sw);
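/* Each initial_ix86_arch_features[] entry is a bitmask over processor types,
   so testing it against 1u << ix86_arch yields one boolean per X86_ARCH_*
   feature for the selected architecture.  */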
3361 ix86_arch_mask = 1u << ix86_arch;
3362 for (i = 0; i < X86_ARCH_LAST; ++i)
3363 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
3365 for (i = 0; i < pta_size; i++)
3366 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
3368 ix86_schedule = processor_alias_table[i].schedule;
3369 ix86_tune = processor_alias_table[i].processor;
3370 if (TARGET_64BIT)
3372 if (!(processor_alias_table[i].flags & PTA_64BIT))
3374 if (ix86_tune_defaulted)
3376 ix86_tune_string = "x86-64";
3377 for (i = 0; i < pta_size; i++)
3378 if (! strcmp (ix86_tune_string,
3379 processor_alias_table[i].name))
3380 break;
3381 ix86_schedule = processor_alias_table[i].schedule;
3382 ix86_tune = processor_alias_table[i].processor;
3384 else
3385 error ("CPU you selected does not support x86-64 "
3386 "instruction set");
3389 else
3391 /* Adjust tuning when compiling for 32-bit ABI. */
3392 switch (ix86_tune)
3394 case PROCESSOR_GENERIC64:
3395 ix86_tune = PROCESSOR_GENERIC32;
3396 ix86_schedule = CPU_PENTIUMPRO;
3397 break;
3399 default:
3400 break;
3403 /* Intel CPUs have always interpreted SSE prefetch instructions as
3404 NOPs; so, we can enable SSE prefetch instructions even when
3405 -mtune (rather than -march) points us to a processor that has them.
3406 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3407 higher processors. */
3408 if (TARGET_CMOV
3409 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3410 x86_prefetch_sse = true;
3411 break;
3414 if (ix86_tune_specified && i == pta_size)
3415 error ("bad value (%s) for %stune=%s %s",
3416 ix86_tune_string, prefix, suffix, sw);
3418 ix86_tune_mask = 1u << ix86_tune;
3419 for (i = 0; i < X86_TUNE_LAST; ++i)
3420 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3422 #ifndef USE_IX86_FRAME_POINTER
3423 #define USE_IX86_FRAME_POINTER 0
3424 #endif
3426 #ifndef USE_X86_64_FRAME_POINTER
3427 #define USE_X86_64_FRAME_POINTER 0
3428 #endif
3430 /* Set the default values for switches whose default depends on TARGET_64BIT
3431 in case they weren't overwritten by command line options. */
3432 if (TARGET_64BIT)
3434 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3435 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3436 if (flag_asynchronous_unwind_tables == 2)
3437 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3438 if (flag_pcc_struct_return == 2)
3439 flag_pcc_struct_return = 0;
3441 else
3443 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3444 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3445 if (flag_asynchronous_unwind_tables == 2)
3446 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3447 if (flag_pcc_struct_return == 2)
3448 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
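/* In the two branches above, a flag value of 2 is the option's initial value
   and means it was not set explicitly by the user, so the target default
   applies.  */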
3451 ix86_tune_cost = processor_target_table[ix86_tune].cost;
3452 if (optimize_size)
3453 ix86_cost = &ix86_size_cost;
3454 else
3455 ix86_cost = ix86_tune_cost;
3457 /* Arrange to set up i386_stack_locals for all functions. */
3458 init_machine_status = ix86_init_machine_status;
3460 /* Validate -mregparm= value. */
3461 if (global_options_set.x_ix86_regparm)
3463 if (TARGET_64BIT)
3464 warning (0, "-mregparm is ignored in 64-bit mode");
3465 if (ix86_regparm > REGPARM_MAX)
3467 error ("-mregparm=%d is not between 0 and %d",
3468 ix86_regparm, REGPARM_MAX);
3469 ix86_regparm = 0;
3472 if (TARGET_64BIT)
3473 ix86_regparm = REGPARM_MAX;
3475 /* Default align_* from the processor table. */
3476 if (align_loops == 0)
3478 align_loops = processor_target_table[ix86_tune].align_loop;
3479 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3481 if (align_jumps == 0)
3483 align_jumps = processor_target_table[ix86_tune].align_jump;
3484 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3486 if (align_functions == 0)
3488 align_functions = processor_target_table[ix86_tune].align_func;
3491 /* Provide default for -mbranch-cost= value. */
3492 if (!global_options_set.x_ix86_branch_cost)
3493 ix86_branch_cost = ix86_cost->branch_cost;
3495 if (TARGET_64BIT)
3497 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
3499 /* Enable the SSE and MMX builtins by default, but do allow the user to
3500 explicitly disable any of these.  In particular, disabling SSE and
3501 MMX for kernel code is extremely useful.  */
3502 if (!ix86_arch_specified)
3503 ix86_isa_flags
3504 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
3505 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
3507 if (TARGET_RTD)
3508 warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
3510 else
3512 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
3514 if (!ix86_arch_specified)
3515 ix86_isa_flags
3516 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
3518 /* The i386 ABI does not specify a red zone.  It still makes sense to use it
3519 when the programmer takes care to keep the stack from being destroyed.  */
3520 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
3521 target_flags |= MASK_NO_RED_ZONE;
3524 /* Keep nonleaf frame pointers. */
3525 if (flag_omit_frame_pointer)
3526 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
3527 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
3528 flag_omit_frame_pointer = 1;
3530 /* If we're doing fast math, we don't care about comparison order
3531 wrt NaNs. This lets us use a shorter comparison sequence. */
3532 if (flag_finite_math_only)
3533 target_flags &= ~MASK_IEEE_FP;
3535 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
3536 since the insns won't need emulation. */
3537 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
3538 target_flags &= ~MASK_NO_FANCY_MATH_387;
3540 /* Likewise, if the target doesn't have a 387, or we've specified
3541 software floating point, don't use 387 inline intrinsics. */
3542 if (!TARGET_80387)
3543 target_flags |= MASK_NO_FANCY_MATH_387;
3545 /* Turn on MMX builtins for -msse. */
3546 if (TARGET_SSE)
3547 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
3549 /* Enable SSE prefetch. */
3550 if (TARGET_SSE || (TARGET_PRFCHW && !TARGET_3DNOW))
3551 x86_prefetch_sse = true;
3553 /* Enable prefetch{,w} instructions for -m3dnow. */
3554 if (TARGET_3DNOW)
3555 ix86_isa_flags |= OPTION_MASK_ISA_PRFCHW & ~ix86_isa_flags_explicit;
3557 /* Enable popcnt instruction for -msse4.2 or -mabm. */
3558 if (TARGET_SSE4_2 || TARGET_ABM)
3559 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
3561 /* Enable lzcnt instruction for -mabm. */
3562 if (TARGET_ABM)
3563 ix86_isa_flags |= OPTION_MASK_ISA_LZCNT & ~ix86_isa_flags_explicit;
3565 /* Validate -mpreferred-stack-boundary= value or default it to
3566 PREFERRED_STACK_BOUNDARY_DEFAULT. */
3567 ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
3568 if (global_options_set.x_ix86_preferred_stack_boundary_arg)
3570 int min = (TARGET_64BIT ? (TARGET_SSE ? 4 : 3) : 2);
3571 int max = (TARGET_SEH ? 4 : 12);
3573 if (ix86_preferred_stack_boundary_arg < min
3574 || ix86_preferred_stack_boundary_arg > max)
3576 if (min == max)
3577 error ("-mpreferred-stack-boundary is not supported "
3578 "for this target");
3579 else
3580 error ("-mpreferred-stack-boundary=%d is not between %d and %d",
3581 ix86_preferred_stack_boundary_arg, min, max);
3583 else
3584 ix86_preferred_stack_boundary
3585 = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
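/* The option argument is the log2 of the alignment in bytes, so
   e.g. -mpreferred-stack-boundary=4 yields (1 << 4) * 8 = 128 bits,
   i.e. a 16-byte stack boundary.  */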
3588 /* Set the default value for -mstackrealign. */
3589 if (ix86_force_align_arg_pointer == -1)
3590 ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;
3592 ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;
3594 /* Validate -mincoming-stack-boundary= value or default it to
3595 MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY. */
3596 ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
3597 if (global_options_set.x_ix86_incoming_stack_boundary_arg)
3599 if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
3600 || ix86_incoming_stack_boundary_arg > 12)
3601 error ("-mincoming-stack-boundary=%d is not between %d and 12",
3602 ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
3603 else
3605 ix86_user_incoming_stack_boundary
3606 = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
3607 ix86_incoming_stack_boundary
3608 = ix86_user_incoming_stack_boundary;
3612 /* Accept -msseregparm only if at least SSE support is enabled. */
3613 if (TARGET_SSEREGPARM
3614 && ! TARGET_SSE)
3615 error ("%ssseregparm%s used without SSE enabled", prefix, suffix);
3617 if (global_options_set.x_ix86_fpmath)
3619 if (ix86_fpmath & FPMATH_SSE)
3621 if (!TARGET_SSE)
3623 warning (0, "SSE instruction set disabled, using 387 arithmetics");
3624 ix86_fpmath = FPMATH_387;
3626 else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
3628 warning (0, "387 instruction set disabled, using SSE arithmetics");
3629 ix86_fpmath = FPMATH_SSE;
3633 else
3634 ix86_fpmath = TARGET_FPMATH_DEFAULT;
3636 /* If the i387 is disabled, then do not return values in it. */
3637 if (!TARGET_80387)
3638 target_flags &= ~MASK_FLOAT_RETURNS;
3640 /* Use external vectorized library in vectorizing intrinsics. */
3641 if (global_options_set.x_ix86_veclibabi_type)
3642 switch (ix86_veclibabi_type)
3644 case ix86_veclibabi_type_svml:
3645 ix86_veclib_handler = ix86_veclibabi_svml;
3646 break;
3648 case ix86_veclibabi_type_acml:
3649 ix86_veclib_handler = ix86_veclibabi_acml;
3650 break;
3652 default:
3653 gcc_unreachable ();
3656 if ((!USE_IX86_FRAME_POINTER
3657 || (x86_accumulate_outgoing_args & ix86_tune_mask))
3658 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3659 && !optimize_size)
3660 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3662 /* ??? Unwind info is not correct around the CFG unless either a frame
3663 pointer is present or M_A_O_A is set. Fixing this requires rewriting
3664 unwind info generation to be aware of the CFG and propagating states
3665 around edges. */
3666 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
3667 || flag_exceptions || flag_non_call_exceptions)
3668 && flag_omit_frame_pointer
3669 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3671 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3672 warning (0, "unwind tables currently require either a frame pointer "
3673 "or %saccumulate-outgoing-args%s for correctness",
3674 prefix, suffix);
3675 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3678 /* If stack probes are required, the space used for large function
3679 arguments on the stack must also be probed, so enable
3680 -maccumulate-outgoing-args so this happens in the prologue. */
3681 if (TARGET_STACK_PROBE
3682 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
3684 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
3685 warning (0, "stack probing requires %saccumulate-outgoing-args%s "
3686 "for correctness", prefix, suffix);
3687 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
3690 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
3692 char *p;
3693 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
3694 p = strchr (internal_label_prefix, 'X');
3695 internal_label_prefix_len = p - internal_label_prefix;
3696 *p = '\0';
3699 /* When the scheduling description is not available, disable the scheduler
3700 pass so it won't slow down compilation and make x87 code slower.  */
3701 if (!TARGET_SCHEDULE)
3702 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
3704 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
3705 ix86_tune_cost->simultaneous_prefetches,
3706 global_options.x_param_values,
3707 global_options_set.x_param_values);
3708 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
3709 ix86_tune_cost->prefetch_block,
3710 global_options.x_param_values,
3711 global_options_set.x_param_values);
3712 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
3713 ix86_tune_cost->l1_cache_size,
3714 global_options.x_param_values,
3715 global_options_set.x_param_values);
3716 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
3717 ix86_tune_cost->l2_cache_size,
3718 global_options.x_param_values,
3719 global_options_set.x_param_values);
3721 /* Enable software prefetching at -O3 for CPUs where prefetching is helpful.  */
3722 if (flag_prefetch_loop_arrays < 0
3723 && HAVE_prefetch
3724 && (optimize >= 3 || flag_profile_use)
3725 && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
3726 flag_prefetch_loop_arrays = 1;
3728 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
3729 can be optimized to ap = __builtin_next_arg (0). */
3730 if (!TARGET_64BIT && !flag_split_stack)
3731 targetm.expand_builtin_va_start = NULL;
3733 if (TARGET_64BIT)
3735 ix86_gen_leave = gen_leave_rex64;
3736 if (Pmode == DImode)
3738 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_di;
3739 ix86_gen_tls_local_dynamic_base_64
3740 = gen_tls_local_dynamic_base_64_di;
3742 else
3744 ix86_gen_tls_global_dynamic_64 = gen_tls_global_dynamic_64_si;
3745 ix86_gen_tls_local_dynamic_base_64
3746 = gen_tls_local_dynamic_base_64_si;
3749 else
3750 ix86_gen_leave = gen_leave;
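/* Pick the Pmode-specific RTL generator functions once here, so that the
   prologue/epilogue and builtin expansion code does not have to test Pmode
   at every use.  */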
3752 if (Pmode == DImode)
3754 ix86_gen_add3 = gen_adddi3;
3755 ix86_gen_sub3 = gen_subdi3;
3756 ix86_gen_sub3_carry = gen_subdi3_carry;
3757 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
3758 ix86_gen_andsp = gen_anddi3;
3759 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
3760 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
3761 ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
3762 ix86_gen_monitor = gen_sse3_monitor_di;
3764 else
3766 ix86_gen_add3 = gen_addsi3;
3767 ix86_gen_sub3 = gen_subsi3;
3768 ix86_gen_sub3_carry = gen_subsi3_carry;
3769 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
3770 ix86_gen_andsp = gen_andsi3;
3771 ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
3772 ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
3773 ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
3774 ix86_gen_monitor = gen_sse3_monitor_si;
3777 #ifdef USE_IX86_CLD
3778 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
3779 if (!TARGET_64BIT)
3780 target_flags |= MASK_CLD & ~target_flags_explicit;
3781 #endif
3783 if (!TARGET_64BIT && flag_pic)
3785 if (flag_fentry > 0)
3786 sorry ("-mfentry isn%'t supported for 32-bit in combination "
3787 "with -fpic");
3788 flag_fentry = 0;
3790 else if (TARGET_SEH)
3792 if (flag_fentry == 0)
3793 sorry ("-mno-fentry isn%'t compatible with SEH");
3794 flag_fentry = 1;
3796 else if (flag_fentry < 0)
3798 #if defined(PROFILE_BEFORE_PROLOGUE)
3799 flag_fentry = 1;
3800 #else
3801 flag_fentry = 0;
3802 #endif
3805 if (TARGET_AVX)
3807 /* When not optimizing for size, enable the vzeroupper optimization for
3808 TARGET_AVX with -fexpensive-optimizations and split 32-byte
3809 AVX unaligned loads/stores.  */
3810 if (!optimize_size)
3812 if (flag_expensive_optimizations
3813 && !(target_flags_explicit & MASK_VZEROUPPER))
3814 target_flags |= MASK_VZEROUPPER;
3815 if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
3816 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
3817 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
3818 if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
3819 && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
3820 target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
3821 /* Enable 128-bit AVX instruction generation
3822 for the auto-vectorizer. */
3823 if (TARGET_AVX128_OPTIMAL
3824 && !(target_flags_explicit & MASK_PREFER_AVX128))
3825 target_flags |= MASK_PREFER_AVX128;
3828 else
3830 /* Disable vzeroupper pass if TARGET_AVX is disabled. */
3831 target_flags &= ~MASK_VZEROUPPER;
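/* Parse -mrecip=LIST: a comma-separated subset of the names in recip_options
   above (or "default" for all of them), each optionally prefixed with '!' to
   turn that approximation off again, e.g. -mrecip=all,!sqrt.  */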
3834 if (ix86_recip_name)
3836 char *p = ASTRDUP (ix86_recip_name);
3837 char *q;
3838 unsigned int mask, i;
3839 bool invert;
3841 while ((q = strtok (p, ",")) != NULL)
3843 p = NULL;
3844 if (*q == '!')
3846 invert = true;
3847 q++;
3849 else
3850 invert = false;
3852 if (!strcmp (q, "default"))
3853 mask = RECIP_MASK_ALL;
3854 else
3856 for (i = 0; i < ARRAY_SIZE (recip_options); i++)
3857 if (!strcmp (q, recip_options[i].string))
3859 mask = recip_options[i].mask;
3860 break;
3863 if (i == ARRAY_SIZE (recip_options))
3865 error ("unknown option for -mrecip=%s", q);
3866 invert = false;
3867 mask = RECIP_MASK_NONE;
3871 recip_mask_explicit |= mask;
3872 if (invert)
3873 recip_mask &= ~mask;
3874 else
3875 recip_mask |= mask;
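/* A bare -mrecip (or -mno-recip) turns all approximations on (or off),
   except those whose state was pinned explicitly by -mrecip= above.  */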
3879 if (TARGET_RECIP)
3880 recip_mask |= RECIP_MASK_ALL & ~recip_mask_explicit;
3881 else if (target_flags_explicit & MASK_RECIP)
3882 recip_mask &= ~(RECIP_MASK_ALL & ~recip_mask_explicit);
3884 /* Default long double to 64-bit for Bionic. */
3885 if (TARGET_HAS_BIONIC
3886 && !(target_flags_explicit & MASK_LONG_DOUBLE_64))
3887 target_flags |= MASK_LONG_DOUBLE_64;
3889 /* Save the initial options in case the user does function specific
3890 options. */
3891 if (main_args_p)
3892 target_option_default_node = target_option_current_node
3893 = build_target_option_node ();
3896 /* Implement the TARGET_OPTION_OVERRIDE hook. */
3898 static void
3899 ix86_option_override (void)
3901 static struct register_pass_info insert_vzeroupper_info
3902 = { &pass_insert_vzeroupper.pass, "reload",
3903 1, PASS_POS_INSERT_AFTER
3906 ix86_option_override_internal (true);
3909 /* This needs to be done at start up. It's convenient to do it here. */
3910 register_pass (&insert_vzeroupper_info);
3913 /* Update register usage after having seen the compiler flags. */
3915 static void
3916 ix86_conditional_register_usage (void)
3918 int i, c_mask;
3919 unsigned int j;
3921 /* The PIC register, if it exists, is fixed. */
3922 j = PIC_OFFSET_TABLE_REGNUM;
3923 if (j != INVALID_REGNUM)
3924 fixed_regs[j] = call_used_regs[j] = 1;
3926 /* For 32-bit targets, squash the REX registers. */
3927 if (! TARGET_64BIT)
3929 for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
3930 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3931 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
3932 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3935 /* See the definition of CALL_USED_REGISTERS in i386.h. */
3936 c_mask = (TARGET_64BIT_MS_ABI ? (1 << 3)
3937 : TARGET_64BIT ? (1 << 2)
3938 : (1 << 1));
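/* c_mask selects one column of the conditional CALL_USED_REGISTERS entries:
   bit 1 for 32-bit code, bit 2 for the 64-bit SysV ABI and bit 3 for the
   64-bit MS ABI.  */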
3940 CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);
3942 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3944 /* Set/reset conditionally defined registers from
3945 CALL_USED_REGISTERS initializer. */
3946 if (call_used_regs[i] > 1)
3947 call_used_regs[i] = !!(call_used_regs[i] & c_mask);
3949 /* Compute the CLOBBERED_REGS register set as the call-used
3950 registers within the GENERAL_REGS register set.  */
3951 if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
3952 && call_used_regs[i])
3953 SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
3956 /* If MMX is disabled, squash the registers. */
3957 if (! TARGET_MMX)
3958 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3959 if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
3960 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3962 /* If SSE is disabled, squash the registers. */
3963 if (! TARGET_SSE)
3964 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3965 if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
3966 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3968 /* If the FPU is disabled, squash the registers. */
3969 if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
3970 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
3971 if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
3972 fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";
3976 /* Save the current options */
3978 static void
3979 ix86_function_specific_save (struct cl_target_option *ptr)
3981 ptr->arch = ix86_arch;
3982 ptr->schedule = ix86_schedule;
3983 ptr->tune = ix86_tune;
3984 ptr->branch_cost = ix86_branch_cost;
3985 ptr->tune_defaulted = ix86_tune_defaulted;
3986 ptr->arch_specified = ix86_arch_specified;
3987 ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
3988 ptr->ix86_target_flags_explicit = target_flags_explicit;
3989 ptr->x_recip_mask_explicit = recip_mask_explicit;
3991 /* The fields are char but the variables are not; make sure the
3992 values fit in the fields. */
3993 gcc_assert (ptr->arch == ix86_arch);
3994 gcc_assert (ptr->schedule == ix86_schedule);
3995 gcc_assert (ptr->tune == ix86_tune);
3996 gcc_assert (ptr->branch_cost == ix86_branch_cost);
3999 /* Restore the current options */
4001 static void
4002 ix86_function_specific_restore (struct cl_target_option *ptr)
4004 enum processor_type old_tune = ix86_tune;
4005 enum processor_type old_arch = ix86_arch;
4006 unsigned int ix86_arch_mask, ix86_tune_mask;
4007 int i;
4009 ix86_arch = (enum processor_type) ptr->arch;
4010 ix86_schedule = (enum attr_cpu) ptr->schedule;
4011 ix86_tune = (enum processor_type) ptr->tune;
4012 ix86_branch_cost = ptr->branch_cost;
4013 ix86_tune_defaulted = ptr->tune_defaulted;
4014 ix86_arch_specified = ptr->arch_specified;
4015 ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
4016 target_flags_explicit = ptr->ix86_target_flags_explicit;
4017 recip_mask_explicit = ptr->x_recip_mask_explicit;
4019 /* Recreate the arch feature tests if the arch changed */
4020 if (old_arch != ix86_arch)
4022 ix86_arch_mask = 1u << ix86_arch;
4023 for (i = 0; i < X86_ARCH_LAST; ++i)
4024 ix86_arch_features[i]
4025 = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
4028 /* Recreate the tune optimization tests */
4029 if (old_tune != ix86_tune)
4031 ix86_tune_mask = 1u << ix86_tune;
4032 for (i = 0; i < X86_TUNE_LAST; ++i)
4033 ix86_tune_features[i]
4034 = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
4038 /* Print the current options */
4040 static void
4041 ix86_function_specific_print (FILE *file, int indent,
4042 struct cl_target_option *ptr)
4044 char *target_string
4045 = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
4046 NULL, NULL, ptr->x_ix86_fpmath, false);
4048 gcc_assert (ptr->arch < PROCESSOR_max);
4049 fprintf (file, "%*sarch = %d (%s)\n",
4050 indent, "",
4051 ptr->arch, processor_target_table[ptr->arch].name);
4053 gcc_assert (ptr->tune < PROCESSOR_max);
4054 fprintf (file, "%*stune = %d (%s)\n",
4055 indent, "",
4056 ptr->tune, processor_target_table[ptr->tune].name);
4058 fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);
4060 if (target_string)
4062 fprintf (file, "%*s%s\n", indent, "", target_string);
4063 free (target_string);
4068 /* Inner function to process the attribute((target(...))); take an argument and
4069 set the current options from the argument.  If we have a list, recursively go
4070 over the list.  */
4072 static bool
4073 ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
4074 struct gcc_options *enum_opts_set)
4076 char *next_optstr;
4077 bool ret = true;
4079 #define IX86_ATTR_ISA(S,O) { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
4080 #define IX86_ATTR_STR(S,O) { S, sizeof (S)-1, ix86_opt_str, O, 0 }
4081 #define IX86_ATTR_ENUM(S,O) { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
4082 #define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
4083 #define IX86_ATTR_NO(S,O,M) { S, sizeof (S)-1, ix86_opt_no, O, M }
4085 enum ix86_opt_type
4087 ix86_opt_unknown,
4088 ix86_opt_yes,
4089 ix86_opt_no,
4090 ix86_opt_str,
4091 ix86_opt_enum,
4092 ix86_opt_isa
4095 static const struct
4097 const char *string;
4098 size_t len;
4099 enum ix86_opt_type type;
4100 int opt;
4101 int mask;
4102 } attrs[] = {
4103 /* isa options */
4104 IX86_ATTR_ISA ("3dnow", OPT_m3dnow),
4105 IX86_ATTR_ISA ("abm", OPT_mabm),
4106 IX86_ATTR_ISA ("bmi", OPT_mbmi),
4107 IX86_ATTR_ISA ("bmi2", OPT_mbmi2),
4108 IX86_ATTR_ISA ("lzcnt", OPT_mlzcnt),
4109 IX86_ATTR_ISA ("tbm", OPT_mtbm),
4110 IX86_ATTR_ISA ("aes", OPT_maes),
4111 IX86_ATTR_ISA ("avx", OPT_mavx),
4112 IX86_ATTR_ISA ("avx2", OPT_mavx2),
4113 IX86_ATTR_ISA ("mmx", OPT_mmmx),
4114 IX86_ATTR_ISA ("pclmul", OPT_mpclmul),
4115 IX86_ATTR_ISA ("popcnt", OPT_mpopcnt),
4116 IX86_ATTR_ISA ("sse", OPT_msse),
4117 IX86_ATTR_ISA ("sse2", OPT_msse2),
4118 IX86_ATTR_ISA ("sse3", OPT_msse3),
4119 IX86_ATTR_ISA ("sse4", OPT_msse4),
4120 IX86_ATTR_ISA ("sse4.1", OPT_msse4_1),
4121 IX86_ATTR_ISA ("sse4.2", OPT_msse4_2),
4122 IX86_ATTR_ISA ("sse4a", OPT_msse4a),
4123 IX86_ATTR_ISA ("ssse3", OPT_mssse3),
4124 IX86_ATTR_ISA ("fma4", OPT_mfma4),
4125 IX86_ATTR_ISA ("fma", OPT_mfma),
4126 IX86_ATTR_ISA ("xop", OPT_mxop),
4127 IX86_ATTR_ISA ("lwp", OPT_mlwp),
4128 IX86_ATTR_ISA ("fsgsbase", OPT_mfsgsbase),
4129 IX86_ATTR_ISA ("rdrnd", OPT_mrdrnd),
4130 IX86_ATTR_ISA ("f16c", OPT_mf16c),
4131 IX86_ATTR_ISA ("rtm", OPT_mrtm),
4132 IX86_ATTR_ISA ("hle", OPT_mhle),
4133 IX86_ATTR_ISA ("prfchw", OPT_mprfchw),
4134 IX86_ATTR_ISA ("rdseed", OPT_mrdseed),
4135 IX86_ATTR_ISA ("adx", OPT_madx),
4136 IX86_ATTR_ISA ("fxsr", OPT_mfxsr),
4137 IX86_ATTR_ISA ("xsave", OPT_mxsave),
4138 IX86_ATTR_ISA ("xsaveopt", OPT_mxsaveopt),
4140 /* enum options */
4141 IX86_ATTR_ENUM ("fpmath=", OPT_mfpmath_),
4143 /* string options */
4144 IX86_ATTR_STR ("arch=", IX86_FUNCTION_SPECIFIC_ARCH),
4145 IX86_ATTR_STR ("tune=", IX86_FUNCTION_SPECIFIC_TUNE),
4147 /* flag options */
4148 IX86_ATTR_YES ("cld",
4149 OPT_mcld,
4150 MASK_CLD),
4152 IX86_ATTR_NO ("fancy-math-387",
4153 OPT_mfancy_math_387,
4154 MASK_NO_FANCY_MATH_387),
4156 IX86_ATTR_YES ("ieee-fp",
4157 OPT_mieee_fp,
4158 MASK_IEEE_FP),
4160 IX86_ATTR_YES ("inline-all-stringops",
4161 OPT_minline_all_stringops,
4162 MASK_INLINE_ALL_STRINGOPS),
4164 IX86_ATTR_YES ("inline-stringops-dynamically",
4165 OPT_minline_stringops_dynamically,
4166 MASK_INLINE_STRINGOPS_DYNAMICALLY),
4168 IX86_ATTR_NO ("align-stringops",
4169 OPT_mno_align_stringops,
4170 MASK_NO_ALIGN_STRINGOPS),
4172 IX86_ATTR_YES ("recip",
4173 OPT_mrecip,
4174 MASK_RECIP),
4178 /* If this is a list, recurse to get the options. */
4179 if (TREE_CODE (args) == TREE_LIST)
4181 bool ret = true;
4183 for (; args; args = TREE_CHAIN (args))
4184 if (TREE_VALUE (args)
4185 && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
4186 p_strings, enum_opts_set))
4187 ret = false;
4189 return ret;
4192 else if (TREE_CODE (args) != STRING_CST)
4194 error ("attribute %<target%> argument not a string");
4195 return false;
4198 /* Handle multiple arguments separated by commas. */
4199 next_optstr = ASTRDUP (TREE_STRING_POINTER (args));
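/* For example, __attribute__((target("arch=core2,no-fma,fpmath=sse")))
   arrives here as the single string "arch=core2,no-fma,fpmath=sse";
   each comma-separated piece is matched against attrs[] below.  */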
4201 while (next_optstr && *next_optstr != '\0')
4203 char *p = next_optstr;
4204 char *orig_p = p;
4205 char *comma = strchr (next_optstr, ',');
4206 const char *opt_string;
4207 size_t len, opt_len;
4208 int opt;
4209 bool opt_set_p;
4210 char ch;
4211 unsigned i;
4212 enum ix86_opt_type type = ix86_opt_unknown;
4213 int mask = 0;
4215 if (comma)
4217 *comma = '\0';
4218 len = comma - next_optstr;
4219 next_optstr = comma + 1;
4221 else
4223 len = strlen (p);
4224 next_optstr = NULL;
4227 /* Recognize no-xxx. */
4228 if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
4230 opt_set_p = false;
4231 p += 3;
4232 len -= 3;
4234 else
4235 opt_set_p = true;
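/* So "no-avx" is looked up in attrs[] as "avx" with opt_set_p cleared.  */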
4237 /* Find the option. */
4238 ch = *p;
4239 opt = N_OPTS;
4240 for (i = 0; i < ARRAY_SIZE (attrs); i++)
4242 type = attrs[i].type;
4243 opt_len = attrs[i].len;
4244 if (ch == attrs[i].string[0]
4245 && ((type != ix86_opt_str && type != ix86_opt_enum)
4246 ? len == opt_len
4247 : len > opt_len)
4248 && memcmp (p, attrs[i].string, opt_len) == 0)
4250 opt = attrs[i].opt;
4251 mask = attrs[i].mask;
4252 opt_string = attrs[i].string;
4253 break;
4257 /* Process the option. */
4258 if (opt == N_OPTS)
4260 error ("attribute(target(\"%s\")) is unknown", orig_p);
4261 ret = false;
4264 else if (type == ix86_opt_isa)
4266 struct cl_decoded_option decoded;
4268 generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
4269 ix86_handle_option (&global_options, &global_options_set,
4270 &decoded, input_location);
4273 else if (type == ix86_opt_yes || type == ix86_opt_no)
4275 if (type == ix86_opt_no)
4276 opt_set_p = !opt_set_p;
4278 if (opt_set_p)
4279 target_flags |= mask;
4280 else
4281 target_flags &= ~mask;
4284 else if (type == ix86_opt_str)
4286 if (p_strings[opt])
4288 error ("option(\"%s\") was already specified", opt_string);
4289 ret = false;
4291 else
4292 p_strings[opt] = xstrdup (p + opt_len);
4295 else if (type == ix86_opt_enum)
4297 bool arg_ok;
4298 int value;
4300 arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
4301 if (arg_ok)
4302 set_option (&global_options, enum_opts_set, opt, value,
4303 p + opt_len, DK_UNSPECIFIED, input_location,
4304 global_dc);
4305 else
4307 error ("attribute(target(\"%s\")) is unknown", orig_p);
4308 ret = false;
4312 else
4313 gcc_unreachable ();
4316 return ret;
4319 /* Return a TARGET_OPTION_NODE tree of the target options listed or NULL. */
4321 tree
4322 ix86_valid_target_attribute_tree (tree args)
4324 const char *orig_arch_string = ix86_arch_string;
4325 const char *orig_tune_string = ix86_tune_string;
4326 enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
4327 int orig_tune_defaulted = ix86_tune_defaulted;
4328 int orig_arch_specified = ix86_arch_specified;
4329 char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
4330 tree t = NULL_TREE;
4331 int i;
4332 struct cl_target_option *def
4333 = TREE_TARGET_OPTION (target_option_default_node);
4334 struct gcc_options enum_opts_set;
4336 memset (&enum_opts_set, 0, sizeof (enum_opts_set));
4338 /* Process each of the options on the chain. */
4339 if (! ix86_valid_target_attribute_inner_p (args, option_strings,
4340 &enum_opts_set))
4341 return error_mark_node;
4343 /* If the changed options are different from the default, rerun
4344 ix86_option_override_internal, and then save the options away.
4345 The string options are attribute options, and will be undone
4346 when we copy the save structure. */
4347 if (ix86_isa_flags != def->x_ix86_isa_flags
4348 || target_flags != def->x_target_flags
4349 || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
4350 || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
4351 || enum_opts_set.x_ix86_fpmath)
4353 /* If we are using the default tune= or arch=, undo the string assigned,
4354 and use the default. */
4355 if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
4356 ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
4357 else if (!orig_arch_specified)
4358 ix86_arch_string = NULL;
4360 if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
4361 ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
4362 else if (orig_tune_defaulted)
4363 ix86_tune_string = NULL;
4365 /* If fpmath= is not set, and we now have sse2 on 32-bit, use it. */
4366 if (enum_opts_set.x_ix86_fpmath)
4367 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4368 else if (!TARGET_64BIT && TARGET_SSE)
4370 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
4371 global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
4374 /* Do any overrides, such as arch=xxx, or tune=xxx support. */
4375 ix86_option_override_internal (false);
4377 /* Add any builtin functions for the new ISA, if there are any.  */
4378 ix86_add_new_builtins (ix86_isa_flags);
4380 /* Save the current options unless we are validating options for
4381 #pragma. */
4382 t = build_target_option_node ();
4384 ix86_arch_string = orig_arch_string;
4385 ix86_tune_string = orig_tune_string;
4386 global_options_set.x_ix86_fpmath = orig_fpmath_set;
4388 /* Free up memory allocated to hold the strings */
4389 for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
4390 free (option_strings[i]);
4393 return t;
4396 /* Hook to validate attribute((target("string"))). */
4398 static bool
4399 ix86_valid_target_attribute_p (tree fndecl,
4400 tree ARG_UNUSED (name),
4401 tree args,
4402 int ARG_UNUSED (flags))
4404 struct cl_target_option cur_target;
4405 bool ret = true;
4407 /* attribute((target("default"))) does nothing, beyond
4408 affecting multi-versioning. */
4409 if (TREE_VALUE (args)
4410 && TREE_CODE (TREE_VALUE (args)) == STRING_CST
4411 && TREE_CHAIN (args) == NULL_TREE
4412 && strcmp (TREE_STRING_POINTER (TREE_VALUE (args)), "default") == 0)
4413 return true;
4415 tree old_optimize = build_optimization_node ();
4416 tree new_target, new_optimize;
4417 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
4419 /* If the function changed the optimization levels as well as setting target
4420 options, start with the optimizations specified. */
4421 if (func_optimize && func_optimize != old_optimize)
4422 cl_optimization_restore (&global_options,
4423 TREE_OPTIMIZATION (func_optimize));
4425 /* The target attributes may also change some optimization flags, so update
4426 the optimization options if necessary. */
4427 cl_target_option_save (&cur_target, &global_options);
4428 new_target = ix86_valid_target_attribute_tree (args);
4429 new_optimize = build_optimization_node ();
4431 if (new_target == error_mark_node)
4432 ret = false;
4434 else if (fndecl && new_target)
4436 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
4438 if (old_optimize != new_optimize)
4439 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
4442 cl_target_option_restore (&global_options, &cur_target);
4444 if (old_optimize != new_optimize)
4445 cl_optimization_restore (&global_options,
4446 TREE_OPTIMIZATION (old_optimize));
4448 return ret;
4452 /* Hook to determine if one function can safely inline another. */
4454 static bool
4455 ix86_can_inline_p (tree caller, tree callee)
4457 bool ret = false;
4458 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
4459 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
4461 /* If callee has no option attributes, then it is ok to inline. */
4462 if (!callee_tree)
4463 ret = true;
4465 /* If caller has no option attributes, but callee does then it is not ok to
4466 inline. */
4467 else if (!caller_tree)
4468 ret = false;
4470 else
4472 struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
4473 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
4475 /* The callee's ISA options should be a subset of the caller's, i.e. an SSE4
4476 function can inline an SSE2 function, but an SSE2 function can't inline
4477 an SSE4 function.  */
4478 if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
4479 != callee_opts->x_ix86_isa_flags)
4480 ret = false;
4482 /* See if we have the same non-isa options. */
4483 else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
4484 ret = false;
4486 /* See if arch, tune, etc. are the same. */
4487 else if (caller_opts->arch != callee_opts->arch)
4488 ret = false;
4490 else if (caller_opts->tune != callee_opts->tune)
4491 ret = false;
4493 else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
4494 ret = false;
4496 else if (caller_opts->branch_cost != callee_opts->branch_cost)
4497 ret = false;
4499 else
4500 ret = true;
4503 return ret;
4507 /* Remember the last target of ix86_set_current_function. */
4508 static GTY(()) tree ix86_previous_fndecl;
4510 /* Establish appropriate back-end context for processing the function
4511 FNDECL. The argument might be NULL to indicate processing at top
4512 level, outside of any function scope. */
4513 static void
4514 ix86_set_current_function (tree fndecl)
4516 /* Only change the context if the function changes. This hook is called
4517 several times in the course of compiling a function, and we don't want to
4518 slow things down too much or call target_reinit when it isn't safe. */
4519 if (fndecl && fndecl != ix86_previous_fndecl)
4521 tree old_tree = (ix86_previous_fndecl
4522 ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
4523 : NULL_TREE);
4525 tree new_tree = (fndecl
4526 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
4527 : NULL_TREE);
4529 ix86_previous_fndecl = fndecl;
4530 if (old_tree == new_tree)
4533 else if (new_tree)
4535 cl_target_option_restore (&global_options,
4536 TREE_TARGET_OPTION (new_tree));
4537 target_reinit ();
4540 else if (old_tree)
4542 struct cl_target_option *def
4543 = TREE_TARGET_OPTION (target_option_current_node);
4545 cl_target_option_restore (&global_options, def);
4546 target_reinit ();
4552 /* Return true if this goes in large data/bss. */
4554 static bool
4555 ix86_in_large_data_p (tree exp)
4557 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
4558 return false;
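/* In the medium code model, objects larger than ix86_section_threshold
   (set by -mlarge-data-threshold, 64K by default), or whose size is not yet
   known, count as large data and go into the .ldata/.lbss family of
   sections.  */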
4560 /* Functions are never large data. */
4561 if (TREE_CODE (exp) == FUNCTION_DECL)
4562 return false;
4564 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
4566 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
4567 if (strcmp (section, ".ldata") == 0
4568 || strcmp (section, ".lbss") == 0)
4569 return true;
4570 return false;
4572 else
4574 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
4576 /* If this is an incomplete type with size 0, then we can't put it
4577 in data because it might be too big when completed. */
4578 if (!size || size > ix86_section_threshold)
4579 return true;
4582 return false;
4585 /* Switch to the appropriate section for output of DECL.
4586 DECL is either a `VAR_DECL' node or a constant of some sort.
4587 RELOC indicates whether forming the initial value of DECL requires
4588 link-time relocations. */
4590 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
4591 ATTRIBUTE_UNUSED;
4593 static section *
4594 x86_64_elf_select_section (tree decl, int reloc,
4595 unsigned HOST_WIDE_INT align)
4597 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4598 && ix86_in_large_data_p (decl))
4600 const char *sname = NULL;
4601 unsigned int flags = SECTION_WRITE;
4602 switch (categorize_decl_for_section (decl, reloc))
4604 case SECCAT_DATA:
4605 sname = ".ldata";
4606 break;
4607 case SECCAT_DATA_REL:
4608 sname = ".ldata.rel";
4609 break;
4610 case SECCAT_DATA_REL_LOCAL:
4611 sname = ".ldata.rel.local";
4612 break;
4613 case SECCAT_DATA_REL_RO:
4614 sname = ".ldata.rel.ro";
4615 break;
4616 case SECCAT_DATA_REL_RO_LOCAL:
4617 sname = ".ldata.rel.ro.local";
4618 break;
4619 case SECCAT_BSS:
4620 sname = ".lbss";
4621 flags |= SECTION_BSS;
4622 break;
4623 case SECCAT_RODATA:
4624 case SECCAT_RODATA_MERGE_STR:
4625 case SECCAT_RODATA_MERGE_STR_INIT:
4626 case SECCAT_RODATA_MERGE_CONST:
4627 sname = ".lrodata";
4628 flags = 0;
4629 break;
4630 case SECCAT_SRODATA:
4631 case SECCAT_SDATA:
4632 case SECCAT_SBSS:
4633 gcc_unreachable ();
4634 case SECCAT_TEXT:
4635 case SECCAT_TDATA:
4636 case SECCAT_TBSS:
4637 /* We don't split these for the medium model.  Place them into
4638 default sections and hope for the best.  */
4639 break;
4641 if (sname)
4643 /* We might get called with string constants, but get_named_section
4644 doesn't like them as they are not DECLs. Also, we need to set
4645 flags in that case. */
4646 if (!DECL_P (decl))
4647 return get_section (sname, flags, NULL);
4648 return get_named_section (decl, sname, reloc);
4651 return default_elf_select_section (decl, reloc, align);
4654 /* Select a set of attributes for section NAME based on the properties
4655 of DECL and whether or not RELOC indicates that DECL's initializer
4656 might contain runtime relocations. */
4658 static unsigned int ATTRIBUTE_UNUSED
4659 x86_64_elf_section_type_flags (tree decl, const char *name, int reloc)
4661 unsigned int flags = default_section_type_flags (decl, name, reloc);
4663 if (decl == NULL_TREE
4664 && (strcmp (name, ".ldata.rel.ro") == 0
4665 || strcmp (name, ".ldata.rel.ro.local") == 0))
4666 flags |= SECTION_RELRO;
4668 if (strcmp (name, ".lbss") == 0
4669 || strncmp (name, ".lbss.", 6) == 0
4670 || strncmp (name, ".gnu.linkonce.lb.", 17) == 0)
4671 flags |= SECTION_BSS;
4673 return flags;
4676 /* Build up a unique section name, expressed as a
4677 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
4678 RELOC indicates whether the initial value of EXP requires
4679 link-time relocations. */
4681 static void ATTRIBUTE_UNUSED
4682 x86_64_elf_unique_section (tree decl, int reloc)
4684 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4685 && ix86_in_large_data_p (decl))
4687 const char *prefix = NULL;
4688 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
4689 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
4691 switch (categorize_decl_for_section (decl, reloc))
4693 case SECCAT_DATA:
4694 case SECCAT_DATA_REL:
4695 case SECCAT_DATA_REL_LOCAL:
4696 case SECCAT_DATA_REL_RO:
4697 case SECCAT_DATA_REL_RO_LOCAL:
4698 prefix = one_only ? ".ld" : ".ldata";
4699 break;
4700 case SECCAT_BSS:
4701 prefix = one_only ? ".lb" : ".lbss";
4702 break;
4703 case SECCAT_RODATA:
4704 case SECCAT_RODATA_MERGE_STR:
4705 case SECCAT_RODATA_MERGE_STR_INIT:
4706 case SECCAT_RODATA_MERGE_CONST:
4707 prefix = one_only ? ".lr" : ".lrodata";
4708 break;
4709 case SECCAT_SRODATA:
4710 case SECCAT_SDATA:
4711 case SECCAT_SBSS:
4712 gcc_unreachable ();
4713 case SECCAT_TEXT:
4714 case SECCAT_TDATA:
4715 case SECCAT_TBSS:
4716 /* We don't split these for the medium model.  Place them into
4717 default sections and hope for the best.  */
4718 break;
4720 if (prefix)
4722 const char *name, *linkonce;
4723 char *string;
4725 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
4726 name = targetm.strip_name_encoding (name);
4728 /* If we're using one_only, then there needs to be a .gnu.linkonce
4729 prefix to the section name. */
4730 linkonce = one_only ? ".gnu.linkonce" : "";
4732 string = ACONCAT ((linkonce, prefix, ".", name, NULL));
4734 DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
4735 return;
4738 default_unique_section (decl, reloc);
4741 #ifdef COMMON_ASM_OP
4742 /* This says how to output assembler code to declare an
4743 uninitialized external linkage data object.
4745 For medium-model x86-64 we need to use the .largecomm directive for
4746 large objects. */
4747 void
4748 x86_elf_aligned_common (FILE *file,
4749 const char *name, unsigned HOST_WIDE_INT size,
4750 int align)
4752 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4753 && size > (unsigned int)ix86_section_threshold)
4754 fputs (".largecomm\t", file);
4755 else
4756 fputs (COMMON_ASM_OP, file);
4757 assemble_name (file, name);
4758 fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
4759 size, align / BITS_PER_UNIT);
4761 #endif
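/* Illustrative note (editorial, not part of the original source): with
   -mcmodel=medium, an uninitialized common symbol larger than
   -mlarge-data-threshold, e.g.

     char big_buffer[1 << 20];

   goes through the .largecomm branch above, giving roughly
   ".largecomm big_buffer,1048576,<align>", while smaller objects keep
   using the ordinary COMMON_ASM_OP (".comm") directive.  */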
4763 /* Utility function for targets to use in implementing
4764 ASM_OUTPUT_ALIGNED_BSS. */
4766 void
4767 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
4768 const char *name, unsigned HOST_WIDE_INT size,
4769 int align)
4771 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
4772 && size > (unsigned int)ix86_section_threshold)
4773 switch_to_section (get_named_section (decl, ".lbss", 0));
4774 else
4775 switch_to_section (bss_section);
4776 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
4777 #ifdef ASM_DECLARE_OBJECT_NAME
4778 last_assemble_variable_decl = decl;
4779 ASM_DECLARE_OBJECT_NAME (file, name, decl);
4780 #else
4781 /* Standard thing is just output label for the object. */
4782 ASM_OUTPUT_LABEL (file, name);
4783 #endif /* ASM_DECLARE_OBJECT_NAME */
4784 ASM_OUTPUT_SKIP (file, size ? size : 1);
4787 /* Decide whether we must probe the stack before any space allocation
4788 on this target. It's essentially TARGET_STACK_PROBE except when
4789 -fstack-check causes the stack to be already probed differently. */
4791 bool
4792 ix86_target_stack_probe (void)
4794 /* Do not probe the stack twice if static stack checking is enabled. */
4795 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4796 return false;
4798 return TARGET_STACK_PROBE;
4801 /* Decide whether we can make a sibling call to a function. DECL is the
4802 declaration of the function being targeted by the call and EXP is the
4803 CALL_EXPR representing the call. */
4805 static bool
4806 ix86_function_ok_for_sibcall (tree decl, tree exp)
4808 tree type, decl_or_type;
4809 rtx a, b;
4811 /* If we are generating position-independent code, we cannot sibcall
4812 optimize any indirect call, or a direct call to a global function,
4813 as the PLT requires %ebx be live. (Darwin does not have a PLT.) */
4814 if (!TARGET_MACHO
4815 && !TARGET_64BIT
4816 && flag_pic
4817 && (!decl || !targetm.binds_local_p (decl)))
4818 return false;
4820 /* If we need to align the outgoing stack, then sibcalling would
4821 unalign the stack, which may break the called function. */
4822 if (ix86_minimum_incoming_stack_boundary (true)
4823 < PREFERRED_STACK_BOUNDARY)
4824 return false;
4826 if (decl)
4828 decl_or_type = decl;
4829 type = TREE_TYPE (decl);
4831 else
4833 /* We're looking at the CALL_EXPR, we need the type of the function. */
4834 type = CALL_EXPR_FN (exp); /* pointer expression */
4835 type = TREE_TYPE (type); /* pointer type */
4836 type = TREE_TYPE (type); /* function type */
4837 decl_or_type = type;
4840 /* Check that the return value locations are the same. Like
4841 if we are returning floats on the 80387 register stack, we cannot
4842 make a sibcall from a function that doesn't return a float to a
4843 function that does or, conversely, from a function that does return
4844 a float to a function that doesn't; the necessary stack adjustment
4845 would not be executed. This is also the place we notice
4846 differences in the return value ABI. Note that it is ok for one
4847 of the functions to have void return type as long as the return
4848 value of the other is passed in a register. */
4849 a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
4850 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
4851 cfun->decl, false);
4852 if (STACK_REG_P (a) || STACK_REG_P (b))
4854 if (!rtx_equal_p (a, b))
4855 return false;
4857 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
4859 else if (!rtx_equal_p (a, b))
4860 return false;
4862 if (TARGET_64BIT)
4864 /* The SYSV ABI has more call-clobbered registers;
4865 disallow sibcalls from MS to SYSV. */
4866 if (cfun->machine->call_abi == MS_ABI
4867 && ix86_function_type_abi (type) == SYSV_ABI)
4868 return false;
4870 else
4872 /* If this call is indirect, we'll need to be able to use a
4873 call-clobbered register for the address of the target function.
4874 Make sure that all such registers are not used for passing
4875 parameters. Note that DLLIMPORT functions are indirect. */
4876 if (!decl
4877 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
4879 if (ix86_function_regparm (type, NULL) >= 3)
4881 /* ??? Need to count the actual number of registers to be used,
4882 not the possible number of registers. Fix later. */
4883 return false;
4888 /* Otherwise okay. That also includes certain types of indirect calls. */
4889 return true;
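/* Illustrative note (editorial): with -m32 -fPIC a tail call such as

     extern int callee (int);
     int caller (int x) { return callee (x); }

   is rejected by the PIC test above, because the call goes through the PLT
   and needs %ebx to stay live; built without -fPIC (or with a callee that
   binds locally) the same call can typically become a sibcall, i.e. a plain
   jmp to the callee.  */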
4892 /* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
4893 and "sseregparm" calling convention attributes;
4894 arguments as in struct attribute_spec.handler. */
4896 static tree
4897 ix86_handle_cconv_attribute (tree *node, tree name,
4898 tree args,
4899 int flags ATTRIBUTE_UNUSED,
4900 bool *no_add_attrs)
4902 if (TREE_CODE (*node) != FUNCTION_TYPE
4903 && TREE_CODE (*node) != METHOD_TYPE
4904 && TREE_CODE (*node) != FIELD_DECL
4905 && TREE_CODE (*node) != TYPE_DECL)
4907 warning (OPT_Wattributes, "%qE attribute only applies to functions",
4908 name);
4909 *no_add_attrs = true;
4910 return NULL_TREE;
4913 /* Can combine regparm with all attributes but fastcall, and thiscall. */
4914 if (is_attribute_p ("regparm", name))
4916 tree cst;
4918 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4920 error ("fastcall and regparm attributes are not compatible");
4923 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4925 error ("regparam and thiscall attributes are not compatible");
4928 cst = TREE_VALUE (args);
4929 if (TREE_CODE (cst) != INTEGER_CST)
4931 warning (OPT_Wattributes,
4932 "%qE attribute requires an integer constant argument",
4933 name);
4934 *no_add_attrs = true;
4936 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
4938 warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
4939 name, REGPARM_MAX);
4940 *no_add_attrs = true;
4943 return NULL_TREE;
4946 if (TARGET_64BIT)
4948 /* Do not warn when emulating the MS ABI. */
4949 if ((TREE_CODE (*node) != FUNCTION_TYPE
4950 && TREE_CODE (*node) != METHOD_TYPE)
4951 || ix86_function_type_abi (*node) != MS_ABI)
4952 warning (OPT_Wattributes, "%qE attribute ignored",
4953 name);
4954 *no_add_attrs = true;
4955 return NULL_TREE;
4958 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
4959 if (is_attribute_p ("fastcall", name))
4961 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4963 error ("fastcall and cdecl attributes are not compatible");
4965 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
4967 error ("fastcall and stdcall attributes are not compatible");
4969 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
4971 error ("fastcall and regparm attributes are not compatible");
4973 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4975 error ("fastcall and thiscall attributes are not compatible");
4979 /* Can combine stdcall with fastcall (redundant), regparm and
4980 sseregparm. */
4981 else if (is_attribute_p ("stdcall", name))
4983 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
4985 error ("stdcall and cdecl attributes are not compatible");
4987 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
4989 error ("stdcall and fastcall attributes are not compatible");
4991 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
4993 error ("stdcall and thiscall attributes are not compatible");
4997 /* Can combine cdecl with regparm and sseregparm. */
4998 else if (is_attribute_p ("cdecl", name))
5000 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5002 error ("stdcall and cdecl attributes are not compatible");
5004 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5006 error ("fastcall and cdecl attributes are not compatible");
5008 if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
5010 error ("cdecl and thiscall attributes are not compatible");
5013 else if (is_attribute_p ("thiscall", name))
5015 if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
5016 warning (OPT_Wattributes, "%qE attribute is used for non-class method",
5017 name);
5018 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
5020 error ("stdcall and thiscall attributes are not compatible");
5022 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
5024 error ("fastcall and thiscall attributes are not compatible");
5026 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
5028 error ("cdecl and thiscall attributes are not compatible");
5032 /* Can combine sseregparm with all attributes. */
5034 return NULL_TREE;
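/* Illustrative usage (editorial): the handler above accepts declarations
   such as

     int __attribute__ ((fastcall)) f (int, int);
     int __attribute__ ((regparm (3))) g (int, int, int);

   and rejects contradictory combinations, e.g.

     int __attribute__ ((fastcall, regparm (2))) h (int);

   which hits the "fastcall and regparm attributes are not compatible"
   error above.  */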
5037 /* The transactional memory builtins are implicitly regparm or fastcall
5038 depending on the ABI. Override the generic do-nothing attribute that
5039 these builtins were declared with, and replace it with one of the two
5040 attributes that we expect elsewhere. */
5042 static tree
5043 ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
5044 tree args ATTRIBUTE_UNUSED,
5045 int flags ATTRIBUTE_UNUSED,
5046 bool *no_add_attrs)
5048 tree alt;
5050 /* In no case do we want to add the placeholder attribute. */
5051 *no_add_attrs = true;
5053 /* The 64-bit ABI is unchanged for transactional memory. */
5054 if (TARGET_64BIT)
5055 return NULL_TREE;
5057 /* ??? Is there a better way to validate 32-bit windows? We have
5058 cfun->machine->call_abi, but that seems to be set only for 64-bit. */
5059 if (CHECK_STACK_LIMIT > 0)
5060 alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
5061 else
5063 alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
5064 alt = tree_cons (get_identifier ("regparm"), alt, NULL);
5066 decl_attributes (node, alt, flags);
5068 return NULL_TREE;
5071 /* This function determines from TYPE the calling-convention. */
5073 unsigned int
5074 ix86_get_callcvt (const_tree type)
5076 unsigned int ret = 0;
5077 bool is_stdarg;
5078 tree attrs;
5080 if (TARGET_64BIT)
5081 return IX86_CALLCVT_CDECL;
5083 attrs = TYPE_ATTRIBUTES (type);
5084 if (attrs != NULL_TREE)
5086 if (lookup_attribute ("cdecl", attrs))
5087 ret |= IX86_CALLCVT_CDECL;
5088 else if (lookup_attribute ("stdcall", attrs))
5089 ret |= IX86_CALLCVT_STDCALL;
5090 else if (lookup_attribute ("fastcall", attrs))
5091 ret |= IX86_CALLCVT_FASTCALL;
5092 else if (lookup_attribute ("thiscall", attrs))
5093 ret |= IX86_CALLCVT_THISCALL;
5095 /* Regparm isn't allowed for thiscall and fastcall. */
5096 if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
5098 if (lookup_attribute ("regparm", attrs))
5099 ret |= IX86_CALLCVT_REGPARM;
5100 if (lookup_attribute ("sseregparm", attrs))
5101 ret |= IX86_CALLCVT_SSEREGPARM;
5104 if (IX86_BASE_CALLCVT(ret) != 0)
5105 return ret;
5108 is_stdarg = stdarg_p (type);
5109 if (TARGET_RTD && !is_stdarg)
5110 return IX86_CALLCVT_STDCALL | ret;
5112 if (ret != 0
5113 || is_stdarg
5114 || TREE_CODE (type) != METHOD_TYPE
5115 || ix86_function_type_abi (type) != MS_ABI)
5116 return IX86_CALLCVT_CDECL | ret;
5118 return IX86_CALLCVT_THISCALL;
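/* Illustrative note (editorial): for a plain 32-bit prototype with no
   attributes, ix86_get_callcvt resolves to IX86_CALLCVT_CDECL; compiling
   the same code with -mrtd flips non-variadic functions to
   IX86_CALLCVT_STDCALL, and a non-static C++ member function under the MS
   ABI falls through to IX86_CALLCVT_THISCALL.  An explicit attribute always
   wins over these defaults.  */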
5121 /* Return 0 if the attributes for two types are incompatible, 1 if they
5122 are compatible, and 2 if they are nearly compatible (which causes a
5123 warning to be generated). */
5125 static int
5126 ix86_comp_type_attributes (const_tree type1, const_tree type2)
5128 unsigned int ccvt1, ccvt2;
5130 if (TREE_CODE (type1) != FUNCTION_TYPE
5131 && TREE_CODE (type1) != METHOD_TYPE)
5132 return 1;
5134 ccvt1 = ix86_get_callcvt (type1);
5135 ccvt2 = ix86_get_callcvt (type2);
5136 if (ccvt1 != ccvt2)
5137 return 0;
5138 if (ix86_function_regparm (type1, NULL)
5139 != ix86_function_regparm (type2, NULL))
5140 return 0;
5142 return 1;
5145 /* Return the regparm value for a function with the indicated TYPE and DECL.
5146 DECL may be NULL when calling function indirectly
5147 or considering a libcall. */
5149 static int
5150 ix86_function_regparm (const_tree type, const_tree decl)
5152 tree attr;
5153 int regparm;
5154 unsigned int ccvt;
5156 if (TARGET_64BIT)
5157 return (ix86_function_type_abi (type) == SYSV_ABI
5158 ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
5159 ccvt = ix86_get_callcvt (type);
5160 regparm = ix86_regparm;
5162 if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
5164 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
5165 if (attr)
5167 regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
5168 return regparm;
5171 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5172 return 2;
5173 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5174 return 1;
5176 /* Use register calling convention for local functions when possible. */
5177 if (decl
5178 && TREE_CODE (decl) == FUNCTION_DECL
5179 /* Caller and callee must agree on the calling convention, so
5180 checking just the optimize flag here would mean that with
5181 __attribute__((optimize (...))) the caller could use the regparm
5182 convention and the callee not, or vice versa. Instead look at
5183 whether the callee is optimized or not. */
5184 && opt_for_fn (decl, optimize)
5185 && !(profile_flag && !flag_fentry))
5187 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5188 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
5189 if (i && i->local && i->can_change_signature)
5191 int local_regparm, globals = 0, regno;
5193 /* Make sure no regparm register is taken by a
5194 fixed register variable. */
5195 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
5196 if (fixed_regs[local_regparm])
5197 break;
5199 /* We don't want to use regparm(3) for nested functions as
5200 these use a static chain pointer in the third argument. */
5201 if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
5202 local_regparm = 2;
5204 /* In 32-bit mode save a register for the split stack. */
5205 if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
5206 local_regparm = 2;
5208 /* Each fixed register usage increases register pressure,
5209 so fewer registers should be used for argument passing.
5210 This functionality can be overridden by an explicit
5211 regparm value. */
5212 for (regno = AX_REG; regno <= DI_REG; regno++)
5213 if (fixed_regs[regno])
5214 globals++;
5216 local_regparm
5217 = globals < local_regparm ? local_regparm - globals : 0;
5219 if (local_regparm > regparm)
5220 regparm = local_regparm;
5224 return regparm;
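/* Illustrative note (editorial): a function with internal linkage whose
   address never escapes, e.g.

     static int sq (int x) { return x * x; }

   can be promoted by the cgraph-based path above to receive X in a
   register even without an explicit regparm attribute, as long as the
   function is compiled with optimization and no regparm register has been
   made fixed (e.g. via -ffixed-ecx).  */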
5227 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
5228 DFmode (2) arguments in SSE registers for a function with the
5229 indicated TYPE and DECL. DECL may be NULL when calling function
5230 indirectly or considering a libcall. Otherwise return 0. */
5232 static int
5233 ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
5235 gcc_assert (!TARGET_64BIT);
5237 /* Use SSE registers to pass SFmode and DFmode arguments if requested
5238 by the sseregparm attribute. */
5239 if (TARGET_SSEREGPARM
5240 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
5242 if (!TARGET_SSE)
5244 if (warn)
5246 if (decl)
5247 error ("calling %qD with attribute sseregparm without "
5248 "SSE/SSE2 enabled", decl);
5249 else
5250 error ("calling %qT with attribute sseregparm without "
5251 "SSE/SSE2 enabled", type);
5253 return 0;
5256 return 2;
5259 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
5260 (and DFmode for SSE2) arguments in SSE registers. */
5261 if (decl && TARGET_SSE_MATH && optimize
5262 && !(profile_flag && !flag_fentry))
5264 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
5265 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
5266 if (i && i->local && i->can_change_signature)
5267 return TARGET_SSE2 ? 2 : 1;
5270 return 0;
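/* Illustrative note (editorial): a declaration like

     double __attribute__ ((sseregparm)) scale (double, double);

   compiled with -m32 -mno-sse reaches the error branch above ("calling ...
   with attribute sseregparm without SSE/SSE2 enabled"); with -m32 -msse2
   the SFmode/DFmode arguments are passed in %xmm registers instead of on
   the stack.  */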
5273 /* Return true if EAX is live at the start of the function. Used by
5274 ix86_expand_prologue to determine if we need special help before
5275 calling allocate_stack_worker. */
5277 static bool
5278 ix86_eax_live_at_start_p (void)
5280 /* Cheat. Don't bother working forward from ix86_function_regparm
5281 to the function type to whether an actual argument is located in
5282 eax. Instead just look at cfg info, which is still close enough
5283 to correct at this point. This gives false positives for broken
5284 functions that might use uninitialized data that happens to be
5285 allocated in eax, but who cares? */
5286 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
5289 static bool
5290 ix86_keep_aggregate_return_pointer (tree fntype)
5292 tree attr;
5294 if (!TARGET_64BIT)
5296 attr = lookup_attribute ("callee_pop_aggregate_return",
5297 TYPE_ATTRIBUTES (fntype));
5298 if (attr)
5299 return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);
5301 /* For 32-bit MS-ABI the default is to keep aggregate
5302 return pointer. */
5303 if (ix86_function_type_abi (fntype) == MS_ABI)
5304 return true;
5306 return KEEP_AGGREGATE_RETURN_POINTER != 0;
5309 /* Value is the number of bytes of arguments automatically
5310 popped when returning from a subroutine call.
5311 FUNDECL is the declaration node of the function (as a tree),
5312 FUNTYPE is the data type of the function (as a tree),
5313 or for a library call it is an identifier node for the subroutine name.
5314 SIZE is the number of bytes of arguments passed on the stack.
5316 On the 80386, the RTD insn may be used to pop them if the number
5317 of args is fixed, but if the number is variable then the caller
5318 must pop them all. RTD can't be used for library calls now
5319 because the library is compiled with the Unix compiler.
5320 Use of RTD is a selectable option, since it is incompatible with
5321 standard Unix calling sequences. If the option is not selected,
5322 the caller must always pop the args.
5324 The attribute stdcall is equivalent to RTD on a per module basis. */
5326 static int
5327 ix86_return_pops_args (tree fundecl, tree funtype, int size)
5329 unsigned int ccvt;
5331 /* None of the 64-bit ABIs pop arguments. */
5332 if (TARGET_64BIT)
5333 return 0;
5335 ccvt = ix86_get_callcvt (funtype);
5337 if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
5338 | IX86_CALLCVT_THISCALL)) != 0
5339 && ! stdarg_p (funtype))
5340 return size;
5342 /* Lose any fake structure return argument if it is passed on the stack. */
5343 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
5344 && !ix86_keep_aggregate_return_pointer (funtype))
5346 int nregs = ix86_function_regparm (funtype, fundecl);
5347 if (nregs == 0)
5348 return GET_MODE_SIZE (Pmode);
5351 return 0;
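/* Illustrative note (editorial): for

     int __attribute__ ((stdcall)) cb (int a, int b);

   the hook above returns 8 in 32-bit mode, i.e. the callee pops its two
   stack arguments with "ret $8"; a variadic or plain cdecl function
   returns 0 and leaves the cleanup to the caller.  */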
5354 /* Implement the TARGET_LEGITIMATE_COMBINED_INSN hook. */
5356 static bool
5357 ix86_legitimate_combined_insn (rtx insn)
5359 /* Check operand constraints in case hard registers were propagated
5360 into insn pattern. This check prevents combine pass from
5361 generating insn patterns with invalid hard register operands.
5362 These invalid insns can eventually confuse reload to error out
5363 with a spill failure. See also PRs 46829 and 46843. */
5364 if ((INSN_CODE (insn) = recog (PATTERN (insn), insn, 0)) >= 0)
5366 int i;
5368 extract_insn (insn);
5369 preprocess_constraints ();
5371 for (i = 0; i < recog_data.n_operands; i++)
5373 rtx op = recog_data.operand[i];
5374 enum machine_mode mode = GET_MODE (op);
5375 struct operand_alternative *op_alt;
5376 int offset = 0;
5377 bool win;
5378 int j;
5380 /* For pre-AVX disallow unaligned loads/stores where the
5381 instructions don't support it. */
5382 if (!TARGET_AVX
5383 && VECTOR_MODE_P (GET_MODE (op))
5384 && misaligned_operand (op, GET_MODE (op)))
5386 int min_align = get_attr_ssememalign (insn);
5387 if (min_align == 0)
5388 return false;
5391 /* A unary operator may be accepted by the predicate, but it
5392 is irrelevant for matching constraints. */
5393 if (UNARY_P (op))
5394 op = XEXP (op, 0);
5396 if (GET_CODE (op) == SUBREG)
5398 if (REG_P (SUBREG_REG (op))
5399 && REGNO (SUBREG_REG (op)) < FIRST_PSEUDO_REGISTER)
5400 offset = subreg_regno_offset (REGNO (SUBREG_REG (op)),
5401 GET_MODE (SUBREG_REG (op)),
5402 SUBREG_BYTE (op),
5403 GET_MODE (op));
5404 op = SUBREG_REG (op);
5407 if (!(REG_P (op) && HARD_REGISTER_P (op)))
5408 continue;
5410 op_alt = recog_op_alt[i];
5412 /* Operand has no constraints, anything is OK. */
5413 win = !recog_data.n_alternatives;
5415 for (j = 0; j < recog_data.n_alternatives; j++)
5417 if (op_alt[j].anything_ok
5418 || (op_alt[j].matches != -1
5419 && operands_match_p
5420 (recog_data.operand[i],
5421 recog_data.operand[op_alt[j].matches]))
5422 || reg_fits_class_p (op, op_alt[j].cl, offset, mode))
5424 win = true;
5425 break;
5429 if (!win)
5430 return false;
5434 return true;
5437 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
5439 static unsigned HOST_WIDE_INT
5440 ix86_asan_shadow_offset (void)
5442 return TARGET_LP64 ? (TARGET_MACHO ? (HOST_WIDE_INT_1 << 44)
5443 : HOST_WIDE_INT_C (0x7fff8000))
5444 : (HOST_WIDE_INT_1 << 29);
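/* Illustrative note (editorial): AddressSanitizer maps an application
   address to its shadow byte as

     shadow = (addr >> 3) + ix86_asan_shadow_offset ();

   so the offset above is 0x7fff8000 for 64-bit Linux, 1 << 44 for 64-bit
   Mach-O, and 1 << 29 for 32-bit targets.  */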
5447 /* Argument support functions. */
5449 /* Return true when register may be used to pass function parameters. */
5450 bool
5451 ix86_function_arg_regno_p (int regno)
5453 int i;
5454 const int *parm_regs;
5456 if (!TARGET_64BIT)
5458 if (TARGET_MACHO)
5459 return (regno < REGPARM_MAX
5460 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
5461 else
5462 return (regno < REGPARM_MAX
5463 || (TARGET_MMX && MMX_REGNO_P (regno)
5464 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
5465 || (TARGET_SSE && SSE_REGNO_P (regno)
5466 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
5469 if (TARGET_MACHO)
5471 if (SSE_REGNO_P (regno) && TARGET_SSE)
5472 return true;
5474 else
5476 if (TARGET_SSE && SSE_REGNO_P (regno)
5477 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
5478 return true;
5481 /* TODO: The function should depend on current function ABI but
5482 builtins.c would need updating then. Therefore we use the
5483 default ABI. */
5485 /* RAX is used as hidden argument to va_arg functions. */
5486 if (ix86_abi == SYSV_ABI && regno == AX_REG)
5487 return true;
5489 if (ix86_abi == MS_ABI)
5490 parm_regs = x86_64_ms_abi_int_parameter_registers;
5491 else
5492 parm_regs = x86_64_int_parameter_registers;
5493 for (i = 0; i < (ix86_abi == MS_ABI
5494 ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
5495 if (regno == parm_regs[i])
5496 return true;
5497 return false;
5500 /* Return if we do not know how to pass TYPE solely in registers. */
5502 static bool
5503 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
5505 if (must_pass_in_stack_var_size_or_pad (mode, type))
5506 return true;
5508 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
5509 The layout_type routine is crafty and tries to trick us into passing
5510 currently unsupported vector types on the stack by using TImode. */
5511 return (!TARGET_64BIT && mode == TImode
5512 && type && TREE_CODE (type) != VECTOR_TYPE);
5515 /* Return the size, in bytes, of the area reserved for arguments passed
5516 in registers for the function represented by FNDECL, depending on the
5517 ABI used. */
5518 int
5519 ix86_reg_parm_stack_space (const_tree fndecl)
5521 enum calling_abi call_abi = SYSV_ABI;
5522 if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
5523 call_abi = ix86_function_abi (fndecl);
5524 else
5525 call_abi = ix86_function_type_abi (fndecl);
5526 if (TARGET_64BIT && call_abi == MS_ABI)
5527 return 32;
5528 return 0;
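/* Illustrative note (editorial): the 32 bytes returned for the 64-bit MS
   ABI are the "home" (shadow) area the caller must reserve for the four
   register parameters, so a caller typically emits something like

     subq  $40, %rsp        # 32-byte shadow space plus alignment padding
     call  callee

   while the SysV ABI reserves no such area, hence the 0.  */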
5531 /* Returns value SYSV_ABI, MS_ABI dependent on fntype, specifying the
5532 call abi used. */
5533 enum calling_abi
5534 ix86_function_type_abi (const_tree fntype)
5536 if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
5538 enum calling_abi abi = ix86_abi;
5539 if (abi == SYSV_ABI)
5541 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
5542 abi = MS_ABI;
5544 else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
5545 abi = SYSV_ABI;
5546 return abi;
5548 return ix86_abi;
5551 static bool
5552 ix86_function_ms_hook_prologue (const_tree fn)
5554 if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
5556 if (decl_function_context (fn) != NULL_TREE)
5557 error_at (DECL_SOURCE_LOCATION (fn),
5558 "ms_hook_prologue is not compatible with nested function");
5559 else
5560 return true;
5562 return false;
5565 static enum calling_abi
5566 ix86_function_abi (const_tree fndecl)
5568 if (! fndecl)
5569 return ix86_abi;
5570 return ix86_function_type_abi (TREE_TYPE (fndecl));
5573 /* Returns value SYSV_ABI, MS_ABI dependent on cfun, specifying the
5574 call abi used. */
5575 enum calling_abi
5576 ix86_cfun_abi (void)
5578 if (! cfun)
5579 return ix86_abi;
5580 return cfun->machine->call_abi;
5583 /* Write the extra assembler code needed to declare a function properly. */
5585 void
5586 ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
5587 tree decl)
5589 bool is_ms_hook = ix86_function_ms_hook_prologue (decl);
5591 if (is_ms_hook)
5593 int i, filler_count = (TARGET_64BIT ? 32 : 16);
5594 unsigned int filler_cc = 0xcccccccc;
5596 for (i = 0; i < filler_count; i += 4)
5597 fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
5600 #ifdef SUBTARGET_ASM_UNWIND_INIT
5601 SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
5602 #endif
5604 ASM_OUTPUT_LABEL (asm_out_file, fname);
5606 /* Output magic byte marker, if hot-patch attribute is set. */
5607 if (is_ms_hook)
5609 if (TARGET_64BIT)
5611 /* leaq [%rsp + 0], %rsp */
5612 asm_fprintf (asm_out_file, ASM_BYTE
5613 "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
5615 else
5617 /* movl.s %edi, %edi
5618 push %ebp
5619 movl.s %esp, %ebp */
5620 asm_fprintf (asm_out_file, ASM_BYTE
5621 "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
5626 /* regclass.c */
5627 extern void init_regs (void);
5629 /* Implementation of the call ABI switching target hook. The call
5630 register sets specific to FNDECL are selected. See also
5631 ix86_conditional_register_usage for more details. */
5632 void
5633 ix86_call_abi_override (const_tree fndecl)
5635 if (fndecl == NULL_TREE)
5636 cfun->machine->call_abi = ix86_abi;
5637 else
5638 cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
5641 /* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
5642 Avoid expensive re-initialization of init_regs each time we switch function
5643 context, since this is needed only during RTL expansion. */
5644 static void
5645 ix86_maybe_switch_abi (void)
5647 if (TARGET_64BIT &&
5648 call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
5649 reinit_regs ();
5652 /* Initialize a variable CUM of type CUMULATIVE_ARGS
5653 for a call to a function whose data type is FNTYPE.
5654 For a library call, FNTYPE is 0. */
5656 void
5657 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
5658 tree fntype, /* tree ptr for function decl */
5659 rtx libname, /* SYMBOL_REF of library name or 0 */
5660 tree fndecl,
5661 int caller)
5663 struct cgraph_local_info *i;
5665 memset (cum, 0, sizeof (*cum));
5667 if (fndecl)
5669 i = cgraph_local_info (fndecl);
5670 cum->call_abi = ix86_function_abi (fndecl);
5672 else
5674 i = NULL;
5675 cum->call_abi = ix86_function_type_abi (fntype);
5678 cum->caller = caller;
5680 /* Set up the number of registers to use for passing arguments. */
5682 if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
5683 sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
5684 "or subtarget optimization implying it");
5685 cum->nregs = ix86_regparm;
5686 if (TARGET_64BIT)
5688 cum->nregs = (cum->call_abi == SYSV_ABI
5689 ? X86_64_REGPARM_MAX
5690 : X86_64_MS_REGPARM_MAX);
5692 if (TARGET_SSE)
5694 cum->sse_nregs = SSE_REGPARM_MAX;
5695 if (TARGET_64BIT)
5697 cum->sse_nregs = (cum->call_abi == SYSV_ABI
5698 ? X86_64_SSE_REGPARM_MAX
5699 : X86_64_MS_SSE_REGPARM_MAX);
5702 if (TARGET_MMX)
5703 cum->mmx_nregs = MMX_REGPARM_MAX;
5704 cum->warn_avx = true;
5705 cum->warn_sse = true;
5706 cum->warn_mmx = true;
5708 /* Because the type might mismatch between caller and callee, we need to
5709 use the actual type of the function for local calls.
5710 FIXME: cgraph_analyze can be told to actually record whether a function
5711 uses va_start, so for local functions maybe_vaarg can be made more
5712 aggressive, helping K&R code.
5713 FIXME: once the type system is fixed, we won't need this code anymore. */
5714 if (i && i->local && i->can_change_signature)
5715 fntype = TREE_TYPE (fndecl);
5716 cum->maybe_vaarg = (fntype
5717 ? (!prototype_p (fntype) || stdarg_p (fntype))
5718 : !libname);
5720 if (!TARGET_64BIT)
5722 /* If there are variable arguments, then we won't pass anything
5723 in registers in 32-bit mode. */
5724 if (stdarg_p (fntype))
5726 cum->nregs = 0;
5727 cum->sse_nregs = 0;
5728 cum->mmx_nregs = 0;
5729 cum->warn_avx = false;
5730 cum->warn_sse = false;
5731 cum->warn_mmx = false;
5732 return;
5735 /* Use ecx and edx registers if function has fastcall attribute,
5736 else look for regparm information. */
5737 if (fntype)
5739 unsigned int ccvt = ix86_get_callcvt (fntype);
5740 if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
5742 cum->nregs = 1;
5743 cum->fastcall = 1; /* Same first register as in fastcall. */
5745 else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
5747 cum->nregs = 2;
5748 cum->fastcall = 1;
5750 else
5751 cum->nregs = ix86_function_regparm (fntype, fndecl);
5754 /* Set up the number of SSE registers used for passing SFmode
5755 and DFmode arguments. Warn for mismatching ABI. */
5756 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
5760 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
5761 But in the case of vector types, it is some vector mode.
5763 When we have only some of our vector isa extensions enabled, then there
5764 are some modes for which vector_mode_supported_p is false. For these
5765 modes, the generic vector support in gcc will choose some non-vector mode
5766 in order to implement the type. By computing the natural mode, we'll
5767 select the proper ABI location for the operand and not depend on whatever
5768 the middle-end decides to do with these vector types.
5770 The middle-end can't deal with vector types > 16 bytes. In this
5771 case, we return the original mode and warn about the ABI change if
5772 CUM isn't NULL.
5774 If IN_RETURN is true, warn about the ABI change if the vector mode
5775 isn't available for the function return value. */
5777 static enum machine_mode
5778 type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum,
5779 bool in_return)
5781 enum machine_mode mode = TYPE_MODE (type);
5783 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
5785 HOST_WIDE_INT size = int_size_in_bytes (type);
5786 if ((size == 8 || size == 16 || size == 32)
5787 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
5788 && TYPE_VECTOR_SUBPARTS (type) > 1)
5790 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
5792 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
5793 mode = MIN_MODE_VECTOR_FLOAT;
5794 else
5795 mode = MIN_MODE_VECTOR_INT;
5797 /* Get the mode which has this inner mode and number of units. */
5798 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
5799 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
5800 && GET_MODE_INNER (mode) == innermode)
5802 if (size == 32 && !TARGET_AVX)
5804 static bool warnedavx;
5805 static bool warnedavx_ret;
5807 if (cum && cum->warn_avx && !warnedavx)
5809 if (warning (OPT_Wpsabi, "AVX vector argument "
5810 "without AVX enabled changes the ABI"))
5811 warnedavx = true;
5813 else if (in_return && !warnedavx_ret)
5815 if (warning (OPT_Wpsabi, "AVX vector return "
5816 "without AVX enabled changes the ABI"))
5817 warnedavx_ret = true;
5820 return TYPE_MODE (type);
5822 else if (((size == 8 && TARGET_64BIT) || size == 16)
5823 && !TARGET_SSE)
5825 static bool warnedsse;
5826 static bool warnedsse_ret;
5828 if (cum && cum->warn_sse && !warnedsse)
5830 if (warning (OPT_Wpsabi, "SSE vector argument "
5831 "without SSE enabled changes the ABI"))
5832 warnedsse = true;
5834 else if (!TARGET_64BIT && in_return && !warnedsse_ret)
5836 if (warning (OPT_Wpsabi, "SSE vector return "
5837 "without SSE enabled changes the ABI"))
5838 warnedsse_ret = true;
5841 else if ((size == 8 && !TARGET_64BIT) && !TARGET_MMX)
5843 static bool warnedmmx;
5844 static bool warnedmmx_ret;
5846 if (cum && cum->warn_mmx && !warnedmmx)
5848 if (warning (OPT_Wpsabi, "MMX vector argument "
5849 "without MMX enabled changes the ABI"))
5850 warnedmmx = true;
5852 else if (in_return && !warnedmmx_ret)
5854 if (warning (OPT_Wpsabi, "MMX vector return "
5855 "without MMX enabled changes the ABI"))
5856 warnedmmx_ret = true;
5859 return mode;
5862 gcc_unreachable ();
5866 return mode;
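/* Illustrative note (editorial): for a GNU vector type such as

     typedef int v8si __attribute__ ((vector_size (32)));

   the loop above finds V8SImode; if the translation unit is compiled
   without -mavx, the function warns with -Wpsabi ("AVX vector argument
   without AVX enabled changes the ABI") and returns the original
   TYPE_MODE instead, so the value is not passed in a %ymm register.  */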
5869 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5870 this may not agree with the mode that the type system has chosen for the
5871 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5872 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5874 static rtx
5875 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5876 unsigned int regno)
5878 rtx tmp;
5880 if (orig_mode != BLKmode)
5881 tmp = gen_rtx_REG (orig_mode, regno);
5882 else
5884 tmp = gen_rtx_REG (mode, regno);
5885 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5886 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5889 return tmp;
5892 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
5893 of this code is to classify each 8bytes of incoming argument by the register
5894 class and assign registers accordingly. */
5896 /* Return the union class of CLASS1 and CLASS2.
5897 See the x86-64 PS ABI for details. */
5899 static enum x86_64_reg_class
5900 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5902 /* Rule #1: If both classes are equal, this is the resulting class. */
5903 if (class1 == class2)
5904 return class1;
5906 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5907 the other class. */
5908 if (class1 == X86_64_NO_CLASS)
5909 return class2;
5910 if (class2 == X86_64_NO_CLASS)
5911 return class1;
5913 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5914 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5915 return X86_64_MEMORY_CLASS;
5917 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5918 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5919 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5920 return X86_64_INTEGERSI_CLASS;
5921 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5922 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5923 return X86_64_INTEGER_CLASS;
5925 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5926 MEMORY is used. */
5927 if (class1 == X86_64_X87_CLASS
5928 || class1 == X86_64_X87UP_CLASS
5929 || class1 == X86_64_COMPLEX_X87_CLASS
5930 || class2 == X86_64_X87_CLASS
5931 || class2 == X86_64_X87UP_CLASS
5932 || class2 == X86_64_COMPLEX_X87_CLASS)
5933 return X86_64_MEMORY_CLASS;
5935 /* Rule #6: Otherwise class SSE is used. */
5936 return X86_64_SSE_CLASS;
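/* Illustrative note (editorial): for

     struct s { int i; float f; };

   both fields land in the same eightbyte; the int classifies as
   X86_64_INTEGERSI_CLASS and the float at offset 4 as X86_64_SSE_CLASS,
   so rule #4 above merges them to X86_64_INTEGER_CLASS and the whole
   struct travels in a single general-purpose register.  */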
5939 /* Classify the argument of type TYPE and mode MODE.
5940 CLASSES will be filled by the register class used to pass each word
5941 of the operand. The number of words is returned. In case the parameter
5942 should be passed in memory, 0 is returned. As a special case for zero
5943 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5945 BIT_OFFSET is used internally for handling records and specifies the
5946 offset in bits modulo 256 to avoid overflow cases.
5948 See the x86-64 PS ABI for details.
5951 static int
5952 classify_argument (enum machine_mode mode, const_tree type,
5953 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5955 HOST_WIDE_INT bytes =
5956 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5957 int words
5958 = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5960 /* Variable sized entities are always passed/returned in memory. */
5961 if (bytes < 0)
5962 return 0;
5964 if (mode != VOIDmode
5965 && targetm.calls.must_pass_in_stack (mode, type))
5966 return 0;
5968 if (type && AGGREGATE_TYPE_P (type))
5970 int i;
5971 tree field;
5972 enum x86_64_reg_class subclasses[MAX_CLASSES];
5974 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5975 if (bytes > 32)
5976 return 0;
5978 for (i = 0; i < words; i++)
5979 classes[i] = X86_64_NO_CLASS;
5981 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
5982 signal the memory class, so handle this as a special case. */
5983 if (!words)
5985 classes[0] = X86_64_NO_CLASS;
5986 return 1;
5989 /* Classify each field of record and merge classes. */
5990 switch (TREE_CODE (type))
5992 case RECORD_TYPE:
5993 /* And now merge the fields of structure. */
5994 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5996 if (TREE_CODE (field) == FIELD_DECL)
5998 int num;
6000 if (TREE_TYPE (field) == error_mark_node)
6001 continue;
6003 /* Bitfields are always classified as integer. Handle them
6004 early, since later code would consider them to be
6005 misaligned integers. */
6006 if (DECL_BIT_FIELD (field))
6008 for (i = (int_bit_position (field)
6009 + (bit_offset % 64)) / 8 / 8;
6010 i < ((int_bit_position (field) + (bit_offset % 64))
6011 + tree_low_cst (DECL_SIZE (field), 0)
6012 + 63) / 8 / 8; i++)
6013 classes[i] =
6014 merge_classes (X86_64_INTEGER_CLASS,
6015 classes[i]);
6017 else
6019 int pos;
6021 type = TREE_TYPE (field);
6023 /* Flexible array member is ignored. */
6024 if (TYPE_MODE (type) == BLKmode
6025 && TREE_CODE (type) == ARRAY_TYPE
6026 && TYPE_SIZE (type) == NULL_TREE
6027 && TYPE_DOMAIN (type) != NULL_TREE
6028 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
6029 == NULL_TREE))
6031 static bool warned;
6033 if (!warned && warn_psabi)
6035 warned = true;
6036 inform (input_location,
6037 "the ABI of passing struct with"
6038 " a flexible array member has"
6039 " changed in GCC 4.4");
6041 continue;
6043 num = classify_argument (TYPE_MODE (type), type,
6044 subclasses,
6045 (int_bit_position (field)
6046 + bit_offset) % 256);
6047 if (!num)
6048 return 0;
6049 pos = (int_bit_position (field)
6050 + (bit_offset % 64)) / 8 / 8;
6051 for (i = 0; i < num && (i + pos) < words; i++)
6052 classes[i + pos] =
6053 merge_classes (subclasses[i], classes[i + pos]);
6057 break;
6059 case ARRAY_TYPE:
6060 /* Arrays are handled as small records. */
6062 int num;
6063 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
6064 TREE_TYPE (type), subclasses, bit_offset);
6065 if (!num)
6066 return 0;
6068 /* The partial classes are now full classes. */
6069 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
6070 subclasses[0] = X86_64_SSE_CLASS;
6071 if (subclasses[0] == X86_64_INTEGERSI_CLASS
6072 && !((bit_offset % 64) == 0 && bytes == 4))
6073 subclasses[0] = X86_64_INTEGER_CLASS;
6075 for (i = 0; i < words; i++)
6076 classes[i] = subclasses[i % num];
6078 break;
6080 case UNION_TYPE:
6081 case QUAL_UNION_TYPE:
6082 /* Unions are similar to RECORD_TYPE but the offset is always 0. */
6084 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6086 if (TREE_CODE (field) == FIELD_DECL)
6088 int num;
6090 if (TREE_TYPE (field) == error_mark_node)
6091 continue;
6093 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
6094 TREE_TYPE (field), subclasses,
6095 bit_offset);
6096 if (!num)
6097 return 0;
6098 for (i = 0; i < num; i++)
6099 classes[i] = merge_classes (subclasses[i], classes[i]);
6102 break;
6104 default:
6105 gcc_unreachable ();
6108 if (words > 2)
6110 /* When size > 16 bytes, if the first one isn't
6111 X86_64_SSE_CLASS or any other ones aren't
6112 X86_64_SSEUP_CLASS, everything should be passed in
6113 memory. */
6114 if (classes[0] != X86_64_SSE_CLASS)
6115 return 0;
6117 for (i = 1; i < words; i++)
6118 if (classes[i] != X86_64_SSEUP_CLASS)
6119 return 0;
6122 /* Final merger cleanup. */
6123 for (i = 0; i < words; i++)
6125 /* If one class is MEMORY, everything should be passed in
6126 memory. */
6127 if (classes[i] == X86_64_MEMORY_CLASS)
6128 return 0;
6130 /* The X86_64_SSEUP_CLASS should be always preceded by
6131 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
6132 if (classes[i] == X86_64_SSEUP_CLASS
6133 && classes[i - 1] != X86_64_SSE_CLASS
6134 && classes[i - 1] != X86_64_SSEUP_CLASS)
6136 /* The first one should never be X86_64_SSEUP_CLASS. */
6137 gcc_assert (i != 0);
6138 classes[i] = X86_64_SSE_CLASS;
6141 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
6142 everything should be passed in memory. */
6143 if (classes[i] == X86_64_X87UP_CLASS
6144 && (classes[i - 1] != X86_64_X87_CLASS))
6146 static bool warned;
6148 /* The first one should never be X86_64_X87UP_CLASS. */
6149 gcc_assert (i != 0);
6150 if (!warned && warn_psabi)
6152 warned = true;
6153 inform (input_location,
6154 "the ABI of passing union with long double"
6155 " has changed in GCC 4.4");
6157 return 0;
6160 return words;
6163 /* Compute the alignment needed. We align all types to natural boundaries,
6164 with the exception of XFmode, which is aligned to 64 bits. */
6165 if (mode != VOIDmode && mode != BLKmode)
6167 int mode_alignment = GET_MODE_BITSIZE (mode);
6169 if (mode == XFmode)
6170 mode_alignment = 128;
6171 else if (mode == XCmode)
6172 mode_alignment = 256;
6173 if (COMPLEX_MODE_P (mode))
6174 mode_alignment /= 2;
6175 /* Misaligned fields are always returned in memory. */
6176 if (bit_offset % mode_alignment)
6177 return 0;
6180 /* for V1xx modes, just use the base mode */
6181 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
6182 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
6183 mode = GET_MODE_INNER (mode);
6185 /* Classification of atomic types. */
6186 switch (mode)
6188 case SDmode:
6189 case DDmode:
6190 classes[0] = X86_64_SSE_CLASS;
6191 return 1;
6192 case TDmode:
6193 classes[0] = X86_64_SSE_CLASS;
6194 classes[1] = X86_64_SSEUP_CLASS;
6195 return 2;
6196 case DImode:
6197 case SImode:
6198 case HImode:
6199 case QImode:
6200 case CSImode:
6201 case CHImode:
6202 case CQImode:
6204 int size = bit_offset + (int) GET_MODE_BITSIZE (mode);
6206 /* Analyze last 128 bits only. */
6207 size = (size - 1) & 0x7f;
6209 if (size < 32)
6211 classes[0] = X86_64_INTEGERSI_CLASS;
6212 return 1;
6214 else if (size < 64)
6216 classes[0] = X86_64_INTEGER_CLASS;
6217 return 1;
6219 else if (size < 64+32)
6221 classes[0] = X86_64_INTEGER_CLASS;
6222 classes[1] = X86_64_INTEGERSI_CLASS;
6223 return 2;
6225 else if (size < 64+64)
6227 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6228 return 2;
6230 else
6231 gcc_unreachable ();
6233 case CDImode:
6234 case TImode:
6235 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
6236 return 2;
6237 case COImode:
6238 case OImode:
6239 /* OImode shouldn't be used directly. */
6240 gcc_unreachable ();
6241 case CTImode:
6242 return 0;
6243 case SFmode:
6244 if (!(bit_offset % 64))
6245 classes[0] = X86_64_SSESF_CLASS;
6246 else
6247 classes[0] = X86_64_SSE_CLASS;
6248 return 1;
6249 case DFmode:
6250 classes[0] = X86_64_SSEDF_CLASS;
6251 return 1;
6252 case XFmode:
6253 classes[0] = X86_64_X87_CLASS;
6254 classes[1] = X86_64_X87UP_CLASS;
6255 return 2;
6256 case TFmode:
6257 classes[0] = X86_64_SSE_CLASS;
6258 classes[1] = X86_64_SSEUP_CLASS;
6259 return 2;
6260 case SCmode:
6261 classes[0] = X86_64_SSE_CLASS;
6262 if (!(bit_offset % 64))
6263 return 1;
6264 else
6266 static bool warned;
6268 if (!warned && warn_psabi)
6270 warned = true;
6271 inform (input_location,
6272 "the ABI of passing structure with complex float"
6273 " member has changed in GCC 4.4");
6275 classes[1] = X86_64_SSESF_CLASS;
6276 return 2;
6278 case DCmode:
6279 classes[0] = X86_64_SSEDF_CLASS;
6280 classes[1] = X86_64_SSEDF_CLASS;
6281 return 2;
6282 case XCmode:
6283 classes[0] = X86_64_COMPLEX_X87_CLASS;
6284 return 1;
6285 case TCmode:
6286 /* This mode is larger than 16 bytes. */
6287 return 0;
6288 case V8SFmode:
6289 case V8SImode:
6290 case V32QImode:
6291 case V16HImode:
6292 case V4DFmode:
6293 case V4DImode:
6294 classes[0] = X86_64_SSE_CLASS;
6295 classes[1] = X86_64_SSEUP_CLASS;
6296 classes[2] = X86_64_SSEUP_CLASS;
6297 classes[3] = X86_64_SSEUP_CLASS;
6298 return 4;
6299 case V4SFmode:
6300 case V4SImode:
6301 case V16QImode:
6302 case V8HImode:
6303 case V2DFmode:
6304 case V2DImode:
6305 classes[0] = X86_64_SSE_CLASS;
6306 classes[1] = X86_64_SSEUP_CLASS;
6307 return 2;
6308 case V1TImode:
6309 case V1DImode:
6310 case V2SFmode:
6311 case V2SImode:
6312 case V4HImode:
6313 case V8QImode:
6314 classes[0] = X86_64_SSE_CLASS;
6315 return 1;
6316 case BLKmode:
6317 case VOIDmode:
6318 return 0;
6319 default:
6320 gcc_assert (VECTOR_MODE_P (mode));
6322 if (bytes > 16)
6323 return 0;
6325 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6327 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6328 classes[0] = X86_64_INTEGERSI_CLASS;
6329 else
6330 classes[0] = X86_64_INTEGER_CLASS;
6331 classes[1] = X86_64_INTEGER_CLASS;
6332 return 1 + (bytes > 8);
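/* Illustrative note (editorial): for

     struct p { double x; long n; };

   classify_argument returns 2 with classes[0] = X86_64_SSEDF_CLASS and
   classes[1] = X86_64_INTEGER_CLASS, so construct_container below builds
   a PARALLEL that places X in an SSE register and N in a general-purpose
   register (e.g. %xmm0 and %rdi when the struct is the first argument).  */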
6336 /* Examine the argument and return set number of register required in each
6337 class. Return 0 iff parameter should be passed in memory. */
6338 static int
6339 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6340 int *int_nregs, int *sse_nregs)
6342 enum x86_64_reg_class regclass[MAX_CLASSES];
6343 int n = classify_argument (mode, type, regclass, 0);
6345 *int_nregs = 0;
6346 *sse_nregs = 0;
6347 if (!n)
6348 return 0;
6349 for (n--; n >= 0; n--)
6350 switch (regclass[n])
6352 case X86_64_INTEGER_CLASS:
6353 case X86_64_INTEGERSI_CLASS:
6354 (*int_nregs)++;
6355 break;
6356 case X86_64_SSE_CLASS:
6357 case X86_64_SSESF_CLASS:
6358 case X86_64_SSEDF_CLASS:
6359 (*sse_nregs)++;
6360 break;
6361 case X86_64_NO_CLASS:
6362 case X86_64_SSEUP_CLASS:
6363 break;
6364 case X86_64_X87_CLASS:
6365 case X86_64_X87UP_CLASS:
6366 if (!in_return)
6367 return 0;
6368 break;
6369 case X86_64_COMPLEX_X87_CLASS:
6370 return in_return ? 2 : 0;
6371 case X86_64_MEMORY_CLASS:
6372 gcc_unreachable ();
6374 return 1;
6377 /* Construct container for the argument used by GCC interface. See
6378 FUNCTION_ARG for the detailed description. */
6380 static rtx
6381 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6382 const_tree type, int in_return, int nintregs, int nsseregs,
6383 const int *intreg, int sse_regno)
6385 /* The following variables hold the static issued_error state. */
6386 static bool issued_sse_arg_error;
6387 static bool issued_sse_ret_error;
6388 static bool issued_x87_ret_error;
6390 enum machine_mode tmpmode;
6391 int bytes =
6392 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6393 enum x86_64_reg_class regclass[MAX_CLASSES];
6394 int n;
6395 int i;
6396 int nexps = 0;
6397 int needed_sseregs, needed_intregs;
6398 rtx exp[MAX_CLASSES];
6399 rtx ret;
6401 n = classify_argument (mode, type, regclass, 0);
6402 if (!n)
6403 return NULL;
6404 if (!examine_argument (mode, type, in_return, &needed_intregs,
6405 &needed_sseregs))
6406 return NULL;
6407 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6408 return NULL;
6410 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6411 some less clueful developer tries to use floating-point anyway. */
6412 if (needed_sseregs && !TARGET_SSE)
6414 if (in_return)
6416 if (!issued_sse_ret_error)
6418 error ("SSE register return with SSE disabled");
6419 issued_sse_ret_error = true;
6422 else if (!issued_sse_arg_error)
6424 error ("SSE register argument with SSE disabled");
6425 issued_sse_arg_error = true;
6427 return NULL;
6430 /* Likewise, error if the ABI requires us to return values in the
6431 x87 registers and the user specified -mno-80387. */
6432 if (in_return && !TARGET_FLOAT_RETURNS_IN_80387)
6433 for (i = 0; i < n; i++)
6434 if (regclass[i] == X86_64_X87_CLASS
6435 || regclass[i] == X86_64_X87UP_CLASS
6436 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6438 if (!issued_x87_ret_error)
6440 error ("x87 register return with x87 disabled");
6441 issued_x87_ret_error = true;
6443 return NULL;
6446 /* First construct simple cases. Avoid SCmode, since we want to use
6447 single register to pass this type. */
6448 if (n == 1 && mode != SCmode)
6449 switch (regclass[0])
6451 case X86_64_INTEGER_CLASS:
6452 case X86_64_INTEGERSI_CLASS:
6453 return gen_rtx_REG (mode, intreg[0]);
6454 case X86_64_SSE_CLASS:
6455 case X86_64_SSESF_CLASS:
6456 case X86_64_SSEDF_CLASS:
6457 if (mode != BLKmode)
6458 return gen_reg_or_parallel (mode, orig_mode,
6459 SSE_REGNO (sse_regno));
6460 break;
6461 case X86_64_X87_CLASS:
6462 case X86_64_COMPLEX_X87_CLASS:
6463 return gen_rtx_REG (mode, FIRST_STACK_REG);
6464 case X86_64_NO_CLASS:
6465 /* Zero sized array, struct or class. */
6466 return NULL;
6467 default:
6468 gcc_unreachable ();
6470 if (n == 2
6471 && regclass[0] == X86_64_SSE_CLASS
6472 && regclass[1] == X86_64_SSEUP_CLASS
6473 && mode != BLKmode)
6474 return gen_reg_or_parallel (mode, orig_mode,
6475 SSE_REGNO (sse_regno));
6476 if (n == 4
6477 && regclass[0] == X86_64_SSE_CLASS
6478 && regclass[1] == X86_64_SSEUP_CLASS
6479 && regclass[2] == X86_64_SSEUP_CLASS
6480 && regclass[3] == X86_64_SSEUP_CLASS
6481 && mode != BLKmode)
6482 return gen_reg_or_parallel (mode, orig_mode,
6483 SSE_REGNO (sse_regno));
6484 if (n == 2
6485 && regclass[0] == X86_64_X87_CLASS
6486 && regclass[1] == X86_64_X87UP_CLASS)
6487 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6489 if (n == 2
6490 && regclass[0] == X86_64_INTEGER_CLASS
6491 && regclass[1] == X86_64_INTEGER_CLASS
6492 && (mode == CDImode || mode == TImode)
6493 && intreg[0] + 1 == intreg[1])
6494 return gen_rtx_REG (mode, intreg[0]);
6496 /* Otherwise figure out the entries of the PARALLEL. */
6497 for (i = 0; i < n; i++)
6499 int pos;
6501 switch (regclass[i])
6503 case X86_64_NO_CLASS:
6504 break;
6505 case X86_64_INTEGER_CLASS:
6506 case X86_64_INTEGERSI_CLASS:
6507 /* Merge TImodes on aligned occasions here too. */
6508 if (i * 8 + 8 > bytes)
6509 tmpmode
6510 = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6511 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6512 tmpmode = SImode;
6513 else
6514 tmpmode = DImode;
6515 /* We've requested 24 bytes for which we
6516 don't have a mode. Use DImode. */
6517 if (tmpmode == BLKmode)
6518 tmpmode = DImode;
6519 exp [nexps++]
6520 = gen_rtx_EXPR_LIST (VOIDmode,
6521 gen_rtx_REG (tmpmode, *intreg),
6522 GEN_INT (i*8));
6523 intreg++;
6524 break;
6525 case X86_64_SSESF_CLASS:
6526 exp [nexps++]
6527 = gen_rtx_EXPR_LIST (VOIDmode,
6528 gen_rtx_REG (SFmode,
6529 SSE_REGNO (sse_regno)),
6530 GEN_INT (i*8));
6531 sse_regno++;
6532 break;
6533 case X86_64_SSEDF_CLASS:
6534 exp [nexps++]
6535 = gen_rtx_EXPR_LIST (VOIDmode,
6536 gen_rtx_REG (DFmode,
6537 SSE_REGNO (sse_regno)),
6538 GEN_INT (i*8));
6539 sse_regno++;
6540 break;
6541 case X86_64_SSE_CLASS:
6542 pos = i;
6543 switch (n)
6545 case 1:
6546 tmpmode = DImode;
6547 break;
6548 case 2:
6549 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6551 tmpmode = TImode;
6552 i++;
6554 else
6555 tmpmode = DImode;
6556 break;
6557 case 4:
6558 gcc_assert (i == 0
6559 && regclass[1] == X86_64_SSEUP_CLASS
6560 && regclass[2] == X86_64_SSEUP_CLASS
6561 && regclass[3] == X86_64_SSEUP_CLASS);
6562 tmpmode = OImode;
6563 i += 3;
6564 break;
6565 default:
6566 gcc_unreachable ();
6568 exp [nexps++]
6569 = gen_rtx_EXPR_LIST (VOIDmode,
6570 gen_rtx_REG (tmpmode,
6571 SSE_REGNO (sse_regno)),
6572 GEN_INT (pos*8));
6573 sse_regno++;
6574 break;
6575 default:
6576 gcc_unreachable ();
6580 /* Empty aligned struct, union or class. */
6581 if (nexps == 0)
6582 return NULL;
6584 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6585 for (i = 0; i < nexps; i++)
6586 XVECEXP (ret, 0, i) = exp [i];
6587 return ret;
6590 /* Update the data in CUM to advance over an argument of mode MODE
6591 and data type TYPE. (TYPE is null for libcalls where that information
6592 may not be available.) */
6594 static void
6595 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6596 const_tree type, HOST_WIDE_INT bytes,
6597 HOST_WIDE_INT words)
6599 switch (mode)
6601 default:
6602 break;
6604 case BLKmode:
6605 if (bytes < 0)
6606 break;
6607 /* FALLTHRU */
6609 case DImode:
6610 case SImode:
6611 case HImode:
6612 case QImode:
6613 cum->words += words;
6614 cum->nregs -= words;
6615 cum->regno += words;
6617 if (cum->nregs <= 0)
6619 cum->nregs = 0;
6620 cum->regno = 0;
6622 break;
6624 case OImode:
6625 /* OImode shouldn't be used directly. */
6626 gcc_unreachable ();
6628 case DFmode:
6629 if (cum->float_in_sse < 2)
6630 break;
6631 case SFmode:
6632 if (cum->float_in_sse < 1)
6633 break;
6634 /* FALLTHRU */
6636 case V8SFmode:
6637 case V8SImode:
6638 case V32QImode:
6639 case V16HImode:
6640 case V4DFmode:
6641 case V4DImode:
6642 case TImode:
6643 case V16QImode:
6644 case V8HImode:
6645 case V4SImode:
6646 case V2DImode:
6647 case V4SFmode:
6648 case V2DFmode:
6649 if (!type || !AGGREGATE_TYPE_P (type))
6651 cum->sse_words += words;
6652 cum->sse_nregs -= 1;
6653 cum->sse_regno += 1;
6654 if (cum->sse_nregs <= 0)
6656 cum->sse_nregs = 0;
6657 cum->sse_regno = 0;
6660 break;
6662 case V8QImode:
6663 case V4HImode:
6664 case V2SImode:
6665 case V2SFmode:
6666 case V1TImode:
6667 case V1DImode:
6668 if (!type || !AGGREGATE_TYPE_P (type))
6670 cum->mmx_words += words;
6671 cum->mmx_nregs -= 1;
6672 cum->mmx_regno += 1;
6673 if (cum->mmx_nregs <= 0)
6675 cum->mmx_nregs = 0;
6676 cum->mmx_regno = 0;
6679 break;
6683 static void
6684 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6685 const_tree type, HOST_WIDE_INT words, bool named)
6687 int int_nregs, sse_nregs;
6689 /* Unnamed 256bit vector mode parameters are passed on stack. */
6690 if (!named && VALID_AVX256_REG_MODE (mode))
6691 return;
6693 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6694 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6696 cum->nregs -= int_nregs;
6697 cum->sse_nregs -= sse_nregs;
6698 cum->regno += int_nregs;
6699 cum->sse_regno += sse_nregs;
6701 else
6703 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6704 cum->words = (cum->words + align - 1) & ~(align - 1);
6705 cum->words += words;
6709 static void
6710 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6711 HOST_WIDE_INT words)
6713 /* Otherwise, this should be passed indirectly. */
6714 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6716 cum->words += words;
6717 if (cum->nregs > 0)
6719 cum->nregs -= 1;
6720 cum->regno += 1;
6724 /* Update the data in CUM to advance over an argument of mode MODE and
6725 data type TYPE. (TYPE is null for libcalls where that information
6726 may not be available.) */
6728 static void
6729 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6730 const_tree type, bool named)
6732 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6733 HOST_WIDE_INT bytes, words;
6735 if (mode == BLKmode)
6736 bytes = int_size_in_bytes (type);
6737 else
6738 bytes = GET_MODE_SIZE (mode);
6739 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6741 if (type)
6742 mode = type_natural_mode (type, NULL, false);
6744 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6745 function_arg_advance_ms_64 (cum, bytes, words);
6746 else if (TARGET_64BIT)
6747 function_arg_advance_64 (cum, mode, type, words, named);
6748 else
6749 function_arg_advance_32 (cum, mode, type, bytes, words);
6752 /* Define where to put the arguments to a function.
6753 Value is zero to push the argument on the stack,
6754 or a hard register in which to store the argument.
6756 MODE is the argument's machine mode.
6757 TYPE is the data type of the argument (as a tree).
6758 This is null for libcalls where that information may
6759 not be available.
6760 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6761 the preceding args and about the function being called.
6762 NAMED is nonzero if this argument is a named parameter
6763 (otherwise it is an extra parameter matching an ellipsis). */
6765 static rtx
6766 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6767 enum machine_mode orig_mode, const_tree type,
6768 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6770 /* Avoid the AL settings for the Unix64 ABI. */
6771 if (mode == VOIDmode)
6772 return constm1_rtx;
6774 switch (mode)
6776 default:
6777 break;
6779 case BLKmode:
6780 if (bytes < 0)
6781 break;
6782 /* FALLTHRU */
6783 case DImode:
6784 case SImode:
6785 case HImode:
6786 case QImode:
6787 if (words <= cum->nregs)
6789 int regno = cum->regno;
6791 /* Fastcall allocates the first two DWORD (SImode) or
6792 smaller arguments to ECX and EDX if it isn't an
6793 aggregate type.  */
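	      /* Illustrative sketch (not part of the original source): for
		 __attribute__((fastcall)) void f (int a, int b, int c);
		 A is passed in %ecx, B in %edx and C on the stack, while a
		 DImode or aggregate argument falls out of this path and is
		 pushed instead.  */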
6794 if (cum->fastcall)
6796 if (mode == BLKmode
6797 || mode == DImode
6798 || (type && AGGREGATE_TYPE_P (type)))
6799 break;
6801 /* ECX, not EAX, is the first allocated register.  */
6802 if (regno == AX_REG)
6803 regno = CX_REG;
6805 return gen_rtx_REG (mode, regno);
6807 break;
6809 case DFmode:
6810 if (cum->float_in_sse < 2)
6811 break;
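	      /* FALLTHRU */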
6812 case SFmode:
6813 if (cum->float_in_sse < 1)
6814 break;
6815 /* FALLTHRU */
6816 case TImode:
6817 /* In 32bit, we pass TImode in xmm registers. */
6818 case V16QImode:
6819 case V8HImode:
6820 case V4SImode:
6821 case V2DImode:
6822 case V4SFmode:
6823 case V2DFmode:
6824 if (!type || !AGGREGATE_TYPE_P (type))
6826 if (cum->sse_nregs)
6827 return gen_reg_or_parallel (mode, orig_mode,
6828 cum->sse_regno + FIRST_SSE_REG);
6830 break;
6832 case OImode:
6833 /* OImode shouldn't be used directly. */
6834 gcc_unreachable ();
6836 case V8SFmode:
6837 case V8SImode:
6838 case V32QImode:
6839 case V16HImode:
6840 case V4DFmode:
6841 case V4DImode:
6842 if (!type || !AGGREGATE_TYPE_P (type))
6844 if (cum->sse_nregs)
6845 return gen_reg_or_parallel (mode, orig_mode,
6846 cum->sse_regno + FIRST_SSE_REG);
6848 break;
6850 case V8QImode:
6851 case V4HImode:
6852 case V2SImode:
6853 case V2SFmode:
6854 case V1TImode:
6855 case V1DImode:
6856 if (!type || !AGGREGATE_TYPE_P (type))
6858 if (cum->mmx_nregs)
6859 return gen_reg_or_parallel (mode, orig_mode,
6860 cum->mmx_regno + FIRST_MMX_REG);
6862 break;
6865 return NULL_RTX;
6868 static rtx
6869 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6870 enum machine_mode orig_mode, const_tree type, bool named)
6872 /* Handle a hidden AL argument containing number of registers
6873 for varargs x86-64 functions. */
6874 if (mode == VOIDmode)
6875 return GEN_INT (cum->maybe_vaarg
6876 ? (cum->sse_nregs < 0
6877 ? X86_64_SSE_REGPARM_MAX
6878 : cum->sse_regno)
6879 : -1);
6881 switch (mode)
6883 default:
6884 break;
6886 case V8SFmode:
6887 case V8SImode:
6888 case V32QImode:
6889 case V16HImode:
6890 case V4DFmode:
6891 case V4DImode:
6892 /* Unnamed 256bit vector mode parameters are passed on stack. */
6893 if (!named)
6894 return NULL;
6895 break;
6898 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6899 cum->sse_nregs,
6900 &x86_64_int_parameter_registers [cum->regno],
6901 cum->sse_regno);
6904 static rtx
6905 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6906 enum machine_mode orig_mode, bool named,
6907 HOST_WIDE_INT bytes)
6909 unsigned int regno;
6911 /* We need to add clobber for MS_ABI->SYSV ABI calls in expand_call.
6912 We use the value -2 to specify that the current function call is MS ABI.  */
6913 if (mode == VOIDmode)
6914 return GEN_INT (-2);
6916 /* If we've run out of registers, it goes on the stack. */
6917 if (cum->nregs == 0)
6918 return NULL_RTX;
6920 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6922 /* Only floating point modes are passed in anything but integer regs. */
6923 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6925 if (named)
6926 regno = cum->regno + FIRST_SSE_REG;
6927 else
6929 rtx t1, t2;
6931 /* Unnamed floating parameters are passed in both the
6932 SSE and integer registers. */
6933 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6934 t2 = gen_rtx_REG (mode, regno);
6935 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6936 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6937 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6940 /* Handle aggregate types passed in registers.  */
6941 if (orig_mode == BLKmode)
6943 if (bytes > 0 && bytes <= 8)
6944 mode = (bytes > 4 ? DImode : SImode);
6945 if (mode == BLKmode)
6946 mode = DImode;
6949 return gen_reg_or_parallel (mode, orig_mode, regno);
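/* Illustration (a sketch, not part of the original source): under the MS
   x64 ABI the first four arguments occupy fixed positional slots, so for
     void f (int a, double b, int c, double d);
   A is passed in %rcx, B in %xmm1, C in %r8 and D in %xmm3; for a varargs
   callee an unnamed double is additionally duplicated into the matching
   integer register, which is what the PARALLEL built above expresses.  */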
6952 /* Return where to put the arguments to a function.
6953 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6955 MODE is the argument's machine mode. TYPE is the data type of the
6956 argument. It is null for libcalls where that information may not be
6957 available. CUM gives information about the preceding args and about
6958 the function being called. NAMED is nonzero if this argument is a
6959 named parameter (otherwise it is an extra parameter matching an
6960 ellipsis). */
6962 static rtx
6963 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6964 const_tree type, bool named)
6966 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6967 enum machine_mode mode = omode;
6968 HOST_WIDE_INT bytes, words;
6969 rtx arg;
6971 if (mode == BLKmode)
6972 bytes = int_size_in_bytes (type);
6973 else
6974 bytes = GET_MODE_SIZE (mode);
6975 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6977 /* To simplify the code below, represent vector types with a vector mode
6978 even if MMX/SSE are not active. */
6979 if (type && TREE_CODE (type) == VECTOR_TYPE)
6980 mode = type_natural_mode (type, cum, false);
6982 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6983 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6984 else if (TARGET_64BIT)
6985 arg = function_arg_64 (cum, mode, omode, type, named);
6986 else
6987 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6989 return arg;
6992 /* A C expression that indicates when an argument must be passed by
6993 reference. If nonzero for an argument, a copy of that argument is
6994 made in memory and a pointer to the argument is passed instead of
6995 the argument itself. The pointer is passed in whatever way is
6996 appropriate for passing a pointer to that type. */
6998 static bool
6999 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
7000 enum machine_mode mode ATTRIBUTE_UNUSED,
7001 const_tree type, bool named ATTRIBUTE_UNUSED)
7003 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7005 /* See Windows x64 Software Convention. */
7006 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
7008 int msize = (int) GET_MODE_SIZE (mode);
7009 if (type)
7011 /* Arrays are passed by reference. */
7012 if (TREE_CODE (type) == ARRAY_TYPE)
7013 return true;
7015 if (AGGREGATE_TYPE_P (type))
7017 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
7018 are passed by reference. */
7019 msize = int_size_in_bytes (type);
7023 /* __m128 is passed by reference. */
7024 switch (msize) {
7025 case 1: case 2: case 4: case 8:
7026 break;
7027 default:
7028 return true;
7031 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
7032 return 1;
7034 return 0;
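/* Illustration (a sketch, not from the source): under the Windows x64
   convention a 12-byte aggregate such as
     struct s { int a, b, c; };
   is not 1, 2, 4 or 8 bytes, so the caller makes a copy and passes its
   address, while struct { int a, b; } is small enough to travel by value
   in a register.  */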
7037 /* Return true when TYPE should be 128bit aligned for 32bit argument
7038 passing ABI. XXX: This function is obsolete and is only used for
7039 checking psABI compatibility with previous versions of GCC. */
7041 static bool
7042 ix86_compat_aligned_value_p (const_tree type)
7044 enum machine_mode mode = TYPE_MODE (type);
7045 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
7046 || mode == TDmode
7047 || mode == TFmode
7048 || mode == TCmode)
7049 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
7050 return true;
7051 if (TYPE_ALIGN (type) < 128)
7052 return false;
7054 if (AGGREGATE_TYPE_P (type))
7056 /* Walk the aggregates recursively. */
7057 switch (TREE_CODE (type))
7059 case RECORD_TYPE:
7060 case UNION_TYPE:
7061 case QUAL_UNION_TYPE:
7063 tree field;
7065 /* Walk all the structure fields. */
7066 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7068 if (TREE_CODE (field) == FIELD_DECL
7069 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
7070 return true;
7072 break;
7075 case ARRAY_TYPE:
7076 /* Just for use if some languages pass arrays by value.  */
7077 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
7078 return true;
7079 break;
7081 default:
7082 gcc_unreachable ();
7085 return false;
7088 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
7089 XXX: This function is obsolete and is only used for checking psABI
7090 compatibility with previous versions of GCC. */
7092 static unsigned int
7093 ix86_compat_function_arg_boundary (enum machine_mode mode,
7094 const_tree type, unsigned int align)
7096 /* In 32bit, only _Decimal128 and __float128 are aligned to their
7097 natural boundaries. */
7098 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
7100 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
7101 make an exception for SSE modes since these require 128bit
7102 alignment.
7104 The handling here differs from field_alignment. ICC aligns MMX
7105 arguments to 4 byte boundaries, while structure fields are aligned
7106 to 8 byte boundaries. */
7107 if (!type)
7109 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
7110 align = PARM_BOUNDARY;
7112 else
7114 if (!ix86_compat_aligned_value_p (type))
7115 align = PARM_BOUNDARY;
7118 if (align > BIGGEST_ALIGNMENT)
7119 align = BIGGEST_ALIGNMENT;
7120 return align;
7123 /* Return true when TYPE should be 128bit aligned for 32bit argument
7124 passing ABI. */
7126 static bool
7127 ix86_contains_aligned_value_p (const_tree type)
7129 enum machine_mode mode = TYPE_MODE (type);
7131 if (mode == XFmode || mode == XCmode)
7132 return false;
7134 if (TYPE_ALIGN (type) < 128)
7135 return false;
7137 if (AGGREGATE_TYPE_P (type))
7139 /* Walk the aggregates recursively. */
7140 switch (TREE_CODE (type))
7142 case RECORD_TYPE:
7143 case UNION_TYPE:
7144 case QUAL_UNION_TYPE:
7146 tree field;
7148 /* Walk all the structure fields. */
7149 for (field = TYPE_FIELDS (type);
7150 field;
7151 field = DECL_CHAIN (field))
7153 if (TREE_CODE (field) == FIELD_DECL
7154 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
7155 return true;
7157 break;
7160 case ARRAY_TYPE:
7161 /* Just for use if some languages pass arrays by value.  */
7162 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
7163 return true;
7164 break;
7166 default:
7167 gcc_unreachable ();
7170 else
7171 return TYPE_ALIGN (type) >= 128;
7173 return false;
7176 /* Gives the alignment boundary, in bits, of an argument with the
7177 specified mode and type. */
7179 static unsigned int
7180 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
7182 unsigned int align;
7183 if (type)
7185 /* Since the main variant type is used for the call, convert the
7186 type to its main variant.  */
7187 type = TYPE_MAIN_VARIANT (type);
7188 align = TYPE_ALIGN (type);
7190 else
7191 align = GET_MODE_ALIGNMENT (mode);
7192 if (align < PARM_BOUNDARY)
7193 align = PARM_BOUNDARY;
7194 else
7196 static bool warned;
7197 unsigned int saved_align = align;
7199 if (!TARGET_64BIT)
7201 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
7202 if (!type)
7204 if (mode == XFmode || mode == XCmode)
7205 align = PARM_BOUNDARY;
7207 else if (!ix86_contains_aligned_value_p (type))
7208 align = PARM_BOUNDARY;
7210 if (align < 128)
7211 align = PARM_BOUNDARY;
7214 if (warn_psabi
7215 && !warned
7216 && align != ix86_compat_function_arg_boundary (mode, type,
7217 saved_align))
7219 warned = true;
7220 inform (input_location,
7221 "The ABI for passing parameters with %d-byte"
7222 " alignment has changed in GCC 4.6",
7223 align / BITS_PER_UNIT);
7227 return align;
7230 /* Return true if N is a possible register number of function value. */
7232 static bool
7233 ix86_function_value_regno_p (const unsigned int regno)
7235 switch (regno)
7237 case AX_REG:
7238 case DX_REG:
7239 return true;
7240 case DI_REG:
7241 case SI_REG:
7242 return TARGET_64BIT && ix86_abi != MS_ABI;
7244 /* Complex values are returned in %st(0)/%st(1) pair. */
7245 case ST0_REG:
7246 case ST1_REG:
7247 /* TODO: The function should depend on current function ABI but
7248 builtins.c would need updating then. Therefore we use the
7249 default ABI. */
7250 if (TARGET_64BIT && ix86_abi == MS_ABI)
7251 return false;
7252 return TARGET_FLOAT_RETURNS_IN_80387;
7254 /* Complex values are returned in %xmm0/%xmm1 pair. */
7255 case XMM0_REG:
7256 case XMM1_REG:
7257 return TARGET_SSE;
7259 case MM0_REG:
7260 if (TARGET_MACHO || TARGET_64BIT)
7261 return false;
7262 return TARGET_MMX;
7265 return false;
7268 /* Define how to find the value returned by a function.
7269 VALTYPE is the data type of the value (as a tree).
7270 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7271 otherwise, FUNC is 0. */
7273 static rtx
7274 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7275 const_tree fntype, const_tree fn)
7277 unsigned int regno;
7279 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7280 we normally prevent this case when mmx is not available. However
7281 some ABIs may require the result to be returned like DImode. */
7282 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7283 regno = FIRST_MMX_REG;
7285 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7286 we prevent this case when sse is not available. However some ABIs
7287 may require the result to be returned like integer TImode. */
7288 else if (mode == TImode
7289 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7290 regno = FIRST_SSE_REG;
7292 /* 32-byte vector modes in %ymm0. */
7293 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7294 regno = FIRST_SSE_REG;
7296 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7297 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7298 regno = FIRST_FLOAT_REG;
7299 else
7300 /* Most things go in %eax. */
7301 regno = AX_REG;
7303 /* Override FP return register with %xmm0 for local functions when
7304 SSE math is enabled or for functions with sseregparm attribute. */
7305 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7307 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7308 if ((sse_level >= 1 && mode == SFmode)
7309 || (sse_level == 2 && mode == DFmode))
7310 regno = FIRST_SSE_REG;
7313 /* OImode shouldn't be used directly. */
7314 gcc_assert (mode != OImode);
7316 return gen_rtx_REG (orig_mode, regno);
7319 static rtx
7320 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7321 const_tree valtype)
7323 rtx ret;
7325 /* Handle libcalls, which don't provide a type node. */
7326 if (valtype == NULL)
7328 unsigned int regno;
7330 switch (mode)
7332 case SFmode:
7333 case SCmode:
7334 case DFmode:
7335 case DCmode:
7336 case TFmode:
7337 case SDmode:
7338 case DDmode:
7339 case TDmode:
7340 regno = FIRST_SSE_REG;
7341 break;
7342 case XFmode:
7343 case XCmode:
7344 regno = FIRST_FLOAT_REG;
7345 break;
7346 case TCmode:
7347 return NULL;
7348 default:
7349 regno = AX_REG;
7352 return gen_rtx_REG (mode, regno);
7354 else if (POINTER_TYPE_P (valtype))
7356 /* Pointers are always returned in word_mode. */
7357 mode = word_mode;
7360 ret = construct_container (mode, orig_mode, valtype, 1,
7361 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7362 x86_64_int_return_registers, 0);
7364 /* For zero sized structures, construct_container returns NULL, but we
7365 need to keep rest of compiler happy by returning meaningful value. */
7366 if (!ret)
7367 ret = gen_rtx_REG (orig_mode, AX_REG);
7369 return ret;
7372 static rtx
7373 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode,
7374 const_tree valtype)
7376 unsigned int regno = AX_REG;
7378 if (TARGET_SSE)
7380 switch (GET_MODE_SIZE (mode))
7382 case 16:
7383 if (valtype != NULL_TREE
7384 && !VECTOR_INTEGER_TYPE_P (valtype)
7386 && !INTEGRAL_TYPE_P (valtype)
7387 && !VECTOR_FLOAT_TYPE_P (valtype))
7388 break;
7389 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7390 && !COMPLEX_MODE_P (mode))
7391 regno = FIRST_SSE_REG;
7392 break;
7393 case 8:
7394 case 4:
7395 if (mode == SFmode || mode == DFmode)
7396 regno = FIRST_SSE_REG;
7397 break;
7398 default:
7399 break;
7402 return gen_rtx_REG (orig_mode, regno);
7405 static rtx
7406 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7407 enum machine_mode orig_mode, enum machine_mode mode)
7409 const_tree fn, fntype;
7411 fn = NULL_TREE;
7412 if (fntype_or_decl && DECL_P (fntype_or_decl))
7413 fn = fntype_or_decl;
7414 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7416 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7417 return function_value_ms_64 (orig_mode, mode, valtype);
7418 else if (TARGET_64BIT)
7419 return function_value_64 (orig_mode, mode, valtype);
7420 else
7421 return function_value_32 (orig_mode, mode, fntype, fn);
7424 static rtx
7425 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7426 bool outgoing ATTRIBUTE_UNUSED)
7428 enum machine_mode mode, orig_mode;
7430 orig_mode = TYPE_MODE (valtype);
7431 mode = type_natural_mode (valtype, NULL, true);
7432 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7435 /* Pointer function arguments and return values are promoted to
7436 word_mode. */
7438 static enum machine_mode
7439 ix86_promote_function_mode (const_tree type, enum machine_mode mode,
7440 int *punsignedp, const_tree fntype,
7441 int for_return)
7443 if (type != NULL_TREE && POINTER_TYPE_P (type))
7445 *punsignedp = POINTERS_EXTEND_UNSIGNED;
7446 return word_mode;
7448 return default_promote_function_mode (type, mode, punsignedp, fntype,
7449 for_return);
7452 /* Return true if a structure, union or array with MODE containing FIELD
7453 should be accessed using BLKmode. */
7455 static bool
7456 ix86_member_type_forces_blk (const_tree field, enum machine_mode mode)
7458 /* Union with XFmode must be in BLKmode. */
7459 return (mode == XFmode
7460 && (TREE_CODE (DECL_FIELD_CONTEXT (field)) == UNION_TYPE
7461 || TREE_CODE (DECL_FIELD_CONTEXT (field)) == QUAL_UNION_TYPE));
7464 static rtx
7465 ix86_libcall_value (enum machine_mode mode)
7467 return ix86_function_value_1 (NULL, NULL, mode, mode);
7470 /* Return true iff type is returned in memory. */
7472 static bool ATTRIBUTE_UNUSED
7473 return_in_memory_32 (const_tree type, enum machine_mode mode)
7475 HOST_WIDE_INT size;
7477 if (mode == BLKmode)
7478 return true;
7480 size = int_size_in_bytes (type);
7482 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7483 return false;
7485 if (VECTOR_MODE_P (mode) || mode == TImode)
7487 /* User-created vectors small enough to fit in EAX. */
7488 if (size < 8)
7489 return false;
7491 /* MMX/3dNow values are returned in MM0,
7492 except when it doesn't exist or the ABI prescribes otherwise.  */
7493 if (size == 8)
7494 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7496 /* SSE values are returned in XMM0, except when it doesn't exist. */
7497 if (size == 16)
7498 return !TARGET_SSE;
7500 /* AVX values are returned in YMM0, except when it doesn't exist. */
7501 if (size == 32)
7502 return !TARGET_AVX;
7505 if (mode == XFmode)
7506 return false;
7508 if (size > 12)
7509 return true;
7511 /* OImode shouldn't be used directly. */
7512 gcc_assert (mode != OImode);
7514 return false;
7517 static bool ATTRIBUTE_UNUSED
7518 return_in_memory_64 (const_tree type, enum machine_mode mode)
7520 int needed_intregs, needed_sseregs;
7521 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7524 static bool ATTRIBUTE_UNUSED
7525 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7527 HOST_WIDE_INT size = int_size_in_bytes (type);
7529 /* __m128 is returned in xmm0. */
7530 if ((!type || VECTOR_INTEGER_TYPE_P (type) || INTEGRAL_TYPE_P (type)
7531 || VECTOR_FLOAT_TYPE_P (type))
7532 && (SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7533 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7534 return false;
7536 /* Otherwise, the size must be exactly 1, 2, 4 or 8 bytes.  */
7537 return size != 1 && size != 2 && size != 4 && size != 8;
7540 static bool
7541 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7543 #ifdef SUBTARGET_RETURN_IN_MEMORY
7544 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7545 #else
7546 const enum machine_mode mode = type_natural_mode (type, NULL, true);
7548 if (TARGET_64BIT)
7550 if (ix86_function_type_abi (fntype) == MS_ABI)
7551 return return_in_memory_ms_64 (type, mode);
7552 else
7553 return return_in_memory_64 (type, mode);
7555 else
7556 return return_in_memory_32 (type, mode);
7557 #endif
7561 /* Create the va_list data type. */
7563 /* Returns the calling convention specific va_list data type.
7564 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7566 static tree
7567 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7569 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7571 /* For i386 we use plain pointer to argument area. */
7572 if (!TARGET_64BIT || abi == MS_ABI)
7573 return build_pointer_type (char_type_node);
7575 record = lang_hooks.types.make_type (RECORD_TYPE);
7576 type_decl = build_decl (BUILTINS_LOCATION,
7577 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7579 f_gpr = build_decl (BUILTINS_LOCATION,
7580 FIELD_DECL, get_identifier ("gp_offset"),
7581 unsigned_type_node);
7582 f_fpr = build_decl (BUILTINS_LOCATION,
7583 FIELD_DECL, get_identifier ("fp_offset"),
7584 unsigned_type_node);
7585 f_ovf = build_decl (BUILTINS_LOCATION,
7586 FIELD_DECL, get_identifier ("overflow_arg_area"),
7587 ptr_type_node);
7588 f_sav = build_decl (BUILTINS_LOCATION,
7589 FIELD_DECL, get_identifier ("reg_save_area"),
7590 ptr_type_node);
7592 va_list_gpr_counter_field = f_gpr;
7593 va_list_fpr_counter_field = f_fpr;
7595 DECL_FIELD_CONTEXT (f_gpr) = record;
7596 DECL_FIELD_CONTEXT (f_fpr) = record;
7597 DECL_FIELD_CONTEXT (f_ovf) = record;
7598 DECL_FIELD_CONTEXT (f_sav) = record;
7600 TYPE_STUB_DECL (record) = type_decl;
7601 TYPE_NAME (record) = type_decl;
7602 TYPE_FIELDS (record) = f_gpr;
7603 DECL_CHAIN (f_gpr) = f_fpr;
7604 DECL_CHAIN (f_fpr) = f_ovf;
7605 DECL_CHAIN (f_ovf) = f_sav;
7607 layout_type (record);
7609 /* The correct type is an array type of one element. */
7610 return build_array_type (record, build_index_type (size_zero_node));
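/* For reference, a sketch of the record built above (it mirrors the
   SysV x86-64 psABI):
     typedef struct __va_list_tag {
       unsigned int gp_offset;
       unsigned int fp_offset;
       void *overflow_arg_area;
       void *reg_save_area;
     } __va_list_tag;
   with va_list being a one-element array of that record.  */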
7613 /* Setup the builtin va_list data type and for 64-bit the additional
7614 calling convention specific va_list data types. */
7616 static tree
7617 ix86_build_builtin_va_list (void)
7619 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7621 /* Initialize abi specific va_list builtin types. */
7622 if (TARGET_64BIT)
7624 tree t;
7625 if (ix86_abi == MS_ABI)
7627 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7628 if (TREE_CODE (t) != RECORD_TYPE)
7629 t = build_variant_type_copy (t);
7630 sysv_va_list_type_node = t;
7632 else
7634 t = ret;
7635 if (TREE_CODE (t) != RECORD_TYPE)
7636 t = build_variant_type_copy (t);
7637 sysv_va_list_type_node = t;
7639 if (ix86_abi != MS_ABI)
7641 t = ix86_build_builtin_va_list_abi (MS_ABI);
7642 if (TREE_CODE (t) != RECORD_TYPE)
7643 t = build_variant_type_copy (t);
7644 ms_va_list_type_node = t;
7646 else
7648 t = ret;
7649 if (TREE_CODE (t) != RECORD_TYPE)
7650 t = build_variant_type_copy (t);
7651 ms_va_list_type_node = t;
7655 return ret;
7658 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7660 static void
7661 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7663 rtx save_area, mem;
7664 alias_set_type set;
7665 int i, max;
7667 /* GPR size of varargs save area. */
7668 if (cfun->va_list_gpr_size)
7669 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7670 else
7671 ix86_varargs_gpr_size = 0;
7673 /* FPR size of varargs save area. We don't need it if we don't pass
7674 anything in SSE registers. */
7675 if (TARGET_SSE && cfun->va_list_fpr_size)
7676 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7677 else
7678 ix86_varargs_fpr_size = 0;
7680 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7681 return;
7683 save_area = frame_pointer_rtx;
7684 set = get_varargs_alias_set ();
7686 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7687 if (max > X86_64_REGPARM_MAX)
7688 max = X86_64_REGPARM_MAX;
7690 for (i = cum->regno; i < max; i++)
7692 mem = gen_rtx_MEM (word_mode,
7693 plus_constant (Pmode, save_area, i * UNITS_PER_WORD));
7694 MEM_NOTRAP_P (mem) = 1;
7695 set_mem_alias_set (mem, set);
7696 emit_move_insn (mem,
7697 gen_rtx_REG (word_mode,
7698 x86_64_int_parameter_registers[i]));
7701 if (ix86_varargs_fpr_size)
7703 enum machine_mode smode;
7704 rtx label, test;
7706 /* Now emit code to save SSE registers. The AX parameter contains the number
7707 of SSE parameter registers used to call this function, though all we
7708 actually check here is the zero/non-zero status. */
7710 label = gen_label_rtx ();
7711 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7712 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7713 label));
7715 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7716 we used movdqa (i.e. TImode) instead? Perhaps even better would
7717 be if we could determine the real mode of the data, via a hook
7718 into pass_stdarg. Ignore all that for now. */
7719 smode = V4SFmode;
7720 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7721 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7723 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7724 if (max > X86_64_SSE_REGPARM_MAX)
7725 max = X86_64_SSE_REGPARM_MAX;
7727 for (i = cum->sse_regno; i < max; ++i)
7729 mem = plus_constant (Pmode, save_area,
7730 i * 16 + ix86_varargs_gpr_size);
7731 mem = gen_rtx_MEM (smode, mem);
7732 MEM_NOTRAP_P (mem) = 1;
7733 set_mem_alias_set (mem, set);
7734 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7736 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
7739 emit_label (label);
7743 static void
7744 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7746 alias_set_type set = get_varargs_alias_set ();
7747 int i;
7749 /* Reset to zero, as there might be a SysV va_arg used
7750 before. */
7751 ix86_varargs_gpr_size = 0;
7752 ix86_varargs_fpr_size = 0;
7754 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7756 rtx reg, mem;
7758 mem = gen_rtx_MEM (Pmode,
7759 plus_constant (Pmode, virtual_incoming_args_rtx,
7760 i * UNITS_PER_WORD));
7761 MEM_NOTRAP_P (mem) = 1;
7762 set_mem_alias_set (mem, set);
7764 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7765 emit_move_insn (mem, reg);
7769 static void
7770 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7771 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7772 int no_rtl)
7774 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7775 CUMULATIVE_ARGS next_cum;
7776 tree fntype;
7778 /* This argument doesn't appear to be used anymore, which is good,
7779 because the old code here didn't suppress rtl generation. */
7780 gcc_assert (!no_rtl);
7782 if (!TARGET_64BIT)
7783 return;
7785 fntype = TREE_TYPE (current_function_decl);
7787 /* For varargs, we do not want to skip the dummy va_dcl argument.
7788 For stdargs, we do want to skip the last named argument. */
7789 next_cum = *cum;
7790 if (stdarg_p (fntype))
7791 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7792 true);
7794 if (cum->call_abi == MS_ABI)
7795 setup_incoming_varargs_ms_64 (&next_cum);
7796 else
7797 setup_incoming_varargs_64 (&next_cum);
7800 /* Checks if TYPE is of kind va_list char *. */
7802 static bool
7803 is_va_list_char_pointer (tree type)
7805 tree canonic;
7807 /* For 32-bit it is always true. */
7808 if (!TARGET_64BIT)
7809 return true;
7810 canonic = ix86_canonical_va_list_type (type);
7811 return (canonic == ms_va_list_type_node
7812 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7815 /* Implement va_start. */
7817 static void
7818 ix86_va_start (tree valist, rtx nextarg)
7820 HOST_WIDE_INT words, n_gpr, n_fpr;
7821 tree f_gpr, f_fpr, f_ovf, f_sav;
7822 tree gpr, fpr, ovf, sav, t;
7823 tree type;
7824 rtx ovf_rtx;
7826 if (flag_split_stack
7827 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7829 unsigned int scratch_regno;
7831 /* When we are splitting the stack, we can't refer to the stack
7832 arguments using internal_arg_pointer, because they may be on
7833 the old stack. The split stack prologue will arrange to
7834 leave a pointer to the old stack arguments in a scratch
7835 register, which we here copy to a pseudo-register. The split
7836 stack prologue can't set the pseudo-register directly because
7837 it (the prologue) runs before any registers have been saved. */
7839 scratch_regno = split_stack_prologue_scratch_regno ();
7840 if (scratch_regno != INVALID_REGNUM)
7842 rtx reg, seq;
7844 reg = gen_reg_rtx (Pmode);
7845 cfun->machine->split_stack_varargs_pointer = reg;
7847 start_sequence ();
7848 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7849 seq = get_insns ();
7850 end_sequence ();
7852 push_topmost_sequence ();
7853 emit_insn_after (seq, entry_of_function ());
7854 pop_topmost_sequence ();
7858 /* Only 64bit target needs something special. */
7859 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7861 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7862 std_expand_builtin_va_start (valist, nextarg);
7863 else
7865 rtx va_r, next;
7867 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7868 next = expand_binop (ptr_mode, add_optab,
7869 cfun->machine->split_stack_varargs_pointer,
7870 crtl->args.arg_offset_rtx,
7871 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7872 convert_move (va_r, next, 0);
7874 return;
7877 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7878 f_fpr = DECL_CHAIN (f_gpr);
7879 f_ovf = DECL_CHAIN (f_fpr);
7880 f_sav = DECL_CHAIN (f_ovf);
7882 valist = build_simple_mem_ref (valist);
7883 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7884 /* The following should be folded into the MEM_REF offset. */
7885 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7886 f_gpr, NULL_TREE);
7887 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7888 f_fpr, NULL_TREE);
7889 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7890 f_ovf, NULL_TREE);
7891 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7892 f_sav, NULL_TREE);
7894 /* Count number of gp and fp argument registers used. */
7895 words = crtl->args.info.words;
7896 n_gpr = crtl->args.info.regno;
7897 n_fpr = crtl->args.info.sse_regno;
7899 if (cfun->va_list_gpr_size)
7901 type = TREE_TYPE (gpr);
7902 t = build2 (MODIFY_EXPR, type,
7903 gpr, build_int_cst (type, n_gpr * 8));
7904 TREE_SIDE_EFFECTS (t) = 1;
7905 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7908 if (TARGET_SSE && cfun->va_list_fpr_size)
7910 type = TREE_TYPE (fpr);
7911 t = build2 (MODIFY_EXPR, type, fpr,
7912 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7913 TREE_SIDE_EFFECTS (t) = 1;
7914 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7917 /* Find the overflow area. */
7918 type = TREE_TYPE (ovf);
7919 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7920 ovf_rtx = crtl->args.internal_arg_pointer;
7921 else
7922 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7923 t = make_tree (type, ovf_rtx);
7924 if (words != 0)
7925 t = fold_build_pointer_plus_hwi (t, words * UNITS_PER_WORD);
7926 t = build2 (MODIFY_EXPR, type, ovf, t);
7927 TREE_SIDE_EFFECTS (t) = 1;
7928 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7930 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7932 /* Find the register save area.
7933 The function prologue saves it right above the stack frame.  */
7934 type = TREE_TYPE (sav);
7935 t = make_tree (type, frame_pointer_rtx);
7936 if (!ix86_varargs_gpr_size)
7937 t = fold_build_pointer_plus_hwi (t, -8 * X86_64_REGPARM_MAX);
7938 t = build2 (MODIFY_EXPR, type, sav, t);
7939 TREE_SIDE_EFFECTS (t) = 1;
7940 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
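/* For orientation (a sketch derived from the assignments above, not an
   addition to the ABI): after va_start the fields hold
     gp_offset         = 8 * <named GP registers used>   (at most 48)
     fp_offset         = 48 + 16 * <named SSE registers used>
     overflow_arg_area = first stack-passed argument
     reg_save_area     = block spilled by the prologue: six 8-byte GP
                         slots followed by eight 16-byte SSE slots.  */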
7944 /* Implement va_arg. */
7946 static tree
7947 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7948 gimple_seq *post_p)
7950 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7951 tree f_gpr, f_fpr, f_ovf, f_sav;
7952 tree gpr, fpr, ovf, sav, t;
7953 int size, rsize;
7954 tree lab_false, lab_over = NULL_TREE;
7955 tree addr, t2;
7956 rtx container;
7957 int indirect_p = 0;
7958 tree ptrtype;
7959 enum machine_mode nat_mode;
7960 unsigned int arg_boundary;
7962 /* Only 64bit target needs something special. */
7963 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7964 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7966 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7967 f_fpr = DECL_CHAIN (f_gpr);
7968 f_ovf = DECL_CHAIN (f_fpr);
7969 f_sav = DECL_CHAIN (f_ovf);
7971 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7972 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7973 valist = build_va_arg_indirect_ref (valist);
7974 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7975 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7976 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7978 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7979 if (indirect_p)
7980 type = build_pointer_type (type);
7981 size = int_size_in_bytes (type);
7982 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7984 nat_mode = type_natural_mode (type, NULL, false);
7985 switch (nat_mode)
7987 case V8SFmode:
7988 case V8SImode:
7989 case V32QImode:
7990 case V16HImode:
7991 case V4DFmode:
7992 case V4DImode:
7993 /* Unnamed 256bit vector mode parameters are passed on stack. */
7994 if (!TARGET_64BIT_MS_ABI)
7996 container = NULL;
7997 break;
8000 default:
8001 container = construct_container (nat_mode, TYPE_MODE (type),
8002 type, 0, X86_64_REGPARM_MAX,
8003 X86_64_SSE_REGPARM_MAX, intreg,
8004 0);
8005 break;
8008 /* Pull the value out of the saved registers. */
8010 addr = create_tmp_var (ptr_type_node, "addr");
8012 if (container)
8014 int needed_intregs, needed_sseregs;
8015 bool need_temp;
8016 tree int_addr, sse_addr;
8018 lab_false = create_artificial_label (UNKNOWN_LOCATION);
8019 lab_over = create_artificial_label (UNKNOWN_LOCATION);
8021 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
8023 need_temp = (!REG_P (container)
8024 && ((needed_intregs && TYPE_ALIGN (type) > 64)
8025 || TYPE_ALIGN (type) > 128));
8027 /* In case we are passing a structure, verify that it occupies a
8028 consecutive block in the register save area. If not, we need to do moves.  */
8029 if (!need_temp && !REG_P (container))
8031 /* Verify that all registers are strictly consecutive */
8032 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
8034 int i;
8036 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8038 rtx slot = XVECEXP (container, 0, i);
8039 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
8040 || INTVAL (XEXP (slot, 1)) != i * 16)
8041 need_temp = 1;
8044 else
8046 int i;
8048 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
8050 rtx slot = XVECEXP (container, 0, i);
8051 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
8052 || INTVAL (XEXP (slot, 1)) != i * 8)
8053 need_temp = 1;
8057 if (!need_temp)
8059 int_addr = addr;
8060 sse_addr = addr;
8062 else
8064 int_addr = create_tmp_var (ptr_type_node, "int_addr");
8065 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
8068 /* First ensure that we fit completely in registers. */
8069 if (needed_intregs)
8071 t = build_int_cst (TREE_TYPE (gpr),
8072 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
8073 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
8074 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8075 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8076 gimplify_and_add (t, pre_p);
8078 if (needed_sseregs)
8080 t = build_int_cst (TREE_TYPE (fpr),
8081 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
8082 + X86_64_REGPARM_MAX * 8);
8083 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
8084 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
8085 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
8086 gimplify_and_add (t, pre_p);
8089 /* Compute index to start of area used for integer regs. */
8090 if (needed_intregs)
8092 /* int_addr = gpr + sav; */
8093 t = fold_build_pointer_plus (sav, gpr);
8094 gimplify_assign (int_addr, t, pre_p);
8096 if (needed_sseregs)
8098 /* sse_addr = fpr + sav; */
8099 t = fold_build_pointer_plus (sav, fpr);
8100 gimplify_assign (sse_addr, t, pre_p);
8102 if (need_temp)
8104 int i, prev_size = 0;
8105 tree temp = create_tmp_var (type, "va_arg_tmp");
8107 /* addr = &temp; */
8108 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
8109 gimplify_assign (addr, t, pre_p);
8111 for (i = 0; i < XVECLEN (container, 0); i++)
8113 rtx slot = XVECEXP (container, 0, i);
8114 rtx reg = XEXP (slot, 0);
8115 enum machine_mode mode = GET_MODE (reg);
8116 tree piece_type;
8117 tree addr_type;
8118 tree daddr_type;
8119 tree src_addr, src;
8120 int src_offset;
8121 tree dest_addr, dest;
8122 int cur_size = GET_MODE_SIZE (mode);
8124 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
8125 prev_size = INTVAL (XEXP (slot, 1));
8126 if (prev_size + cur_size > size)
8128 cur_size = size - prev_size;
8129 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
8130 if (mode == BLKmode)
8131 mode = QImode;
8133 piece_type = lang_hooks.types.type_for_mode (mode, 1);
8134 if (mode == GET_MODE (reg))
8135 addr_type = build_pointer_type (piece_type);
8136 else
8137 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8138 true);
8139 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
8140 true);
8142 if (SSE_REGNO_P (REGNO (reg)))
8144 src_addr = sse_addr;
8145 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
8147 else
8149 src_addr = int_addr;
8150 src_offset = REGNO (reg) * 8;
8152 src_addr = fold_convert (addr_type, src_addr);
8153 src_addr = fold_build_pointer_plus_hwi (src_addr, src_offset);
8155 dest_addr = fold_convert (daddr_type, addr);
8156 dest_addr = fold_build_pointer_plus_hwi (dest_addr, prev_size);
8157 if (cur_size == GET_MODE_SIZE (mode))
8159 src = build_va_arg_indirect_ref (src_addr);
8160 dest = build_va_arg_indirect_ref (dest_addr);
8162 gimplify_assign (dest, src, pre_p);
8164 else
8166 tree copy
8167 = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
8168 3, dest_addr, src_addr,
8169 size_int (cur_size));
8170 gimplify_and_add (copy, pre_p);
8172 prev_size += cur_size;
8176 if (needed_intregs)
8178 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
8179 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
8180 gimplify_assign (gpr, t, pre_p);
8183 if (needed_sseregs)
8185 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
8186 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
8187 gimplify_assign (fpr, t, pre_p);
8190 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
8192 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
8195 /* ... otherwise out of the overflow area. */
8197 /* When the caller aligns a parameter on the stack, a parameter whose
8198 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT is only aligned
8199 at MAX_SUPPORTED_STACK_ALIGNMENT. Match the callee here with the
8200 caller.  */
8201 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
8202 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
8203 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
8205 /* Care for on-stack alignment if needed. */
8206 if (arg_boundary <= 64 || size == 0)
8207 t = ovf;
8208 else
8210 HOST_WIDE_INT align = arg_boundary / 8;
8211 t = fold_build_pointer_plus_hwi (ovf, align - 1);
8212 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8213 build_int_cst (TREE_TYPE (t), -align));
8216 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
8217 gimplify_assign (addr, t, pre_p);
8219 t = fold_build_pointer_plus_hwi (t, rsize * UNITS_PER_WORD);
8220 gimplify_assign (unshare_expr (ovf), t, pre_p);
8222 if (container)
8223 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
8225 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
8226 addr = fold_convert (ptrtype, addr);
8228 if (indirect_p)
8229 addr = build_va_arg_indirect_ref (addr);
8230 return build_va_arg_indirect_ref (addr);
8233 /* Return true if OPNUM's MEM should be matched
8234 in movabs* patterns. */
8236 bool
8237 ix86_check_movabs (rtx insn, int opnum)
8239 rtx set, mem;
8241 set = PATTERN (insn);
8242 if (GET_CODE (set) == PARALLEL)
8243 set = XVECEXP (set, 0, 0);
8244 gcc_assert (GET_CODE (set) == SET);
8245 mem = XEXP (set, opnum);
8246 while (GET_CODE (mem) == SUBREG)
8247 mem = SUBREG_REG (mem);
8248 gcc_assert (MEM_P (mem));
8249 return volatile_ok || !MEM_VOLATILE_P (mem);
8252 /* Initialize the table of extra 80387 mathematical constants. */
8254 static void
8255 init_ext_80387_constants (void)
8257 static const char * cst[5] =
8259 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
8260 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
8261 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
8262 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
8263 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8265 int i;
8267 for (i = 0; i < 5; i++)
8269 real_from_string (&ext_80387_constants_table[i], cst[i]);
8270 /* Ensure each constant is rounded to XFmode precision. */
8271 real_convert (&ext_80387_constants_table[i],
8272 XFmode, &ext_80387_constants_table[i]);
8275 ext_80387_constants_init = 1;
8278 /* Return non-zero if the constant is something that
8279 can be loaded with a special instruction. */
8281 int
8282 standard_80387_constant_p (rtx x)
8284 enum machine_mode mode = GET_MODE (x);
8286 REAL_VALUE_TYPE r;
8288 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8289 return -1;
8291 if (x == CONST0_RTX (mode))
8292 return 1;
8293 if (x == CONST1_RTX (mode))
8294 return 2;
8296 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8298 /* For XFmode constants, try to find a special 80387 instruction when
8299 optimizing for size or on those CPUs that benefit from them. */
8300 if (mode == XFmode
8301 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8303 int i;
8305 if (! ext_80387_constants_init)
8306 init_ext_80387_constants ();
8308 for (i = 0; i < 5; i++)
8309 if (real_identical (&r, &ext_80387_constants_table[i]))
8310 return i + 3;
8313 /* Load of the constant -0.0 or -1.0 will be split as
8314 fldz;fchs or fld1;fchs sequence. */
8315 if (real_isnegzero (&r))
8316 return 8;
8317 if (real_identical (&r, &dconstm1))
8318 return 9;
8320 return 0;
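/* Summary of the return values above (restating the code, for quick
   reference): 0 = not special, 1 = 0.0 (fldz), 2 = 1.0 (fld1),
   3..7 = fldlg2, fldln2, fldl2e, fldl2t, fldpi, 8 = -0.0 and
   9 = -1.0 (each split into a load followed by fchs).  */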
8323 /* Return the opcode of the special instruction to be used to load
8324 the constant X. */
8326 const char *
8327 standard_80387_constant_opcode (rtx x)
8329 switch (standard_80387_constant_p (x))
8331 case 1:
8332 return "fldz";
8333 case 2:
8334 return "fld1";
8335 case 3:
8336 return "fldlg2";
8337 case 4:
8338 return "fldln2";
8339 case 5:
8340 return "fldl2e";
8341 case 6:
8342 return "fldl2t";
8343 case 7:
8344 return "fldpi";
8345 case 8:
8346 case 9:
8347 return "#";
8348 default:
8349 gcc_unreachable ();
8353 /* Return the CONST_DOUBLE representing the 80387 constant that is
8354 loaded by the specified special instruction. The argument IDX
8355 matches the return value from standard_80387_constant_p. */
8357 rtx
8358 standard_80387_constant_rtx (int idx)
8360 int i;
8362 if (! ext_80387_constants_init)
8363 init_ext_80387_constants ();
8365 switch (idx)
8367 case 3:
8368 case 4:
8369 case 5:
8370 case 6:
8371 case 7:
8372 i = idx - 3;
8373 break;
8375 default:
8376 gcc_unreachable ();
8379 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8380 XFmode);
8383 /* Return 1 if X is all 0s and 2 if X is all 1s
8384 in a supported SSE/AVX vector mode.  */
8386 int
8387 standard_sse_constant_p (rtx x)
8389 enum machine_mode mode = GET_MODE (x);
8391 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8392 return 1;
8393 if (vector_all_ones_operand (x, mode))
8394 switch (mode)
8396 case V16QImode:
8397 case V8HImode:
8398 case V4SImode:
8399 case V2DImode:
8400 if (TARGET_SSE2)
8401 return 2;
8402 case V32QImode:
8403 case V16HImode:
8404 case V8SImode:
8405 case V4DImode:
8406 if (TARGET_AVX2)
8407 return 2;
8408 default:
8409 break;
8412 return 0;
8415 /* Return the opcode of the special instruction to be used to load
8416 the constant X. */
8418 const char *
8419 standard_sse_constant_opcode (rtx insn, rtx x)
8421 switch (standard_sse_constant_p (x))
8423 case 1:
8424 switch (get_attr_mode (insn))
8426 case MODE_TI:
8427 return "%vpxor\t%0, %d0";
8428 case MODE_V2DF:
8429 return "%vxorpd\t%0, %d0";
8430 case MODE_V4SF:
8431 return "%vxorps\t%0, %d0";
8433 case MODE_OI:
8434 return "vpxor\t%x0, %x0, %x0";
8435 case MODE_V4DF:
8436 return "vxorpd\t%x0, %x0, %x0";
8437 case MODE_V8SF:
8438 return "vxorps\t%x0, %x0, %x0";
8440 default:
8441 break;
8444 case 2:
8445 if (TARGET_AVX)
8446 return "vpcmpeqd\t%0, %0, %0";
8447 else
8448 return "pcmpeqd\t%0, %0";
8450 default:
8451 break;
8453 gcc_unreachable ();
8456 /* Returns true if OP contains a symbol reference */
8458 bool
8459 symbolic_reference_mentioned_p (rtx op)
8461 const char *fmt;
8462 int i;
8464 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8465 return true;
8467 fmt = GET_RTX_FORMAT (GET_CODE (op));
8468 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8470 if (fmt[i] == 'E')
8472 int j;
8474 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8475 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8476 return true;
8479 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8480 return true;
8483 return false;
8486 /* Return true if it is appropriate to emit `ret' instructions in the
8487 body of a function. Do this only if the epilogue is simple, needing a
8488 couple of insns. Prior to reloading, we can't tell how many registers
8489 must be saved, so return false then. Return false if there is no frame
8490 marker to de-allocate. */
8492 bool
8493 ix86_can_use_return_insn_p (void)
8495 struct ix86_frame frame;
8497 if (! reload_completed || frame_pointer_needed)
8498 return 0;
8500 /* Don't allow more than 32k pop, since that's all we can do
8501 with one instruction. */
8502 if (crtl->args.pops_args && crtl->args.size >= 32768)
8503 return 0;
8505 ix86_compute_frame_layout (&frame);
8506 return (frame.stack_pointer_offset == UNITS_PER_WORD
8507 && (frame.nregs + frame.nsseregs) == 0);
8510 /* Value should be nonzero if functions must have frame pointers.
8511 Zero means the frame pointer need not be set up (and parms may
8512 be accessed via the stack pointer) in functions that seem suitable. */
8514 static bool
8515 ix86_frame_pointer_required (void)
8517 /* If we accessed previous frames, then the generated code expects
8518 to be able to access the saved ebp value in our frame. */
8519 if (cfun->machine->accesses_prev_frame)
8520 return true;
8522 /* Several x86 OSes need a frame pointer for other reasons,
8523 usually pertaining to setjmp. */
8524 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8525 return true;
8527 /* For older 32-bit runtimes setjmp requires valid frame-pointer. */
8528 if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
8529 return true;
8531 /* Win64 SEH: very large frames need a frame pointer, as the maximum
8532 stack allocation is 4GB.  */
8533 if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE)
8534 return true;
8536 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8537 turns off the frame pointer by default. Turn it back on now if
8538 we've not got a leaf function. */
8539 if (TARGET_OMIT_LEAF_FRAME_POINTER
8540 && (!crtl->is_leaf
8541 || ix86_current_function_calls_tls_descriptor))
8542 return true;
8544 if (crtl->profile && !flag_fentry)
8545 return true;
8547 return false;
8550 /* Record that the current function accesses previous call frames. */
8552 void
8553 ix86_setup_frame_addresses (void)
8555 cfun->machine->accesses_prev_frame = 1;
8558 #ifndef USE_HIDDEN_LINKONCE
8559 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8560 # define USE_HIDDEN_LINKONCE 1
8561 # else
8562 # define USE_HIDDEN_LINKONCE 0
8563 # endif
8564 #endif
8566 static int pic_labels_used;
8568 /* Fills in the label name that should be used for a pc thunk for
8569 the given register. */
8571 static void
8572 get_pc_thunk_name (char name[32], unsigned int regno)
8574 gcc_assert (!TARGET_64BIT);
8576 if (USE_HIDDEN_LINKONCE)
8577 sprintf (name, "__x86.get_pc_thunk.%s", reg_names[regno]);
8578 else
8579 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8583 /* This function generates code for -fpic that loads %ebx with
8584 the return address of the caller and then returns. */
8586 static void
8587 ix86_code_end (void)
8589 rtx xops[2];
8590 int regno;
8592 for (regno = AX_REG; regno <= SP_REG; regno++)
8594 char name[32];
8595 tree decl;
8597 if (!(pic_labels_used & (1 << regno)))
8598 continue;
8600 get_pc_thunk_name (name, regno);
8602 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8603 get_identifier (name),
8604 build_function_type_list (void_type_node, NULL_TREE));
8605 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8606 NULL_TREE, void_type_node);
8607 TREE_PUBLIC (decl) = 1;
8608 TREE_STATIC (decl) = 1;
8609 DECL_IGNORED_P (decl) = 1;
8611 #if TARGET_MACHO
8612 if (TARGET_MACHO)
8614 switch_to_section (darwin_sections[text_coal_section]);
8615 fputs ("\t.weak_definition\t", asm_out_file);
8616 assemble_name (asm_out_file, name);
8617 fputs ("\n\t.private_extern\t", asm_out_file);
8618 assemble_name (asm_out_file, name);
8619 putc ('\n', asm_out_file);
8620 ASM_OUTPUT_LABEL (asm_out_file, name);
8621 DECL_WEAK (decl) = 1;
8623 else
8624 #endif
8625 if (USE_HIDDEN_LINKONCE)
8627 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8629 targetm.asm_out.unique_section (decl, 0);
8630 switch_to_section (get_named_section (decl, NULL, 0));
8632 targetm.asm_out.globalize_label (asm_out_file, name);
8633 fputs ("\t.hidden\t", asm_out_file);
8634 assemble_name (asm_out_file, name);
8635 putc ('\n', asm_out_file);
8636 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8638 else
8640 switch_to_section (text_section);
8641 ASM_OUTPUT_LABEL (asm_out_file, name);
8644 DECL_INITIAL (decl) = make_node (BLOCK);
8645 current_function_decl = decl;
8646 init_function_start (decl);
8647 first_function_block_is_cold = false;
8648 /* Make sure unwind info is emitted for the thunk if needed. */
8649 final_start_function (emit_barrier (), asm_out_file, 1);
8651 /* Pad stack IP move with 4 instructions (two NOPs count
8652 as one instruction). */
8653 if (TARGET_PAD_SHORT_FUNCTION)
8655 int i = 8;
8657 while (i--)
8658 fputs ("\tnop\n", asm_out_file);
8661 xops[0] = gen_rtx_REG (Pmode, regno);
8662 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8663 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8664 fputs ("\tret\n", asm_out_file);
8665 final_end_function ();
8666 init_insn_lengths ();
8667 free_after_compilation (cfun);
8668 set_cfun (NULL);
8669 current_function_decl = NULL;
8672 if (flag_split_stack)
8673 file_end_indicate_split_stack ();
8676 /* Emit code for the SET_GOT patterns. */
8678 const char *
8679 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8681 rtx xops[3];
8683 xops[0] = dest;
8685 if (TARGET_VXWORKS_RTP && flag_pic)
8687 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8688 xops[2] = gen_rtx_MEM (Pmode,
8689 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8690 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8692 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8693 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8694 an unadorned address. */
8695 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8696 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8697 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8698 return "";
8701 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8703 if (!flag_pic)
8705 if (TARGET_MACHO)
8706 /* We don't need a pic base, we're not producing pic. */
8707 gcc_unreachable ();
8709 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8710 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8711 targetm.asm_out.internal_label (asm_out_file, "L",
8712 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8714 else
8716 char name[32];
8717 get_pc_thunk_name (name, REGNO (dest));
8718 pic_labels_used |= 1 << REGNO (dest);
8720 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8721 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8722 output_asm_insn ("call\t%X2", xops);
8724 #if TARGET_MACHO
8725 /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here.
8726 This is what will be referenced by the Mach-O PIC subsystem. */
8727 if (machopic_should_output_picbase_label () || !label)
8728 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8730 /* When we are restoring the pic base at the site of a nonlocal label,
8731 and we decided to emit the pic base above, we will still output a
8732 local label used for calculating the correction offset (even though
8733 the offset will be 0 in that case). */
8734 if (label)
8735 targetm.asm_out.internal_label (asm_out_file, "L",
8736 CODE_LABEL_NUMBER (label));
8737 #endif
8740 if (!TARGET_MACHO)
8741 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8743 return "";
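/* Illustration (a sketch of typical -fpic output on ia32, not taken from
   the source): the SET_GOT pattern for %ebx expands to roughly
     call __x86.get_pc_thunk.bx
     addl $_GLOBAL_OFFSET_TABLE_, %ebx
   where the thunk body emitted by ix86_code_end is just
     movl (%esp), %ebx
     ret  */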
8746 /* Generate a "push" pattern for input ARG.  */
8748 static rtx
8749 gen_push (rtx arg)
8751 struct machine_function *m = cfun->machine;
8753 if (m->fs.cfa_reg == stack_pointer_rtx)
8754 m->fs.cfa_offset += UNITS_PER_WORD;
8755 m->fs.sp_offset += UNITS_PER_WORD;
8757 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8758 arg = gen_rtx_REG (word_mode, REGNO (arg));
8760 return gen_rtx_SET (VOIDmode,
8761 gen_rtx_MEM (word_mode,
8762 gen_rtx_PRE_DEC (Pmode,
8763 stack_pointer_rtx)),
8764 arg);
8767 /* Generate a "pop" pattern for input ARG.  */
8769 static rtx
8770 gen_pop (rtx arg)
8772 if (REG_P (arg) && GET_MODE (arg) != word_mode)
8773 arg = gen_rtx_REG (word_mode, REGNO (arg));
8775 return gen_rtx_SET (VOIDmode,
8776 arg,
8777 gen_rtx_MEM (word_mode,
8778 gen_rtx_POST_INC (Pmode,
8779 stack_pointer_rtx)));
8782 /* Return >= 0 if there is an unused call-clobbered register available
8783 for the entire function. */
8785 static unsigned int
8786 ix86_select_alt_pic_regnum (void)
8788 if (crtl->is_leaf
8789 && !crtl->profile
8790 && !ix86_current_function_calls_tls_descriptor)
8792 int i, drap;
8793 /* Can't use the same register for both PIC and DRAP. */
8794 if (crtl->drap_reg)
8795 drap = REGNO (crtl->drap_reg);
8796 else
8797 drap = -1;
8798 for (i = 2; i >= 0; --i)
8799 if (i != drap && !df_regs_ever_live_p (i))
8800 return i;
8803 return INVALID_REGNUM;
8806 /* Return TRUE if we need to save REGNO. */
8808 static bool
8809 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8811 if (pic_offset_table_rtx
8812 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8813 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8814 || crtl->profile
8815 || crtl->calls_eh_return
8816 || crtl->uses_const_pool
8817 || cfun->has_nonlocal_label))
8818 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8820 if (crtl->calls_eh_return && maybe_eh_return)
8822 unsigned i;
8823 for (i = 0; ; i++)
8825 unsigned test = EH_RETURN_DATA_REGNO (i);
8826 if (test == INVALID_REGNUM)
8827 break;
8828 if (test == regno)
8829 return true;
8833 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8834 return true;
8836 return (df_regs_ever_live_p (regno)
8837 && !call_used_regs[regno]
8838 && !fixed_regs[regno]
8839 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8842 /* Return the number of saved general purpose registers.  */
8844 static int
8845 ix86_nsaved_regs (void)
8847 int nregs = 0;
8848 int regno;
8850 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8851 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8852 nregs ++;
8853 return nregs;
8856 /* Return the number of saved SSE registers.  */
8858 static int
8859 ix86_nsaved_sseregs (void)
8861 int nregs = 0;
8862 int regno;
8864 if (!TARGET_64BIT_MS_ABI)
8865 return 0;
8866 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8867 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8868 nregs ++;
8869 return nregs;
8872 /* Given FROM and TO register numbers, say whether this elimination is
8873 allowed. If stack alignment is needed, we can only replace argument
8874 pointer with hard frame pointer, or replace frame pointer with stack
8875 pointer. Otherwise, frame pointer elimination is automatically
8876 handled and all other eliminations are valid. */
8878 static bool
8879 ix86_can_eliminate (const int from, const int to)
8881 if (stack_realign_fp)
8882 return ((from == ARG_POINTER_REGNUM
8883 && to == HARD_FRAME_POINTER_REGNUM)
8884 || (from == FRAME_POINTER_REGNUM
8885 && to == STACK_POINTER_REGNUM));
8886 else
8887 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8890 /* Return the offset between two registers, one to be eliminated, and the other
8891 its replacement, at the start of a routine. */
8893 HOST_WIDE_INT
8894 ix86_initial_elimination_offset (int from, int to)
8896 struct ix86_frame frame;
8897 ix86_compute_frame_layout (&frame);
8899 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8900 return frame.hard_frame_pointer_offset;
8901 else if (from == FRAME_POINTER_REGNUM
8902 && to == HARD_FRAME_POINTER_REGNUM)
8903 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8904 else
8906 gcc_assert (to == STACK_POINTER_REGNUM);
8908 if (from == ARG_POINTER_REGNUM)
8909 return frame.stack_pointer_offset;
8911 gcc_assert (from == FRAME_POINTER_REGNUM);
8912 return frame.stack_pointer_offset - frame.frame_pointer_offset;
8916 /* In a dynamically-aligned function, we can't know the offset from
8917 stack pointer to frame pointer, so we must ensure that setjmp
8918 eliminates fp against the hard fp (%ebp) rather than trying to
8919 index from %esp up to the top of the frame across a gap that is
8920 of unknown (at compile-time) size. */
8921 static rtx
8922 ix86_builtin_setjmp_frame_value (void)
8924 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8927 /* When using -fsplit-stack, the allocation routines set a field in
8928 the TCB to the bottom of the stack plus this much space, measured
8929 in bytes. */
8931 #define SPLIT_STACK_AVAILABLE 256
8933 /* Fill the ix86_frame structure describing the frame of the function being compiled. */
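/* A rough sketch of what is computed below, from the CFA towards lower
   addresses: return address, pushed static chain (if any), saved frame
   pointer (if any), general register save area, 16-byte-aligned SSE
   register save area, va-arg save area, local variables, and finally the
   outgoing argument area.  Each *_offset field records how far the
   corresponding boundary lies below the CFA.  */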
8935 static void
8936 ix86_compute_frame_layout (struct ix86_frame *frame)
8938 unsigned HOST_WIDE_INT stack_alignment_needed;
8939 HOST_WIDE_INT offset;
8940 unsigned HOST_WIDE_INT preferred_alignment;
8941 HOST_WIDE_INT size = get_frame_size ();
8942 HOST_WIDE_INT to_allocate;
8944 frame->nregs = ix86_nsaved_regs ();
8945 frame->nsseregs = ix86_nsaved_sseregs ();
8947 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8948 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8950 /* The 64-bit MS ABI seems to require the stack alignment to always be 16,
8951 except in function prologues and leaf functions. */
8952 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8953 && (!crtl->is_leaf || cfun->calls_alloca != 0
8954 || ix86_current_function_calls_tls_descriptor))
8956 preferred_alignment = 16;
8957 stack_alignment_needed = 16;
8958 crtl->preferred_stack_boundary = 128;
8959 crtl->stack_alignment_needed = 128;
8962 gcc_assert (!size || stack_alignment_needed);
8963 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8964 gcc_assert (preferred_alignment <= stack_alignment_needed);
8966 /* For SEH we have to limit the amount of code movement into the prologue.
8967 At present we do this via a BLOCKAGE, at which point there's very little
8968 scheduling that can be done, which means that there's very little point
8969 in doing anything except PUSHs. */
8970 if (TARGET_SEH)
8971 cfun->machine->use_fast_prologue_epilogue = false;
8973 /* During reload iterations the number of registers saved can change.
8974 Recompute the value as needed. Do not recompute when the number of registers
8975 didn't change, as reload does multiple calls to the function and does not
8976 expect the decision to change within a single iteration. */
8977 else if (!optimize_function_for_size_p (cfun)
8978 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8980 int count = frame->nregs;
8981 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8983 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8985 /* The fast prologue uses moves instead of pushes to save registers. This
8986 is significantly longer, but it also executes faster, as modern hardware
8987 can execute the moves in parallel but can't do that for push/pop.
8989 Be careful about choosing which prologue to emit: we may use the slow
8990 version when the function takes many instructions to execute, as well as
8991 when the function is known to lie outside a hot spot (which is known only
8992 with profile feedback). Weight the size of the function by the number of
8993 registers to save, as it is cheap to use one or two push instructions but
8994 very slow to use many of them. */
8995 if (count)
8996 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8997 if (node->frequency < NODE_FREQUENCY_NORMAL
8998 || (flag_branch_probabilities
8999 && node->frequency < NODE_FREQUENCY_HOT))
9000 cfun->machine->use_fast_prologue_epilogue = false;
9001 else
9002 cfun->machine->use_fast_prologue_epilogue
9003 = !expensive_function_p (count);
9006 frame->save_regs_using_mov
9007 = (TARGET_PROLOGUE_USING_MOVE && cfun->machine->use_fast_prologue_epilogue
9008 /* If static stack checking is enabled and done with probes,
9009 the registers need to be saved before allocating the frame. */
9010 && flag_stack_check != STATIC_BUILTIN_STACK_CHECK);
9012 /* Skip return address. */
9013 offset = UNITS_PER_WORD;
9015 /* Skip pushed static chain. */
9016 if (ix86_static_chain_on_stack)
9017 offset += UNITS_PER_WORD;
9019 /* Skip saved base pointer. */
9020 if (frame_pointer_needed)
9021 offset += UNITS_PER_WORD;
9022 frame->hfp_save_offset = offset;
9024 /* The traditional frame pointer location is at the top of the frame. */
9025 frame->hard_frame_pointer_offset = offset;
9027 /* Register save area */
9028 offset += frame->nregs * UNITS_PER_WORD;
9029 frame->reg_save_offset = offset;
9031 /* On SEH target, registers are pushed just before the frame pointer
9032 location. */
9033 if (TARGET_SEH)
9034 frame->hard_frame_pointer_offset = offset;
9036 /* Align and set SSE register save area. */
9037 if (frame->nsseregs)
9039 /* The only ABI that has saved SSE registers (Win64) also has a
9040 16-byte aligned default stack, and thus we don't need to be
9041 within the re-aligned local stack frame to save them. */
9042 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
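/* (offset + 16 - 1) & -16 rounds OFFSET up to the next multiple of 16;
   e.g. 40 becomes 48 while 48 is left alone.  */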
9043 offset = (offset + 16 - 1) & -16;
9044 offset += frame->nsseregs * 16;
9046 frame->sse_reg_save_offset = offset;
9048 /* The re-aligned stack starts here. Values before this point are not
9049 directly comparable with values below this point. In order to make
9050 sure that no value happens to be the same before and after, force
9051 the alignment computation below to add a non-zero value. */
9052 if (stack_realign_fp)
9053 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
9055 /* Va-arg area */
9056 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
9057 offset += frame->va_arg_size;
9059 /* Align start of frame for local function. */
9060 if (stack_realign_fp
9061 || offset != frame->sse_reg_save_offset
9062 || size != 0
9063 || !crtl->is_leaf
9064 || cfun->calls_alloca
9065 || ix86_current_function_calls_tls_descriptor)
9066 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
9068 /* Frame pointer points here. */
9069 frame->frame_pointer_offset = offset;
9071 offset += size;
9073 /* Add the outgoing arguments area. It can be skipped if we eliminated
9074 all the function calls as dead code.
9075 Skipping is however impossible when the function calls alloca, since the
9076 alloca expander assumes that the last crtl->outgoing_args_size bytes
9077 of the stack frame are unused. */
9078 if (ACCUMULATE_OUTGOING_ARGS
9079 && (!crtl->is_leaf || cfun->calls_alloca
9080 || ix86_current_function_calls_tls_descriptor))
9082 offset += crtl->outgoing_args_size;
9083 frame->outgoing_arguments_size = crtl->outgoing_args_size;
9085 else
9086 frame->outgoing_arguments_size = 0;
9088 /* Align stack boundary. Only needed if we're calling another function
9089 or using alloca. */
9090 if (!crtl->is_leaf || cfun->calls_alloca
9091 || ix86_current_function_calls_tls_descriptor)
9092 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
9094 /* We've reached end of stack frame. */
9095 frame->stack_pointer_offset = offset;
9097 /* Size prologue needs to allocate. */
9098 to_allocate = offset - frame->sse_reg_save_offset;
9100 if ((!to_allocate && frame->nregs <= 1)
9101 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
9102 frame->save_regs_using_mov = false;
9104 if (ix86_using_red_zone ()
9105 && crtl->sp_is_unchanging
9106 && crtl->is_leaf
9107 && !ix86_current_function_calls_tls_descriptor)
9109 frame->red_zone_size = to_allocate;
9110 if (frame->save_regs_using_mov)
9111 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
9112 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
9113 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
9115 else
9116 frame->red_zone_size = 0;
9117 frame->stack_pointer_offset -= frame->red_zone_size;
9119 /* The SEH frame pointer location is near the bottom of the frame.
9120 This is enforced by the fact that the difference between the
9121 stack pointer and the frame pointer is limited to 240 bytes in
9122 the unwind data structure. */
9123 if (TARGET_SEH)
9125 HOST_WIDE_INT diff;
9127 /* If we can leave the frame pointer where it is, do so. Also, this
9128 returns the establisher frame for __builtin_frame_address (0). */
9129 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
9130 if (diff <= SEH_MAX_FRAME_SIZE
9131 && (diff > 240 || (diff & 15) != 0)
9132 && !crtl->accesses_prior_frames)
9134 /* Ideally we'd determine what portion of the local stack frame
9135 (within the constraint of the lowest 240) is most heavily used.
9136 But without that complication, simply bias the frame pointer
9137 by 128 bytes so as to maximize the amount of the local stack
9138 frame that is addressable with 8-bit offsets. */
9139 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
9144 /* This is semi-inlined memory_address_length, but simplified
9145 since we know that we're always dealing with reg+offset, and
9146 to avoid having to create and discard all that rtl. */
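/* A few worked examples of the length computed below: a zero offset from
   %rbx needs no displacement byte, while (%rbp) and (%r13) still need a
   disp8; an offset in [-128, 127] costs 1 byte and anything larger costs 4;
   %rsp and %r12 always add one more byte for the SIB.  So 8(%rsp) scores 2
   and 512(%rbx) scores 4.  */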
9148 static inline int
9149 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
9151 int len = 4;
9153 if (offset == 0)
9155 /* EBP and R13 cannot be encoded without an offset. */
9156 len = (regno == BP_REG || regno == R13_REG);
9158 else if (IN_RANGE (offset, -128, 127))
9159 len = 1;
9161 /* ESP and R12 must be encoded with a SIB byte. */
9162 if (regno == SP_REG || regno == R12_REG)
9163 len++;
9165 return len;
9168 /* Return an RTX that points to CFA_OFFSET within the stack frame.
9169 The valid base registers are taken from CFUN->MACHINE->FS. */
9171 static rtx
9172 choose_baseaddr (HOST_WIDE_INT cfa_offset)
9174 const struct machine_function *m = cfun->machine;
9175 rtx base_reg = NULL;
9176 HOST_WIDE_INT base_offset = 0;
9178 if (m->use_fast_prologue_epilogue)
9180 /* Choose the base register most likely to allow the most scheduling
9181 opportunities. Generally FP is valid throughout the function,
9182 while DRAP must be reloaded within the epilogue. But choose either
9183 over the SP due to increased encoding size. */
9185 if (m->fs.fp_valid)
9187 base_reg = hard_frame_pointer_rtx;
9188 base_offset = m->fs.fp_offset - cfa_offset;
9190 else if (m->fs.drap_valid)
9192 base_reg = crtl->drap_reg;
9193 base_offset = 0 - cfa_offset;
9195 else if (m->fs.sp_valid)
9197 base_reg = stack_pointer_rtx;
9198 base_offset = m->fs.sp_offset - cfa_offset;
9201 else
9203 HOST_WIDE_INT toffset;
9204 int len = 16, tlen;
9206 /* Choose the base register with the smallest address encoding.
9207 With a tie, choose FP > DRAP > SP. */
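/* The <= comparisons below let a later candidate displace an earlier one
   of equal length, so with SP checked first and FP checked last the stated
   FP > DRAP > SP preference follows from the order of the tests.  */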
9208 if (m->fs.sp_valid)
9210 base_reg = stack_pointer_rtx;
9211 base_offset = m->fs.sp_offset - cfa_offset;
9212 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
9214 if (m->fs.drap_valid)
9216 toffset = 0 - cfa_offset;
9217 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
9218 if (tlen <= len)
9220 base_reg = crtl->drap_reg;
9221 base_offset = toffset;
9222 len = tlen;
9225 if (m->fs.fp_valid)
9227 toffset = m->fs.fp_offset - cfa_offset;
9228 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
9229 if (tlen <= len)
9231 base_reg = hard_frame_pointer_rtx;
9232 base_offset = toffset;
9233 len = tlen;
9237 gcc_assert (base_reg != NULL);
9239 return plus_constant (Pmode, base_reg, base_offset);
9242 /* Emit code to save registers in the prologue. */
9244 static void
9245 ix86_emit_save_regs (void)
9247 unsigned int regno;
9248 rtx insn;
9250 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
9251 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9253 insn = emit_insn (gen_push (gen_rtx_REG (word_mode, regno)));
9254 RTX_FRAME_RELATED_P (insn) = 1;
9258 /* Emit a single register save at CFA - CFA_OFFSET. */
9260 static void
9261 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
9262 HOST_WIDE_INT cfa_offset)
9264 struct machine_function *m = cfun->machine;
9265 rtx reg = gen_rtx_REG (mode, regno);
9266 rtx mem, addr, base, insn;
9268 addr = choose_baseaddr (cfa_offset);
9269 mem = gen_frame_mem (mode, addr);
9271 /* For SSE saves, we need to indicate the 128-bit alignment. */
9272 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
9274 insn = emit_move_insn (mem, reg);
9275 RTX_FRAME_RELATED_P (insn) = 1;
9277 base = addr;
9278 if (GET_CODE (base) == PLUS)
9279 base = XEXP (base, 0);
9280 gcc_checking_assert (REG_P (base));
9282 /* When saving registers into a re-aligned local stack frame, avoid
9283 any tricky guessing by dwarf2out. */
9284 if (m->fs.realigned)
9286 gcc_checking_assert (stack_realign_drap);
9288 if (regno == REGNO (crtl->drap_reg))
9290 /* A bit of a hack. We force the DRAP register to be saved in
9291 the re-aligned stack frame, which provides us with a copy
9292 of the CFA that will last past the prologue. Install it. */
9293 gcc_checking_assert (cfun->machine->fs.fp_valid);
9294 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9295 cfun->machine->fs.fp_offset - cfa_offset);
9296 mem = gen_rtx_MEM (mode, addr);
9297 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9299 else
9301 /* The frame pointer is a stable reference within the
9302 aligned frame. Use it. */
9303 gcc_checking_assert (cfun->machine->fs.fp_valid);
9304 addr = plus_constant (Pmode, hard_frame_pointer_rtx,
9305 cfun->machine->fs.fp_offset - cfa_offset);
9306 mem = gen_rtx_MEM (mode, addr);
9307 add_reg_note (insn, REG_CFA_EXPRESSION,
9308 gen_rtx_SET (VOIDmode, mem, reg));
9312 /* The memory may not be relative to the current CFA register,
9313 which means that we may need to generate a new pattern for
9314 use by the unwind info. */
9315 else if (base != m->fs.cfa_reg)
9317 addr = plus_constant (Pmode, m->fs.cfa_reg,
9318 m->fs.cfa_offset - cfa_offset);
9319 mem = gen_rtx_MEM (mode, addr);
9320 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9324 /* Emit code to save registers using MOV insns.
9325 First register is stored at CFA - CFA_OFFSET. */
9326 static void
9327 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9329 unsigned int regno;
9331 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9332 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9334 ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
9335 cfa_offset -= UNITS_PER_WORD;
9339 /* Emit code to save SSE registers using MOV insns.
9340 First register is stored at CFA - CFA_OFFSET. */
9341 static void
9342 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9344 unsigned int regno;
9346 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9347 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9349 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9350 cfa_offset -= 16;
9354 static GTY(()) rtx queued_cfa_restores;
9356 /* Add a REG_CFA_RESTORE REG note to INSN or queue them until next stack
9357 manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9358 Don't add the note if the previously saved value will be left untouched
9359 within stack red-zone till return, as unwinders can find the same value
9360 in the register and on the stack. */
9362 static void
9363 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9365 if (!crtl->shrink_wrapped
9366 && cfa_offset <= cfun->machine->fs.red_zone_offset)
9367 return;
9369 if (insn)
9371 add_reg_note (insn, REG_CFA_RESTORE, reg);
9372 RTX_FRAME_RELATED_P (insn) = 1;
9374 else
9375 queued_cfa_restores
9376 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9379 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9381 static void
9382 ix86_add_queued_cfa_restore_notes (rtx insn)
9384 rtx last;
9385 if (!queued_cfa_restores)
9386 return;
9387 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9389 XEXP (last, 1) = REG_NOTES (insn);
9390 REG_NOTES (insn) = queued_cfa_restores;
9391 queued_cfa_restores = NULL_RTX;
9392 RTX_FRAME_RELATED_P (insn) = 1;
9395 /* Expand a prologue or epilogue stack adjustment.
9396 The pattern exists to put a dependency on all ebp-based memory accesses.
9397 STYLE should be negative if instructions should be marked as frame related,
9398 zero if the %r11 register is live and cannot be freely used, and positive
9399 otherwise. */
9401 static void
9402 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9403 int style, bool set_cfa)
9405 struct machine_function *m = cfun->machine;
9406 rtx insn;
9407 bool add_frame_related_expr = false;
9409 if (Pmode == SImode)
9410 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9411 else if (x86_64_immediate_operand (offset, DImode))
9412 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9413 else
9415 rtx tmp;
9416 /* r11 is used by indirect sibcall return as well, set before the
9417 epilogue and used after the epilogue. */
9418 if (style)
9419 tmp = gen_rtx_REG (DImode, R11_REG);
9420 else
9422 gcc_assert (src != hard_frame_pointer_rtx
9423 && dest != hard_frame_pointer_rtx);
9424 tmp = hard_frame_pointer_rtx;
9426 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9427 if (style < 0)
9428 add_frame_related_expr = true;
9430 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9433 insn = emit_insn (insn);
9434 if (style >= 0)
9435 ix86_add_queued_cfa_restore_notes (insn);
9437 if (set_cfa)
9439 rtx r;
9441 gcc_assert (m->fs.cfa_reg == src);
9442 m->fs.cfa_offset += INTVAL (offset);
9443 m->fs.cfa_reg = dest;
9445 r = gen_rtx_PLUS (Pmode, src, offset);
9446 r = gen_rtx_SET (VOIDmode, dest, r);
9447 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9448 RTX_FRAME_RELATED_P (insn) = 1;
9450 else if (style < 0)
9452 RTX_FRAME_RELATED_P (insn) = 1;
9453 if (add_frame_related_expr)
9455 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9456 r = gen_rtx_SET (VOIDmode, dest, r);
9457 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9461 if (dest == stack_pointer_rtx)
9463 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9464 bool valid = m->fs.sp_valid;
9466 if (src == hard_frame_pointer_rtx)
9468 valid = m->fs.fp_valid;
9469 ooffset = m->fs.fp_offset;
9471 else if (src == crtl->drap_reg)
9473 valid = m->fs.drap_valid;
9474 ooffset = 0;
9476 else
9478 /* Else there are two possibilities: SP itself, which we set
9479 up as the default above. Or EH_RETURN_STACKADJ_RTX, which is
9480 taken care of by hand along the eh_return path. */
9481 gcc_checking_assert (src == stack_pointer_rtx
9482 || offset == const0_rtx);
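/* OFFSET is negative when the stack grows (the prologue passes
   GEN_INT (-allocate)), so subtracting it makes the tracked distance
   from the CFA grow by the amount allocated.  */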
9485 m->fs.sp_offset = ooffset - INTVAL (offset);
9486 m->fs.sp_valid = valid;
9490 /* Find an available register to be used as dynamic realign argument
9491 pointer register. Such a register will be written in the prologue and
9492 used at the beginning of the body, so it must not be
9493 1. parameter passing register.
9494 2. GOT pointer.
9495 We reuse static-chain register if it is available. Otherwise, we
9496 use DI for i386 and R13 for x86-64. We chose R13 since it has
9497 shorter encoding.
9499 Return: the regno of chosen register. */
9501 static unsigned int
9502 find_drap_reg (void)
9504 tree decl = cfun->decl;
9506 if (TARGET_64BIT)
9508 /* Use R13 for a nested function or a function that needs a static chain.
9509 Since a function with a tail call may use any caller-saved
9510 register in the epilogue, DRAP must not use a caller-saved
9511 register in that case. */
9512 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9513 return R13_REG;
9515 return R10_REG;
9517 else
9519 /* Use DI for a nested function or a function that needs a static chain.
9520 Since a function with a tail call may use any caller-saved
9521 register in the epilogue, DRAP must not use a caller-saved
9522 register in that case. */
9523 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9524 return DI_REG;
9526 /* Reuse static chain register if it isn't used for parameter
9527 passing. */
9528 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9530 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9531 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9532 return CX_REG;
9534 return DI_REG;
9538 /* Return minimum incoming stack alignment. */
9540 static unsigned int
9541 ix86_minimum_incoming_stack_boundary (bool sibcall)
9543 unsigned int incoming_stack_boundary;
9545 /* Prefer the one specified at command line. */
9546 if (ix86_user_incoming_stack_boundary)
9547 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9548 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9549 if -mstackrealign is used, this is not the sibcall check, and the
9550 estimated stack alignment is 128 bits. */
9551 else if (!sibcall
9552 && !TARGET_64BIT
9553 && ix86_force_align_arg_pointer
9554 && crtl->stack_alignment_estimated == 128)
9555 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9556 else
9557 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9559 /* Incoming stack alignment can be changed on individual functions
9560 via force_align_arg_pointer attribute. We use the smallest
9561 incoming stack boundary. */
9562 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9563 && lookup_attribute (ix86_force_align_arg_pointer_string,
9564 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9565 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9567 /* The incoming stack frame has to be aligned at least at
9568 parm_stack_boundary. */
9569 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9570 incoming_stack_boundary = crtl->parm_stack_boundary;
9572 /* Stack at entrance of main is aligned by runtime. We use the
9573 smallest incoming stack boundary. */
9574 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9575 && DECL_NAME (current_function_decl)
9576 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9577 && DECL_FILE_SCOPE_P (current_function_decl))
9578 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9580 return incoming_stack_boundary;
9583 /* Update incoming stack boundary and estimated stack alignment. */
9585 static void
9586 ix86_update_stack_boundary (void)
9588 ix86_incoming_stack_boundary
9589 = ix86_minimum_incoming_stack_boundary (false);
9591 /* x86_64 varargs needs 16-byte stack alignment for the register save
9592 area. */
9593 if (TARGET_64BIT
9594 && cfun->stdarg
9595 && crtl->stack_alignment_estimated < 128)
9596 crtl->stack_alignment_estimated = 128;
9599 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9600 needed or an rtx for DRAP otherwise. */
9602 static rtx
9603 ix86_get_drap_rtx (void)
9605 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9606 crtl->need_drap = true;
9608 if (stack_realign_drap)
9610 /* Assign DRAP to vDRAP and return vDRAP. */
9611 unsigned int regno = find_drap_reg ();
9612 rtx drap_vreg;
9613 rtx arg_ptr;
9614 rtx seq, insn;
9616 arg_ptr = gen_rtx_REG (Pmode, regno);
9617 crtl->drap_reg = arg_ptr;
9619 start_sequence ();
9620 drap_vreg = copy_to_reg (arg_ptr);
9621 seq = get_insns ();
9622 end_sequence ();
9624 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9625 if (!optimize)
9627 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9628 RTX_FRAME_RELATED_P (insn) = 1;
9630 return drap_vreg;
9632 else
9633 return NULL;
9636 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9638 static rtx
9639 ix86_internal_arg_pointer (void)
9641 return virtual_incoming_args_rtx;
9644 struct scratch_reg {
9645 rtx reg;
9646 bool saved;
9649 /* Return a short-lived scratch register for use on function entry.
9650 In 32-bit mode, it is valid only after the registers are saved
9651 in the prologue. This register must be released by means of
9652 release_scratch_register_on_entry once it is dead. */
9654 static void
9655 get_scratch_register_on_entry (struct scratch_reg *sr)
9657 int regno;
9659 sr->saved = false;
9661 if (TARGET_64BIT)
9663 /* We always use R11 in 64-bit mode. */
9664 regno = R11_REG;
9666 else
9668 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9669 bool fastcall_p
9670 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9671 bool thiscall_p
9672 = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9673 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9674 int regparm = ix86_function_regparm (fntype, decl);
9675 int drap_regno
9676 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9678 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9679 for the static chain register. */
9680 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9681 && drap_regno != AX_REG)
9682 regno = AX_REG;
9683 /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx
9684 for the static chain register. */
9685 else if (thiscall_p && !static_chain_p && drap_regno != AX_REG)
9686 regno = AX_REG;
9687 else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG)
9688 regno = DX_REG;
9689 /* ecx is the static chain register. */
9690 else if (regparm < 3 && !fastcall_p && !thiscall_p
9691 && !static_chain_p
9692 && drap_regno != CX_REG)
9693 regno = CX_REG;
9694 else if (ix86_save_reg (BX_REG, true))
9695 regno = BX_REG;
9696 /* esi is the static chain register. */
9697 else if (!(regparm == 3 && static_chain_p)
9698 && ix86_save_reg (SI_REG, true))
9699 regno = SI_REG;
9700 else if (ix86_save_reg (DI_REG, true))
9701 regno = DI_REG;
9702 else
9704 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9705 sr->saved = true;
9709 sr->reg = gen_rtx_REG (Pmode, regno);
9710 if (sr->saved)
9712 rtx insn = emit_insn (gen_push (sr->reg));
9713 RTX_FRAME_RELATED_P (insn) = 1;
9717 /* Release a scratch register obtained from the preceding function. */
9719 static void
9720 release_scratch_register_on_entry (struct scratch_reg *sr)
9722 if (sr->saved)
9724 struct machine_function *m = cfun->machine;
9725 rtx x, insn = emit_insn (gen_pop (sr->reg));
9727 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9728 RTX_FRAME_RELATED_P (insn) = 1;
9729 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9730 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9731 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
9732 m->fs.sp_offset -= UNITS_PER_WORD;
9736 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
9738 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9740 static void
9741 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9743 /* We skip the probe for the first interval + a small dope of 4 words and
9744 probe that many bytes past the specified size to maintain a protection
9745 area at the bottom of the stack. */
9746 const int dope = 4 * UNITS_PER_WORD;
9747 rtx size_rtx = GEN_INT (size), last;
9749 /* See if we have a constant small number of probes to generate. If so,
9750 that's the easy case. The run-time loop is made up of 11 insns in the
9751 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9752 for n # of intervals. */
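/* Presumably the 5 * PROBE_INTERVAL cutoff below comes from that arithmetic:
   with n = 5 intervals the unrolled sequence costs 3 + 2*(5-1) = 11 insns,
   the same as the run-time loop.  */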
9753 if (size <= 5 * PROBE_INTERVAL)
9755 HOST_WIDE_INT i, adjust;
9756 bool first_probe = true;
9758 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9759 values of N from 1 until it exceeds SIZE. If only one probe is
9760 needed, this will not generate any code. Then adjust and probe
9761 to PROBE_INTERVAL + SIZE. */
9762 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9764 if (first_probe)
9766 adjust = 2 * PROBE_INTERVAL + dope;
9767 first_probe = false;
9769 else
9770 adjust = PROBE_INTERVAL;
9772 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9773 plus_constant (Pmode, stack_pointer_rtx,
9774 -adjust)));
9775 emit_stack_probe (stack_pointer_rtx);
9778 if (first_probe)
9779 adjust = size + PROBE_INTERVAL + dope;
9780 else
9781 adjust = size + PROBE_INTERVAL - i;
9783 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9784 plus_constant (Pmode, stack_pointer_rtx,
9785 -adjust)));
9786 emit_stack_probe (stack_pointer_rtx);
9788 /* Adjust back to account for the additional first interval. */
9789 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9790 plus_constant (Pmode, stack_pointer_rtx,
9791 PROBE_INTERVAL + dope)));
9794 /* Otherwise, do the same as above, but in a loop. Note that we must be
9795 extra careful with variables wrapping around because we might be at
9796 the very top (or the very bottom) of the address space and we have
9797 to be able to handle this case properly; in particular, we use an
9798 equality test for the loop condition. */
9799 else
9801 HOST_WIDE_INT rounded_size;
9802 struct scratch_reg sr;
9804 get_scratch_register_on_entry (&sr);
9807 /* Step 1: round SIZE to the previous multiple of the interval. */
9809 rounded_size = size & -PROBE_INTERVAL;
9812 /* Step 2: compute initial and final value of the loop counter. */
9814 /* SP = SP_0 + PROBE_INTERVAL. */
9815 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9816 plus_constant (Pmode, stack_pointer_rtx,
9817 - (PROBE_INTERVAL + dope))));
9819 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9820 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9821 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9822 gen_rtx_PLUS (Pmode, sr.reg,
9823 stack_pointer_rtx)));
9826 /* Step 3: the loop
9828 while (SP != LAST_ADDR)
9830 SP = SP + PROBE_INTERVAL
9831 probe at SP
9834 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9835 values of N from 1 until it is equal to ROUNDED_SIZE. */
9837 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9840 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9841 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9843 if (size != rounded_size)
9845 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9846 plus_constant (Pmode, stack_pointer_rtx,
9847 rounded_size - size)));
9848 emit_stack_probe (stack_pointer_rtx);
9851 /* Adjust back to account for the additional first interval. */
9852 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9853 plus_constant (Pmode, stack_pointer_rtx,
9854 PROBE_INTERVAL + dope)));
9856 release_scratch_register_on_entry (&sr);
9859 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9861 /* Even if the stack pointer isn't the CFA register, we need to correctly
9862 describe the adjustments made to it, in particular differentiate the
9863 frame-related ones from the frame-unrelated ones. */
9864 if (size > 0)
9866 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9867 XVECEXP (expr, 0, 0)
9868 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9869 plus_constant (Pmode, stack_pointer_rtx, -size));
9870 XVECEXP (expr, 0, 1)
9871 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9872 plus_constant (Pmode, stack_pointer_rtx,
9873 PROBE_INTERVAL + dope + size));
9874 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9875 RTX_FRAME_RELATED_P (last) = 1;
9877 cfun->machine->fs.sp_offset += size;
9880 /* Make sure nothing is scheduled before we are done. */
9881 emit_insn (gen_blockage ());
9884 /* Adjust the stack pointer up to REG while probing it. */
9886 const char *
9887 output_adjust_stack_and_probe (rtx reg)
9889 static int labelno = 0;
9890 char loop_lab[32], end_lab[32];
9891 rtx xops[2];
9893 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9894 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9896 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9898 /* Jump to END_LAB if SP == LAST_ADDR. */
9899 xops[0] = stack_pointer_rtx;
9900 xops[1] = reg;
9901 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9902 fputs ("\tje\t", asm_out_file);
9903 assemble_name_raw (asm_out_file, end_lab);
9904 fputc ('\n', asm_out_file);
9906 /* SP = SP + PROBE_INTERVAL. */
9907 xops[1] = GEN_INT (PROBE_INTERVAL);
9908 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9910 /* Probe at SP. */
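/* The OR with zero is a read-modify-write that leaves the word unchanged,
   touching the stack page without altering its contents.  */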
9911 xops[1] = const0_rtx;
9912 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9914 fprintf (asm_out_file, "\tjmp\t");
9915 assemble_name_raw (asm_out_file, loop_lab);
9916 fputc ('\n', asm_out_file);
9918 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9920 return "";
9923 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9924 inclusive. These are offsets from the current stack pointer. */
9926 static void
9927 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9929 /* See if we have a constant small number of probes to generate. If so,
9930 that's the easy case. The run-time loop is made up of 7 insns in the
9931 generic case while the compile-time loop is made up of n insns for n #
9932 of intervals. */
9933 if (size <= 7 * PROBE_INTERVAL)
9935 HOST_WIDE_INT i;
9937 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9938 it exceeds SIZE. If only one probe is needed, this will not
9939 generate any code. Then probe at FIRST + SIZE. */
9940 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9941 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9942 -(first + i)));
9944 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9945 -(first + size)));
9948 /* Otherwise, do the same as above, but in a loop. Note that we must be
9949 extra careful with variables wrapping around because we might be at
9950 the very top (or the very bottom) of the address space and we have
9951 to be able to handle this case properly; in particular, we use an
9952 equality test for the loop condition. */
9953 else
9955 HOST_WIDE_INT rounded_size, last;
9956 struct scratch_reg sr;
9958 get_scratch_register_on_entry (&sr);
9961 /* Step 1: round SIZE to the previous multiple of the interval. */
9963 rounded_size = size & -PROBE_INTERVAL;
9966 /* Step 2: compute initial and final value of the loop counter. */
9968 /* TEST_OFFSET = FIRST. */
9969 emit_move_insn (sr.reg, GEN_INT (-first));
9971 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9972 last = first + rounded_size;
9975 /* Step 3: the loop
9977 while (TEST_ADDR != LAST_ADDR)
9979 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9980 probe at TEST_ADDR
9983 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9984 until it is equal to ROUNDED_SIZE. */
9986 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9989 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9990 that SIZE is equal to ROUNDED_SIZE. */
9992 if (size != rounded_size)
9993 emit_stack_probe (plus_constant (Pmode,
9994 gen_rtx_PLUS (Pmode,
9995 stack_pointer_rtx,
9996 sr.reg),
9997 rounded_size - size));
9999 release_scratch_register_on_entry (&sr);
10002 /* Make sure nothing is scheduled before we are done. */
10003 emit_insn (gen_blockage ());
10006 /* Probe a range of stack addresses from REG to END, inclusive. These are
10007 offsets from the current stack pointer. */
10009 const char *
10010 output_probe_stack_range (rtx reg, rtx end)
10012 static int labelno = 0;
10013 char loop_lab[32], end_lab[32];
10014 rtx xops[3];
10016 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
10017 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
10019 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
10021 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
10022 xops[0] = reg;
10023 xops[1] = end;
10024 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
10025 fputs ("\tje\t", asm_out_file);
10026 assemble_name_raw (asm_out_file, end_lab);
10027 fputc ('\n', asm_out_file);
10029 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
10030 xops[1] = GEN_INT (PROBE_INTERVAL);
10031 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
10033 /* Probe at TEST_ADDR. */
10034 xops[0] = stack_pointer_rtx;
10035 xops[1] = reg;
10036 xops[2] = const0_rtx;
10037 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
10039 fprintf (asm_out_file, "\tjmp\t");
10040 assemble_name_raw (asm_out_file, loop_lab);
10041 fputc ('\n', asm_out_file);
10043 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
10045 return "";
10048 /* Finalize stack_realign_needed flag, which will guide prologue/epilogue
10049 to be generated in correct form. */
10050 static void
10051 ix86_finalize_stack_realign_flags (void)
10053 /* Check if stack realignment is really needed after reload, and
10054 store the result in cfun. */
10055 unsigned int incoming_stack_boundary
10056 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
10057 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
10058 unsigned int stack_realign = (incoming_stack_boundary
10059 < (crtl->is_leaf
10060 ? crtl->max_used_stack_slot_alignment
10061 : crtl->stack_alignment_needed));
10063 if (crtl->stack_realign_finalized)
10065 /* After stack_realign_needed is finalized, we can no longer
10066 change it. */
10067 gcc_assert (crtl->stack_realign_needed == stack_realign);
10068 return;
10071 /* If the only reason for frame_pointer_needed is that we conservatively
10072 assumed stack realignment might be needed, but in the end nothing that
10073 needed the stack alignment had been spilled, clear frame_pointer_needed
10074 and say we don't need stack realignment. */
10075 if (stack_realign
10076 && !crtl->need_drap
10077 && frame_pointer_needed
10078 && crtl->is_leaf
10079 && flag_omit_frame_pointer
10080 && crtl->sp_is_unchanging
10081 && !ix86_current_function_calls_tls_descriptor
10082 && !crtl->accesses_prior_frames
10083 && !cfun->calls_alloca
10084 && !crtl->calls_eh_return
10085 && !(flag_stack_check && STACK_CHECK_MOVING_SP)
10086 && !ix86_frame_pointer_required ()
10087 && get_frame_size () == 0
10088 && ix86_nsaved_sseregs () == 0
10089 && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
10091 HARD_REG_SET set_up_by_prologue, prologue_used;
10092 basic_block bb;
10094 CLEAR_HARD_REG_SET (prologue_used);
10095 CLEAR_HARD_REG_SET (set_up_by_prologue);
10096 add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
10097 add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
10098 add_to_hard_reg_set (&set_up_by_prologue, Pmode,
10099 HARD_FRAME_POINTER_REGNUM);
10100 FOR_EACH_BB (bb)
10102 rtx insn;
10103 FOR_BB_INSNS (bb, insn)
10104 if (NONDEBUG_INSN_P (insn)
10105 && requires_stack_frame_p (insn, prologue_used,
10106 set_up_by_prologue))
10108 crtl->stack_realign_needed = stack_realign;
10109 crtl->stack_realign_finalized = true;
10110 return;
10114 frame_pointer_needed = false;
10115 stack_realign = false;
10116 crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
10117 crtl->stack_alignment_needed = incoming_stack_boundary;
10118 crtl->stack_alignment_estimated = incoming_stack_boundary;
10119 if (crtl->preferred_stack_boundary > incoming_stack_boundary)
10120 crtl->preferred_stack_boundary = incoming_stack_boundary;
10121 df_finish_pass (true);
10122 df_scan_alloc (NULL);
10123 df_scan_blocks ();
10124 df_compute_regs_ever_live (true);
10125 df_analyze ();
10128 crtl->stack_realign_needed = stack_realign;
10129 crtl->stack_realign_finalized = true;
10132 /* Expand the prologue into a bunch of separate insns. */
10134 void
10135 ix86_expand_prologue (void)
10137 struct machine_function *m = cfun->machine;
10138 rtx insn, t;
10139 bool pic_reg_used;
10140 struct ix86_frame frame;
10141 HOST_WIDE_INT allocate;
10142 bool int_registers_saved;
10143 bool sse_registers_saved;
10145 ix86_finalize_stack_realign_flags ();
10147 /* DRAP should not coexist with stack_realign_fp */
10148 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
10150 memset (&m->fs, 0, sizeof (m->fs));
10152 /* Initialize CFA state for before the prologue. */
10153 m->fs.cfa_reg = stack_pointer_rtx;
10154 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
10156 /* Track SP offset to the CFA. We continue tracking this after we've
10157 swapped the CFA register away from SP. In the case of re-alignment
10158 this is fudged; we're interested in offsets within the local frame. */
10159 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10160 m->fs.sp_valid = true;
10162 ix86_compute_frame_layout (&frame);
10164 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
10166 /* We should have already generated an error for any use of
10167 ms_hook on a nested function. */
10168 gcc_checking_assert (!ix86_static_chain_on_stack);
10170 /* Check if profiling is active and we shall use profiling before
10171 prologue variant. If so sorry. */
10172 if (crtl->profile && flag_fentry != 0)
10173 sorry ("ms_hook_prologue attribute isn%'t compatible "
10174 "with -mfentry for 32-bit");
10176 /* In ix86_asm_output_function_label we emitted:
10177 8b ff movl.s %edi,%edi
10178 55 push %ebp
10179 8b ec movl.s %esp,%ebp
10181 This matches the hookable function prologue in Win32 API
10182 functions in Microsoft Windows XP Service Pack 2 and newer.
10183 Wine uses this to enable Windows apps to hook the Win32 API
10184 functions provided by Wine.
10186 What that means is that we've already set up the frame pointer. */
10188 if (frame_pointer_needed
10189 && !(crtl->drap_reg && crtl->stack_realign_needed))
10191 rtx push, mov;
10193 /* We've decided to use the frame pointer already set up.
10194 Describe this to the unwinder by pretending that both
10195 push and mov insns happen right here.
10197 Putting the unwind info here at the end of the ms_hook
10198 is done so that we can make absolutely certain we get
10199 the required byte sequence at the start of the function,
10200 rather than relying on an assembler that can produce
10201 the exact encoding required.
10203 However it does mean (in the unpatched case) that we have
10204 a 1 insn window where the asynchronous unwind info is
10205 incorrect. However, if we placed the unwind info at
10206 its correct location we would have incorrect unwind info
10207 in the patched case. Which is probably all moot since
10208 I don't expect Wine generates dwarf2 unwind info for the
10209 system libraries that use this feature. */
10211 insn = emit_insn (gen_blockage ());
10213 push = gen_push (hard_frame_pointer_rtx);
10214 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
10215 stack_pointer_rtx);
10216 RTX_FRAME_RELATED_P (push) = 1;
10217 RTX_FRAME_RELATED_P (mov) = 1;
10219 RTX_FRAME_RELATED_P (insn) = 1;
10220 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10221 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
10223 /* Note that gen_push incremented m->fs.cfa_offset, even
10224 though we didn't emit the push insn here. */
10225 m->fs.cfa_reg = hard_frame_pointer_rtx;
10226 m->fs.fp_offset = m->fs.cfa_offset;
10227 m->fs.fp_valid = true;
10229 else
10231 /* The frame pointer is not needed so pop %ebp again.
10232 This leaves us with a pristine state. */
10233 emit_insn (gen_pop (hard_frame_pointer_rtx));
10237 /* The first insn of a function that accepts its static chain on the
10238 stack is to push the register that would be filled in by a direct
10239 call. This insn will be skipped by the trampoline. */
10240 else if (ix86_static_chain_on_stack)
10242 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
10243 emit_insn (gen_blockage ());
10245 /* We don't want to interpret this push insn as a register save,
10246 only as a stack adjustment. The real copy of the register as
10247 a save will be done later, if needed. */
10248 t = plus_constant (Pmode, stack_pointer_rtx, -UNITS_PER_WORD);
10249 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
10250 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
10251 RTX_FRAME_RELATED_P (insn) = 1;
10254 /* Emit prologue code to adjust the stack alignment and set up DRAP, in case
10255 DRAP is needed and stack realignment is really needed after reload. */
10256 if (stack_realign_drap)
10258 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10260 /* Only need to push parameter pointer reg if it is caller saved. */
10261 if (!call_used_regs[REGNO (crtl->drap_reg)])
10263 /* Push arg pointer reg */
10264 insn = emit_insn (gen_push (crtl->drap_reg));
10265 RTX_FRAME_RELATED_P (insn) = 1;
10268 /* Grab the argument pointer. */
10269 t = plus_constant (Pmode, stack_pointer_rtx, m->fs.sp_offset);
10270 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10271 RTX_FRAME_RELATED_P (insn) = 1;
10272 m->fs.cfa_reg = crtl->drap_reg;
10273 m->fs.cfa_offset = 0;
10275 /* Align the stack. */
10276 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10277 stack_pointer_rtx,
10278 GEN_INT (-align_bytes)));
10279 RTX_FRAME_RELATED_P (insn) = 1;
10281 /* Replicate the return address on the stack so that return
10282 address can be reached via (argp - 1) slot. This is needed
10283 to implement macro RETURN_ADDR_RTX and intrinsic function
10284 expand_builtin_return_addr etc. */
10285 t = plus_constant (Pmode, crtl->drap_reg, -UNITS_PER_WORD);
10286 t = gen_frame_mem (word_mode, t);
10287 insn = emit_insn (gen_push (t));
10288 RTX_FRAME_RELATED_P (insn) = 1;
10290 /* For the purposes of frame and register save area addressing,
10291 we've started over with a new frame. */
10292 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
10293 m->fs.realigned = true;
10296 int_registers_saved = (frame.nregs == 0);
10297 sse_registers_saved = (frame.nsseregs == 0);
10299 if (frame_pointer_needed && !m->fs.fp_valid)
10301 /* Note: AT&T enter does NOT have reversed args. Enter is probably
10302 slower on all targets. Also sdb doesn't like it. */
10303 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
10304 RTX_FRAME_RELATED_P (insn) = 1;
10306 /* Push registers now, before setting the frame pointer
10307 on SEH target. */
10308 if (!int_registers_saved
10309 && TARGET_SEH
10310 && !frame.save_regs_using_mov)
10312 ix86_emit_save_regs ();
10313 int_registers_saved = true;
10314 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10317 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
10319 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
10320 RTX_FRAME_RELATED_P (insn) = 1;
10322 if (m->fs.cfa_reg == stack_pointer_rtx)
10323 m->fs.cfa_reg = hard_frame_pointer_rtx;
10324 m->fs.fp_offset = m->fs.sp_offset;
10325 m->fs.fp_valid = true;
10329 if (!int_registers_saved)
10331 /* If saving registers via PUSH, do so now. */
10332 if (!frame.save_regs_using_mov)
10334 ix86_emit_save_regs ();
10335 int_registers_saved = true;
10336 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
10339 /* When using the red zone we may start saving registers before allocating
10340 the stack frame, saving one cycle of the prologue. However, avoid
10341 doing this if we have to probe the stack; at least on x86_64 the
10342 stack probe can turn into a call that clobbers a red zone location. */
10343 else if (ix86_using_red_zone ()
10344 && (! TARGET_STACK_PROBE
10345 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
10347 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10348 int_registers_saved = true;
10352 if (stack_realign_fp)
10354 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
10355 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
10357 /* The computation of the size of the re-aligned stack frame means
10358 that we must allocate the size of the register save area before
10359 performing the actual alignment. Otherwise we cannot guarantee
10360 that there's enough storage above the realignment point. */
10361 if (m->fs.sp_offset != frame.sse_reg_save_offset)
10362 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10363 GEN_INT (m->fs.sp_offset
10364 - frame.sse_reg_save_offset),
10365 -1, false);
10367 /* Align the stack. */
10368 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
10369 stack_pointer_rtx,
10370 GEN_INT (-align_bytes)));
10372 /* For the purposes of register save area addressing, the stack
10373 pointer is no longer valid. As for the value of sp_offset,
10374 see ix86_compute_frame_layout, which we need to match in order
10375 to pass verification of stack_pointer_offset at the end. */
10376 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10377 m->fs.sp_valid = false;
10380 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10382 if (flag_stack_usage_info)
10384 /* We start to count from ARG_POINTER. */
10385 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10387 /* If it was realigned, take into account the fake frame. */
10388 if (stack_realign_drap)
10390 if (ix86_static_chain_on_stack)
10391 stack_size += UNITS_PER_WORD;
10393 if (!call_used_regs[REGNO (crtl->drap_reg)])
10394 stack_size += UNITS_PER_WORD;
10396 /* This over-estimates by 1 minimal-stack-alignment-unit but
10397 mitigates that by counting in the new return address slot. */
10398 current_function_dynamic_stack_size
10399 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10402 current_function_static_stack_size = stack_size;
10405 /* On SEH target with very large frame size, allocate an area to save
10406 SSE registers (as the very large allocation won't be described). */
10407 if (TARGET_SEH
10408 && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE
10409 && !sse_registers_saved)
10411 HOST_WIDE_INT sse_size =
10412 frame.sse_reg_save_offset - frame.reg_save_offset;
10414 gcc_assert (int_registers_saved);
10416 /* No need to do stack checking as the area will be immediately
10417 written. */
10418 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10419 GEN_INT (-sse_size), -1,
10420 m->fs.cfa_reg == stack_pointer_rtx);
10421 allocate -= sse_size;
10422 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10423 sse_registers_saved = true;
10426 /* The stack has already been decremented by the instruction calling us
10427 so probe if the size is non-negative to preserve the protection area. */
10428 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10430 /* We expect the registers to be saved when probes are used. */
10431 gcc_assert (int_registers_saved);
10433 if (STACK_CHECK_MOVING_SP)
10435 ix86_adjust_stack_and_probe (allocate);
10436 allocate = 0;
10438 else
10440 HOST_WIDE_INT size = allocate;
10442 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10443 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10445 if (TARGET_STACK_PROBE)
10446 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10447 else
10448 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10452 if (allocate == 0)
10454 else if (!ix86_target_stack_probe ()
10455 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10457 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10458 GEN_INT (-allocate), -1,
10459 m->fs.cfa_reg == stack_pointer_rtx);
10461 else
10463 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10464 rtx r10 = NULL;
10465 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10466 const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx);
10467 bool eax_live = false;
10468 bool r10_live = false;
10470 if (TARGET_64BIT)
10471 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10472 if (!TARGET_64BIT_MS_ABI)
10473 eax_live = ix86_eax_live_at_start_p ();
10475 /* Note that SEH directives need to continue tracking the stack
10476 pointer even after the frame pointer has been set up. */
10477 if (eax_live)
10479 insn = emit_insn (gen_push (eax));
10480 allocate -= UNITS_PER_WORD;
10481 if (sp_is_cfa_reg || TARGET_SEH)
10483 if (sp_is_cfa_reg)
10484 m->fs.cfa_offset += UNITS_PER_WORD;
10485 RTX_FRAME_RELATED_P (insn) = 1;
10489 if (r10_live)
10491 r10 = gen_rtx_REG (Pmode, R10_REG);
10492 insn = emit_insn (gen_push (r10));
10493 allocate -= UNITS_PER_WORD;
10494 if (sp_is_cfa_reg || TARGET_SEH)
10496 if (sp_is_cfa_reg)
10497 m->fs.cfa_offset += UNITS_PER_WORD;
10498 RTX_FRAME_RELATED_P (insn) = 1;
10502 emit_move_insn (eax, GEN_INT (allocate));
10503 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10505 /* Use the fact that AX still contains ALLOCATE. */
10506 adjust_stack_insn = (Pmode == DImode
10507 ? gen_pro_epilogue_adjust_stack_di_sub
10508 : gen_pro_epilogue_adjust_stack_si_sub);
10510 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10511 stack_pointer_rtx, eax));
10513 if (sp_is_cfa_reg || TARGET_SEH)
10515 if (sp_is_cfa_reg)
10516 m->fs.cfa_offset += allocate;
10517 RTX_FRAME_RELATED_P (insn) = 1;
10518 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10519 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10520 plus_constant (Pmode, stack_pointer_rtx,
10521 -allocate)));
10523 m->fs.sp_offset += allocate;
10525 /* Use stack_pointer_rtx for relative addressing so that code
10526 works for realigned stack, too. */
10527 if (r10_live && eax_live)
10529 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
10530 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
10531 gen_frame_mem (word_mode, t));
10532 t = plus_constant (Pmode, t, UNITS_PER_WORD);
10533 emit_move_insn (gen_rtx_REG (word_mode, AX_REG),
10534 gen_frame_mem (word_mode, t));
10536 else if (eax_live || r10_live)
10538 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, eax);
10539 emit_move_insn (gen_rtx_REG (word_mode,
10540 (eax_live ? AX_REG : R10_REG)),
10541 gen_frame_mem (word_mode, t));
10544 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10546 /* If we haven't already set up the frame pointer, do so now. */
10547 if (frame_pointer_needed && !m->fs.fp_valid)
10549 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10550 GEN_INT (frame.stack_pointer_offset
10551 - frame.hard_frame_pointer_offset));
10552 insn = emit_insn (insn);
10553 RTX_FRAME_RELATED_P (insn) = 1;
10554 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10556 if (m->fs.cfa_reg == stack_pointer_rtx)
10557 m->fs.cfa_reg = hard_frame_pointer_rtx;
10558 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10559 m->fs.fp_valid = true;
10562 if (!int_registers_saved)
10563 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10564 if (!sse_registers_saved)
10565 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10567 pic_reg_used = false;
10568 if (pic_offset_table_rtx
10569 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10570 || crtl->profile))
10572 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10574 if (alt_pic_reg_used != INVALID_REGNUM)
10575 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10577 pic_reg_used = true;
10580 if (pic_reg_used)
10582 if (TARGET_64BIT)
10584 if (ix86_cmodel == CM_LARGE_PIC)
10586 rtx label, tmp_reg;
10588 gcc_assert (Pmode == DImode);
10589 label = gen_label_rtx ();
10590 emit_label (label);
10591 LABEL_PRESERVE_P (label) = 1;
10592 tmp_reg = gen_rtx_REG (Pmode, R11_REG);
10593 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10594 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx,
10595 label));
10596 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10597 insn = emit_insn (ix86_gen_add3 (pic_offset_table_rtx,
10598 pic_offset_table_rtx, tmp_reg));
10600 else
10601 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10603 else
10605 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10606 RTX_FRAME_RELATED_P (insn) = 1;
10607 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
10611 /* In the pic_reg_used case, make sure that the got load isn't deleted
10612 when mcount needs it. Blockage to avoid call movement across mcount
10613 call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
10614 note. */
10615 if (crtl->profile && !flag_fentry && pic_reg_used)
10616 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10618 if (crtl->drap_reg && !crtl->stack_realign_needed)
10620 /* vDRAP is set up, but after reload it turns out stack realignment
10621 isn't necessary; here we emit prologue code to set up DRAP
10622 without the stack realignment adjustment. */
10623 t = choose_baseaddr (0);
10624 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10627 /* Prevent instructions from being scheduled into register save push
10628 sequence when access to the redzone area is done through frame pointer.
10629 The offset between the frame pointer and the stack pointer is calculated
10630 relative to the value of the stack pointer at the end of the function
10631 prologue, and moving instructions that access redzone area via frame
10632 pointer inside push sequence violates this assumption. */
10633 if (frame_pointer_needed && frame.red_zone_size)
10634 emit_insn (gen_memory_blockage ());
10636 /* Emit cld instruction if stringops are used in the function. */
10637 if (TARGET_CLD && ix86_current_function_needs_cld)
10638 emit_insn (gen_cld ());
10640 /* SEH requires that the prologue end within 256 bytes of the start of
10641 the function. Prevent instruction schedules that would extend that.
10642 Further, prevent alloca modifications to the stack pointer from being
10643 combined with prologue modifications. */
10644 if (TARGET_SEH)
10645 emit_insn (gen_prologue_use (stack_pointer_rtx));
10648 /* Emit code to restore REG using a POP insn. */
10650 static void
10651 ix86_emit_restore_reg_using_pop (rtx reg)
10653 struct machine_function *m = cfun->machine;
10654 rtx insn = emit_insn (gen_pop (reg));
10656 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10657 m->fs.sp_offset -= UNITS_PER_WORD;
10659 if (m->fs.cfa_reg == crtl->drap_reg
10660 && REGNO (reg) == REGNO (crtl->drap_reg))
10662 /* Previously we'd represented the CFA as an expression
10663 like *(%ebp - 8). We've just popped that value from
10664 the stack, which means we need to reset the CFA to
10665 the drap register. This will remain until we restore
10666 the stack pointer. */
10667 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10668 RTX_FRAME_RELATED_P (insn) = 1;
10670 /* This means that the DRAP register is valid for addressing too. */
10671 m->fs.drap_valid = true;
10672 return;
10675 if (m->fs.cfa_reg == stack_pointer_rtx)
10677 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
10678 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10679 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10680 RTX_FRAME_RELATED_P (insn) = 1;
10682 m->fs.cfa_offset -= UNITS_PER_WORD;
10685 /* When the frame pointer is the CFA, and we pop it, we are
10686 swapping back to the stack pointer as the CFA. This happens
10687 for stack frames that don't allocate other data, so we assume
10688 the stack pointer is now pointing at the return address, i.e.
10689 the function entry state, which makes the offset 1 word. */
10690 if (reg == hard_frame_pointer_rtx)
10692 m->fs.fp_valid = false;
10693 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10695 m->fs.cfa_reg = stack_pointer_rtx;
10696 m->fs.cfa_offset -= UNITS_PER_WORD;
10698 add_reg_note (insn, REG_CFA_DEF_CFA,
10699 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10700 GEN_INT (m->fs.cfa_offset)));
10701 RTX_FRAME_RELATED_P (insn) = 1;
10706 /* Emit code to restore saved registers using POP insns. */
10708 static void
10709 ix86_emit_restore_regs_using_pop (void)
10711 unsigned int regno;
10713 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10714 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10715 ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno));
10718 /* Emit code and notes for the LEAVE instruction. */
10720 static void
10721 ix86_emit_leave (void)
10723 struct machine_function *m = cfun->machine;
10724 rtx insn = emit_insn (ix86_gen_leave ());
10726 ix86_add_queued_cfa_restore_notes (insn);
10728 gcc_assert (m->fs.fp_valid);
10729 m->fs.sp_valid = true;
10730 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10731 m->fs.fp_valid = false;
10733 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10735 m->fs.cfa_reg = stack_pointer_rtx;
10736 m->fs.cfa_offset = m->fs.sp_offset;
10738 add_reg_note (insn, REG_CFA_DEF_CFA,
10739 plus_constant (Pmode, stack_pointer_rtx,
10740 m->fs.sp_offset));
10741 RTX_FRAME_RELATED_P (insn) = 1;
10743 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
10744 m->fs.fp_offset);
10747 /* Emit code to restore saved registers using MOV insns.
10748 First register is restored from CFA - CFA_OFFSET. */
10749 static void
10750 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10751 bool maybe_eh_return)
10753 struct machine_function *m = cfun->machine;
10754 unsigned int regno;
10756 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10757 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10759 rtx reg = gen_rtx_REG (word_mode, regno);
10760 rtx insn, mem;
10762 mem = choose_baseaddr (cfa_offset);
10763 mem = gen_frame_mem (word_mode, mem);
10764 insn = emit_move_insn (reg, mem);
10766 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
10768 /* Previously we'd represented the CFA as an expression
10769 like *(%ebp - 8). We've just popped that value from
10770 the stack, which means we need to reset the CFA to
10771 the drap register. This will remain until we restore
10772 the stack pointer. */
10773 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10774 RTX_FRAME_RELATED_P (insn) = 1;
10776 /* This means that the DRAP register is valid for addressing. */
10777 m->fs.drap_valid = true;
10779 else
10780 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10782 cfa_offset -= UNITS_PER_WORD;
10786 /* Emit code to restore saved SSE registers using MOV insns.
10787 First register is restored from CFA - CFA_OFFSET. */
10788 static void
10789 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10790 bool maybe_eh_return)
10792 unsigned int regno;
10794 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10795 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10797 rtx reg = gen_rtx_REG (V4SFmode, regno);
10798 rtx mem;
10800 mem = choose_baseaddr (cfa_offset);
10801 mem = gen_rtx_MEM (V4SFmode, mem);
10802 set_mem_align (mem, 128);
10803 emit_move_insn (reg, mem);
10805 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10807 cfa_offset -= 16;
10811 /* Restore function stack, frame, and registers. */
10813 void
10814 ix86_expand_epilogue (int style)
10816 struct machine_function *m = cfun->machine;
10817 struct machine_frame_state frame_state_save = m->fs;
10818 struct ix86_frame frame;
10819 bool restore_regs_via_mov;
10820 bool using_drap;
10822 ix86_finalize_stack_realign_flags ();
10823 ix86_compute_frame_layout (&frame);
10825 m->fs.sp_valid = (!frame_pointer_needed
10826 || (crtl->sp_is_unchanging
10827 && !stack_realign_fp));
10828 gcc_assert (!m->fs.sp_valid
10829 || m->fs.sp_offset == frame.stack_pointer_offset);
10831 /* The FP must be valid if the frame pointer is present. */
10832 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10833 gcc_assert (!m->fs.fp_valid
10834 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10836 /* We must have *some* valid pointer to the stack frame. */
10837 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10839 /* The DRAP is never valid at this point. */
10840 gcc_assert (!m->fs.drap_valid);
10842 /* See the comment about red zone and frame
10843 pointer usage in ix86_expand_prologue. */
10844 if (frame_pointer_needed && frame.red_zone_size)
10845 emit_insn (gen_memory_blockage ());
10847 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10848 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10850 /* Determine the CFA offset of the end of the red-zone. */
10851 m->fs.red_zone_offset = 0;
10852 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10854 /* The red-zone begins below the return address. */
10855 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10857 /* When the register save area is in the aligned portion of
10858 the stack, determine the maximum runtime displacement that
10859 matches up with the aligned frame. */
10860 if (stack_realign_drap)
10861 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10862 + UNITS_PER_WORD);
10865 /* Special care must be taken for the normal return case of a function
10866 using eh_return: the eax and edx registers are marked as saved, but
10867 not restored along this path. Adjust the save location to match. */
10868 if (crtl->calls_eh_return && style != 2)
10869 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10871 /* EH_RETURN requires the use of moves to function properly. */
10872 if (crtl->calls_eh_return)
10873 restore_regs_via_mov = true;
10874 /* SEH requires the use of pops to identify the epilogue. */
10875 else if (TARGET_SEH)
10876 restore_regs_via_mov = false;
10877 /* If we're only restoring one register and sp is not valid, then
10878 use a move instruction to restore the register, since it's
10879 less work than reloading sp and popping the register. */
10880 else if (!m->fs.sp_valid && frame.nregs <= 1)
10881 restore_regs_via_mov = true;
10882 else if (TARGET_EPILOGUE_USING_MOVE
10883 && cfun->machine->use_fast_prologue_epilogue
10884 && (frame.nregs > 1
10885 || m->fs.sp_offset != frame.reg_save_offset))
10886 restore_regs_via_mov = true;
10887 else if (frame_pointer_needed
10888 && !frame.nregs
10889 && m->fs.sp_offset != frame.reg_save_offset)
10890 restore_regs_via_mov = true;
10891 else if (frame_pointer_needed
10892 && TARGET_USE_LEAVE
10893 && cfun->machine->use_fast_prologue_epilogue
10894 && frame.nregs == 1)
10895 restore_regs_via_mov = true;
10896 else
10897 restore_regs_via_mov = false;
10899 if (restore_regs_via_mov || frame.nsseregs)
10901 /* Ensure that the entire register save area is addressable via
10902 the stack pointer, if we will restore via sp. */
10903 if (TARGET_64BIT
10904 && m->fs.sp_offset > 0x7fffffff
10905 && !(m->fs.fp_valid || m->fs.drap_valid)
10906 && (frame.nsseregs + frame.nregs) != 0)
10908 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10909 GEN_INT (m->fs.sp_offset
10910 - frame.sse_reg_save_offset),
10911 style,
10912 m->fs.cfa_reg == stack_pointer_rtx);
10916 /* If there are any SSE registers to restore, then we have to do it
10917 via moves, since there's obviously no pop for SSE regs. */
10918 if (frame.nsseregs)
10919 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10920 style == 2);
10922 if (restore_regs_via_mov)
10924 rtx t;
10926 if (frame.nregs)
10927 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10929 /* eh_return epilogues need %ecx added to the stack pointer. */
10930 if (style == 2)
10932 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10934 /* Stack align doesn't work with eh_return. */
10935 gcc_assert (!stack_realign_drap);
10936 /* Neither do regparm nested functions. */
10937 gcc_assert (!ix86_static_chain_on_stack);
10939 if (frame_pointer_needed)
10941 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10942 t = plus_constant (Pmode, t, m->fs.fp_offset - UNITS_PER_WORD);
10943 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10945 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10946 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10948 /* Note that we use SA as a temporary CFA, as the return
10949 address is at the proper place relative to it. We
10950 pretend this happens at the FP restore insn because
10951 prior to this insn the FP would be stored at the wrong
10952 offset relative to SA, and after this insn we have no
10953 other reasonable register to use for the CFA. We don't
10954 bother resetting the CFA to the SP for the duration of
10955 the return insn. */
10956 add_reg_note (insn, REG_CFA_DEF_CFA,
10957 plus_constant (Pmode, sa, UNITS_PER_WORD));
10958 ix86_add_queued_cfa_restore_notes (insn);
10959 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10960 RTX_FRAME_RELATED_P (insn) = 1;
10962 m->fs.cfa_reg = sa;
10963 m->fs.cfa_offset = UNITS_PER_WORD;
10964 m->fs.fp_valid = false;
10966 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10967 const0_rtx, style, false);
10969 else
10971 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10972 t = plus_constant (Pmode, t, m->fs.sp_offset - UNITS_PER_WORD);
10973 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10974 ix86_add_queued_cfa_restore_notes (insn);
10976 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10977 if (m->fs.cfa_offset != UNITS_PER_WORD)
10979 m->fs.cfa_offset = UNITS_PER_WORD;
10980 add_reg_note (insn, REG_CFA_DEF_CFA,
10981 plus_constant (Pmode, stack_pointer_rtx,
10982 UNITS_PER_WORD));
10983 RTX_FRAME_RELATED_P (insn) = 1;
10986 m->fs.sp_offset = UNITS_PER_WORD;
10987 m->fs.sp_valid = true;
10990 else
10992 /* SEH requires that the function end with (1) a stack adjustment
10993 if necessary, (2) a sequence of pops, and (3) a return or
10994 jump instruction. Prevent insns from the function body from
10995 being scheduled into this sequence. */
10996 if (TARGET_SEH)
10998 /* Prevent a catch region from being adjacent to the standard
10999 epilogue sequence. Unfortunately, neither crtl->uses_eh_lsda nor
11000 several other flags that would be interesting to test are
11001 set up yet. */
11002 if (flag_non_call_exceptions)
11003 emit_insn (gen_nops (const1_rtx));
11004 else
11005 emit_insn (gen_blockage ());
11008 /* The first step is to deallocate the stack frame so that we can
11009 pop the registers. Also do it on SEH targets for a very large
11010 frame, as the emitted instructions aren't allowed by the ABI in
11011 epilogues. */
11012 if (!m->fs.sp_valid
11013 || (TARGET_SEH
11014 && (m->fs.sp_offset - frame.reg_save_offset
11015 >= SEH_MAX_FRAME_SIZE)))
11017 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
11018 GEN_INT (m->fs.fp_offset
11019 - frame.reg_save_offset),
11020 style, false);
11022 else if (m->fs.sp_offset != frame.reg_save_offset)
11024 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11025 GEN_INT (m->fs.sp_offset
11026 - frame.reg_save_offset),
11027 style,
11028 m->fs.cfa_reg == stack_pointer_rtx);
11031 ix86_emit_restore_regs_using_pop ();
11034 /* If we used a frame pointer and haven't already got rid of it,
11035 then do so now. */
11036 if (m->fs.fp_valid)
11038 /* If the stack pointer is valid and pointing at the frame
11039 pointer store address, then we only need a pop. */
11040 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
11041 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11042 /* Leave results in shorter dependency chains on CPUs that are
11043 able to grok it fast. */
11044 else if (TARGET_USE_LEAVE
11045 || optimize_function_for_size_p (cfun)
11046 || !cfun->machine->use_fast_prologue_epilogue)
11047 ix86_emit_leave ();
11048 else
11050 pro_epilogue_adjust_stack (stack_pointer_rtx,
11051 hard_frame_pointer_rtx,
11052 const0_rtx, style, !using_drap);
11053 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
11057 if (using_drap)
11059 int param_ptr_offset = UNITS_PER_WORD;
11060 rtx insn;
11062 gcc_assert (stack_realign_drap);
11064 if (ix86_static_chain_on_stack)
11065 param_ptr_offset += UNITS_PER_WORD;
11066 if (!call_used_regs[REGNO (crtl->drap_reg)])
11067 param_ptr_offset += UNITS_PER_WORD;
11069 insn = emit_insn (gen_rtx_SET
11070 (VOIDmode, stack_pointer_rtx,
11071 gen_rtx_PLUS (Pmode,
11072 crtl->drap_reg,
11073 GEN_INT (-param_ptr_offset))));
11074 m->fs.cfa_reg = stack_pointer_rtx;
11075 m->fs.cfa_offset = param_ptr_offset;
11076 m->fs.sp_offset = param_ptr_offset;
11077 m->fs.realigned = false;
11079 add_reg_note (insn, REG_CFA_DEF_CFA,
11080 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11081 GEN_INT (param_ptr_offset)));
11082 RTX_FRAME_RELATED_P (insn) = 1;
11084 if (!call_used_regs[REGNO (crtl->drap_reg)])
11085 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
11088 /* At this point the stack pointer must be valid, and we must have
11089 restored all of the registers. We may not have deallocated the
11090 entire stack frame. We've delayed this until now because it may
11091 be possible to merge the local stack deallocation with the
11092 deallocation forced by ix86_static_chain_on_stack. */
11093 gcc_assert (m->fs.sp_valid);
11094 gcc_assert (!m->fs.fp_valid);
11095 gcc_assert (!m->fs.realigned);
11096 if (m->fs.sp_offset != UNITS_PER_WORD)
11098 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11099 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
11100 style, true);
11102 else
11103 ix86_add_queued_cfa_restore_notes (get_last_insn ());
11105 /* Sibcall epilogues don't want a return instruction. */
11106 if (style == 0)
11108 m->fs = frame_state_save;
11109 return;
11112 if (crtl->args.pops_args && crtl->args.size)
11114 rtx popc = GEN_INT (crtl->args.pops_args);
11116 /* The i386 can only pop 64K bytes. If asked to pop more, pop the return
11117 address, do an explicit add, and jump indirectly to the caller. */
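/* An illustrative sketch (not the literal emitted RTL) of the 32-bit
   sequence produced below when more than 64K bytes must be popped:

       popl  %ecx              ; pop the return address
       addl  $pops_args, %esp  ; explicitly release the argument area
       jmp   *%ecx             ; return indirectly to the caller  */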
11119 if (crtl->args.pops_args >= 65536)
11121 rtx ecx = gen_rtx_REG (SImode, CX_REG);
11122 rtx insn;
11124 /* There is no "pascal" calling convention in any 64bit ABI. */
11125 gcc_assert (!TARGET_64BIT);
11127 insn = emit_insn (gen_pop (ecx));
11128 m->fs.cfa_offset -= UNITS_PER_WORD;
11129 m->fs.sp_offset -= UNITS_PER_WORD;
11131 rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
11132 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
11133 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
11134 add_reg_note (insn, REG_CFA_REGISTER,
11135 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
11136 RTX_FRAME_RELATED_P (insn) = 1;
11138 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
11139 popc, -1, true);
11140 emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11142 else
11143 emit_jump_insn (gen_simple_return_pop_internal (popc));
11145 else
11146 emit_jump_insn (gen_simple_return_internal ());
11148 /* Restore the state back to the state from the prologue,
11149 so that it's correct for the next epilogue. */
11150 m->fs = frame_state_save;
11153 /* Reset from the function's potential modifications. */
11155 static void
11156 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
11157 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
11159 if (pic_offset_table_rtx)
11160 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
11161 #if TARGET_MACHO
11162 /* Mach-O doesn't support labels at the end of objects, so if
11163 it looks like we might want one, insert a NOP. */
11165 rtx insn = get_last_insn ();
11166 rtx deleted_debug_label = NULL_RTX;
11167 while (insn
11168 && NOTE_P (insn)
11169 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
11171 /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
11172 notes only, instead set their CODE_LABEL_NUMBER to -1,
11173 otherwise there would be code generation differences
11174 in between -g and -g0. */
11175 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11176 deleted_debug_label = insn;
11177 insn = PREV_INSN (insn);
11179 if (insn
11180 && (LABEL_P (insn)
11181 || (NOTE_P (insn)
11182 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
11183 fputs ("\tnop\n", file);
11184 else if (deleted_debug_label)
11185 for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
11186 if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
11187 CODE_LABEL_NUMBER (insn) = -1;
11189 #endif
11193 /* Return a scratch register to use in the split stack prologue. The
11194 split stack prologue is used for -fsplit-stack. It consists of the first
11195 instructions in the function, even before the regular prologue.
11196 The scratch register can be any caller-saved register which is not
11197 used for parameters or for the static chain. */
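/* In brief (mirroring the code below): in 64-bit mode the scratch register
   is %r11; for fastcall it is %eax (a nested fastcall function is not
   supported); for thiscall it is %edx, or %eax when a static chain is
   present; with fewer than 3 register parameters it is %ecx, or %edx when
   a static chain occupies %ecx (2 register parameters plus a static chain
   is not supported); with 3 register parameters no register is free and a
   sorry () diagnostic is issued.  */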
11199 static unsigned int
11200 split_stack_prologue_scratch_regno (void)
11202 if (TARGET_64BIT)
11203 return R11_REG;
11204 else
11206 bool is_fastcall, is_thiscall;
11207 int regparm;
11209 is_fastcall = (lookup_attribute ("fastcall",
11210 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11211 != NULL);
11212 is_thiscall = (lookup_attribute ("thiscall",
11213 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
11214 != NULL);
11215 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
11217 if (is_fastcall)
11219 if (DECL_STATIC_CHAIN (cfun->decl))
11221 sorry ("-fsplit-stack does not support fastcall with "
11222 "nested function");
11223 return INVALID_REGNUM;
11225 return AX_REG;
11227 else if (is_thiscall)
11229 if (!DECL_STATIC_CHAIN (cfun->decl))
11230 return DX_REG;
11231 return AX_REG;
11233 else if (regparm < 3)
11235 if (!DECL_STATIC_CHAIN (cfun->decl))
11236 return CX_REG;
11237 else
11239 if (regparm >= 2)
11241 sorry ("-fsplit-stack does not support 2 register "
11242 " parameters for a nested function");
11243 return INVALID_REGNUM;
11245 return DX_REG;
11248 else
11250 /* FIXME: We could make this work by pushing a register
11251 around the addition and comparison. */
11252 sorry ("-fsplit-stack does not support 3 register parameters");
11253 return INVALID_REGNUM;
11258 /* A SYMBOL_REF for the function which allocates new stack space for
11259 -fsplit-stack. */
11261 static GTY(()) rtx split_stack_fn;
11263 /* A SYMBOL_REF for the __morestack variant used with the large
11264 code model. */
11266 static GTY(()) rtx split_stack_fn_large;
11268 /* Handle -fsplit-stack. These are the first instructions in the
11269 function, even before the regular prologue. */
11271 void
11272 ix86_expand_split_stack_prologue (void)
11274 struct ix86_frame frame;
11275 HOST_WIDE_INT allocate;
11276 unsigned HOST_WIDE_INT args_size;
11277 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
11278 rtx scratch_reg = NULL_RTX;
11279 rtx varargs_label = NULL_RTX;
11280 rtx fn;
11282 gcc_assert (flag_split_stack && reload_completed);
11284 ix86_finalize_stack_realign_flags ();
11285 ix86_compute_frame_layout (&frame);
11286 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
11288 /* This is the label we will branch to if we have enough stack
11289 space. We expect the basic block reordering pass to reverse this
11290 branch if optimizing, so that we branch in the unlikely case. */
11291 label = gen_label_rtx ();
11293 /* We need to compare the stack pointer minus the frame size with
11294 the stack boundary in the TCB. The stack boundary always gives
11295 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
11296 can compare directly. Otherwise we need to do an addition. */
11298 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
11299 UNSPEC_STACK_CHECK);
11300 limit = gen_rtx_CONST (Pmode, limit);
11301 limit = gen_rtx_MEM (Pmode, limit);
11302 if (allocate < SPLIT_STACK_AVAILABLE)
11303 current = stack_pointer_rtx;
11304 else
11306 unsigned int scratch_regno;
11307 rtx offset;
11309 /* We need a scratch register to hold the stack pointer minus
11310 the required frame size. Since this is the very start of the
11311 function, the scratch register can be any caller-saved
11312 register which is not used for parameters. */
11313 offset = GEN_INT (- allocate);
11314 scratch_regno = split_stack_prologue_scratch_regno ();
11315 if (scratch_regno == INVALID_REGNUM)
11316 return;
11317 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11318 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
11320 /* We don't use ix86_gen_add3 in this case because it will
11321 want to split to lea, but when not optimizing the insn
11322 will not be split after this point. */
11323 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11324 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11325 offset)));
11327 else
11329 emit_move_insn (scratch_reg, offset);
11330 emit_insn (ix86_gen_add3 (scratch_reg, scratch_reg,
11331 stack_pointer_rtx));
11333 current = scratch_reg;
11336 ix86_expand_branch (GEU, current, limit, label);
11337 jump_insn = get_last_insn ();
11338 JUMP_LABEL (jump_insn) = label;
11340 /* Mark the jump as very likely to be taken. */
11341 add_reg_note (jump_insn, REG_BR_PROB,
11342 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
11344 if (split_stack_fn == NULL_RTX)
11345 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
11346 fn = split_stack_fn;
11348 /* Get more stack space. We pass in the desired stack space and the
11349 size of the arguments to copy to the new stack. In 32-bit mode
11350 we push the parameters; __morestack will return on a new stack
11351 anyhow. In 64-bit mode we pass the parameters in r10 and
11352 r11. */
11353 allocate_rtx = GEN_INT (allocate);
11354 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
11355 call_fusage = NULL_RTX;
11356 if (TARGET_64BIT)
11358 rtx reg10, reg11;
11360 reg10 = gen_rtx_REG (Pmode, R10_REG);
11361 reg11 = gen_rtx_REG (Pmode, R11_REG);
11363 /* If this function uses a static chain, it will be in %r10.
11364 Preserve it across the call to __morestack. */
11365 if (DECL_STATIC_CHAIN (cfun->decl))
11367 rtx rax;
11369 rax = gen_rtx_REG (word_mode, AX_REG);
11370 emit_move_insn (rax, gen_rtx_REG (word_mode, R10_REG));
11371 use_reg (&call_fusage, rax);
11374 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
11376 HOST_WIDE_INT argval;
11378 gcc_assert (Pmode == DImode);
11379 /* When using the large model we need to load the address
11380 into a register, and we've run out of registers. So we
11381 switch to a different calling convention, and we call a
11382 different function: __morestack_large_model. We pass the
11383 argument size in the upper 32 bits of r10 and pass the
11384 frame size in the lower 32 bits. */
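/* For example (illustrative values only): with args_size == 0x20 and
   allocate == 0x1000, r10 is loaded with 0x0000002000001000, i.e.
   (args_size << 32) + allocate.  */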
11385 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
11386 gcc_assert ((args_size & 0xffffffff) == args_size);
11388 if (split_stack_fn_large == NULL_RTX)
11389 split_stack_fn_large =
11390 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
11392 if (ix86_cmodel == CM_LARGE_PIC)
11394 rtx label, x;
11396 label = gen_label_rtx ();
11397 emit_label (label);
11398 LABEL_PRESERVE_P (label) = 1;
11399 emit_insn (gen_set_rip_rex64 (reg10, label));
11400 emit_insn (gen_set_got_offset_rex64 (reg11, label));
11401 emit_insn (ix86_gen_add3 (reg10, reg10, reg11));
11402 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
11403 UNSPEC_GOT);
11404 x = gen_rtx_CONST (Pmode, x);
11405 emit_move_insn (reg11, x);
11406 x = gen_rtx_PLUS (Pmode, reg10, reg11);
11407 x = gen_const_mem (Pmode, x);
11408 emit_move_insn (reg11, x);
11410 else
11411 emit_move_insn (reg11, split_stack_fn_large);
11413 fn = reg11;
11415 argval = ((args_size << 16) << 16) + allocate;
11416 emit_move_insn (reg10, GEN_INT (argval));
11418 else
11420 emit_move_insn (reg10, allocate_rtx);
11421 emit_move_insn (reg11, GEN_INT (args_size));
11422 use_reg (&call_fusage, reg11);
11425 use_reg (&call_fusage, reg10);
11427 else
11429 emit_insn (gen_push (GEN_INT (args_size)));
11430 emit_insn (gen_push (allocate_rtx));
11432 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
11433 GEN_INT (UNITS_PER_WORD), constm1_rtx,
11434 NULL_RTX, false);
11435 add_function_usage_to (call_insn, call_fusage);
11437 /* In order to make call/return prediction work right, we now need
11438 to execute a return instruction. See
11439 libgcc/config/i386/morestack.S for the details on how this works.
11441 For flow purposes gcc must not see this as a return
11442 instruction--we need control flow to continue at the subsequent
11443 label. Therefore, we use an unspec. */
11444 gcc_assert (crtl->args.pops_args < 65536);
11445 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11447 /* If we are in 64-bit mode and this function uses a static chain,
11448 we saved %r10 in %rax before calling __morestack.
11449 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11450 emit_move_insn (gen_rtx_REG (word_mode, R10_REG),
11451 gen_rtx_REG (word_mode, AX_REG));
11453 /* If this function calls va_start, we need to store a pointer to
11454 the arguments on the old stack, because they may not have been
11455 all copied to the new stack. At this point the old stack can be
11456 found at the frame pointer value used by __morestack, because
11457 __morestack has set that up before calling back to us. Here we
11458 store that pointer in a scratch register, and in
11459 ix86_expand_prologue we store the scratch register in a stack
11460 slot. */
11461 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11463 unsigned int scratch_regno;
11464 rtx frame_reg;
11465 int words;
11467 scratch_regno = split_stack_prologue_scratch_regno ();
11468 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11469 frame_reg = gen_rtx_REG (Pmode, BP_REG);
11471 /* 64-bit:
11472 fp -> old fp value
11473 return address within this function
11474 return address of caller of this function
11475 stack arguments
11476 So we add three words to get to the stack arguments.
11478 32-bit:
11479 fp -> old fp value
11480 return address within this function
11481 first argument to __morestack
11482 second argument to __morestack
11483 return address of caller of this function
11484 stack arguments
11485 So we add five words to get to the stack arguments.
11487 words = TARGET_64BIT ? 3 : 5;
11488 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11489 gen_rtx_PLUS (Pmode, frame_reg,
11490 GEN_INT (words * UNITS_PER_WORD))));
11492 varargs_label = gen_label_rtx ();
11493 emit_jump_insn (gen_jump (varargs_label));
11494 JUMP_LABEL (get_last_insn ()) = varargs_label;
11496 emit_barrier ();
11499 emit_label (label);
11500 LABEL_NUSES (label) = 1;
11502 /* If this function calls va_start, we now have to set the scratch
11503 register for the case where we do not call __morestack. In this
11504 case we need to set it based on the stack pointer. */
11505 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11507 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11508 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11509 GEN_INT (UNITS_PER_WORD))));
11511 emit_label (varargs_label);
11512 LABEL_NUSES (varargs_label) = 1;
11516 /* We may have to tell the dataflow pass that the split stack prologue
11517 is initializing a scratch register. */
11519 static void
11520 ix86_live_on_entry (bitmap regs)
11522 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11524 gcc_assert (flag_split_stack);
11525 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11529 /* Extract the parts of an RTL expression that is a valid memory address
11530 for an instruction. Return 0 if the structure of the address is
11531 grossly off. Return -1 if the address contains ASHIFT, so it is not
11532 strictly valid, but is still used for computing the length of an lea instruction. */
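/* For example (illustrative), the canonical address
       (plus:SI (plus:SI (mult:SI (reg:SI %ebx) (const_int 4))
			 (reg:SI %esi))
		(const_int 12))
   decomposes into base = %esi, index = %ebx, scale = 4 and disp = 12,
   i.e. the operand 12(%esi,%ebx,4) in AT&T syntax.  */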
11535 ix86_decompose_address (rtx addr, struct ix86_address *out)
11537 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11538 rtx base_reg, index_reg;
11539 HOST_WIDE_INT scale = 1;
11540 rtx scale_rtx = NULL_RTX;
11541 rtx tmp;
11542 int retval = 1;
11543 enum ix86_address_seg seg = SEG_DEFAULT;
11545 /* Allow zero-extended SImode addresses,
11546 they will be emitted with addr32 prefix. */
11547 if (TARGET_64BIT && GET_MODE (addr) == DImode)
11549 if (GET_CODE (addr) == ZERO_EXTEND
11550 && GET_MODE (XEXP (addr, 0)) == SImode)
11552 addr = XEXP (addr, 0);
11553 if (CONST_INT_P (addr))
11554 return 0;
11556 else if (GET_CODE (addr) == AND
11557 && const_32bit_mask (XEXP (addr, 1), DImode))
11559 addr = simplify_gen_subreg (SImode, XEXP (addr, 0), DImode, 0);
11560 if (addr == NULL_RTX)
11561 return 0;
11563 if (CONST_INT_P (addr))
11564 return 0;
11568 /* Allow SImode subregs of DImode addresses,
11569 they will be emitted with addr32 prefix. */
11570 if (TARGET_64BIT && GET_MODE (addr) == SImode)
11572 if (GET_CODE (addr) == SUBREG
11573 && GET_MODE (SUBREG_REG (addr)) == DImode)
11575 addr = SUBREG_REG (addr);
11576 if (CONST_INT_P (addr))
11577 return 0;
11581 if (REG_P (addr))
11582 base = addr;
11583 else if (GET_CODE (addr) == SUBREG)
11585 if (REG_P (SUBREG_REG (addr)))
11586 base = addr;
11587 else
11588 return 0;
11590 else if (GET_CODE (addr) == PLUS)
11592 rtx addends[4], op;
11593 int n = 0, i;
11595 op = addr;
11598 if (n >= 4)
11599 return 0;
11600 addends[n++] = XEXP (op, 1);
11601 op = XEXP (op, 0);
11603 while (GET_CODE (op) == PLUS);
11604 if (n >= 4)
11605 return 0;
11606 addends[n] = op;
11608 for (i = n; i >= 0; --i)
11610 op = addends[i];
11611 switch (GET_CODE (op))
11613 case MULT:
11614 if (index)
11615 return 0;
11616 index = XEXP (op, 0);
11617 scale_rtx = XEXP (op, 1);
11618 break;
11620 case ASHIFT:
11621 if (index)
11622 return 0;
11623 index = XEXP (op, 0);
11624 tmp = XEXP (op, 1);
11625 if (!CONST_INT_P (tmp))
11626 return 0;
11627 scale = INTVAL (tmp);
11628 if ((unsigned HOST_WIDE_INT) scale > 3)
11629 return 0;
11630 scale = 1 << scale;
11631 break;
11633 case ZERO_EXTEND:
11634 op = XEXP (op, 0);
11635 if (GET_CODE (op) != UNSPEC)
11636 return 0;
11637 /* FALLTHRU */
11639 case UNSPEC:
11640 if (XINT (op, 1) == UNSPEC_TP
11641 && TARGET_TLS_DIRECT_SEG_REFS
11642 && seg == SEG_DEFAULT)
11643 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11644 else
11645 return 0;
11646 break;
11648 case SUBREG:
11649 if (!REG_P (SUBREG_REG (op)))
11650 return 0;
11651 /* FALLTHRU */
11653 case REG:
11654 if (!base)
11655 base = op;
11656 else if (!index)
11657 index = op;
11658 else
11659 return 0;
11660 break;
11662 case CONST:
11663 case CONST_INT:
11664 case SYMBOL_REF:
11665 case LABEL_REF:
11666 if (disp)
11667 return 0;
11668 disp = op;
11669 break;
11671 default:
11672 return 0;
11676 else if (GET_CODE (addr) == MULT)
11678 index = XEXP (addr, 0); /* index*scale */
11679 scale_rtx = XEXP (addr, 1);
11681 else if (GET_CODE (addr) == ASHIFT)
11683 /* We're called for lea too, which implements ashift on occasion. */
11684 index = XEXP (addr, 0);
11685 tmp = XEXP (addr, 1);
11686 if (!CONST_INT_P (tmp))
11687 return 0;
11688 scale = INTVAL (tmp);
11689 if ((unsigned HOST_WIDE_INT) scale > 3)
11690 return 0;
11691 scale = 1 << scale;
11692 retval = -1;
11694 else
11695 disp = addr; /* displacement */
11697 if (index)
11699 if (REG_P (index))
11701 else if (GET_CODE (index) == SUBREG
11702 && REG_P (SUBREG_REG (index)))
11704 else
11705 return 0;
11708 /* Extract the integral value of scale. */
11709 if (scale_rtx)
11711 if (!CONST_INT_P (scale_rtx))
11712 return 0;
11713 scale = INTVAL (scale_rtx);
11716 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11717 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11719 /* Avoid useless 0 displacement. */
11720 if (disp == const0_rtx && (base || index))
11721 disp = NULL_RTX;
11723 /* Allow arg pointer and stack pointer as index if there is no scaling. */
11724 if (base_reg && index_reg && scale == 1
11725 && (index_reg == arg_pointer_rtx
11726 || index_reg == frame_pointer_rtx
11727 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11729 rtx tmp;
11730 tmp = base, base = index, index = tmp;
11731 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11734 /* Special case: %ebp cannot be encoded as a base without a displacement.
11735 Similarly %r13. */
11736 if (!disp
11737 && base_reg
11738 && (base_reg == hard_frame_pointer_rtx
11739 || base_reg == frame_pointer_rtx
11740 || base_reg == arg_pointer_rtx
11741 || (REG_P (base_reg)
11742 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11743 || REGNO (base_reg) == R13_REG))))
11744 disp = const0_rtx;
11746 /* Special case: on K6, [%esi] makes the instruction vector decoded.
11747 Avoid this by transforming to [%esi+0].
11748 Reload calls address legitimization without cfun defined, so we need
11749 to test cfun for being non-NULL. */
11750 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11751 && base_reg && !index_reg && !disp
11752 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11753 disp = const0_rtx;
11755 /* Special case: encode reg+reg instead of reg*2. */
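/* (A scaled index with no base can only be encoded together with a
   displacement -- see the next special case -- so reg+reg gives the
   shorter encoding.)  */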
11756 if (!base && index && scale == 2)
11757 base = index, base_reg = index_reg, scale = 1;
11759 /* Special case: scaling cannot be encoded without base or displacement. */
11760 if (!base && !disp && index && scale != 1)
11761 disp = const0_rtx;
11763 out->base = base;
11764 out->index = index;
11765 out->disp = disp;
11766 out->scale = scale;
11767 out->seg = seg;
11769 return retval;
11772 /* Return cost of the memory address x.
11773 For i386, it is better to use a complex address than let gcc copy
11774 the address into a reg and make a new pseudo. But not if the address
11775 requires two regs - that would mean more pseudos with longer
11776 lifetimes. */
11777 static int
11778 ix86_address_cost (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED,
11779 addr_space_t as ATTRIBUTE_UNUSED,
11780 bool speed ATTRIBUTE_UNUSED)
11782 struct ix86_address parts;
11783 int cost = 1;
11784 int ok = ix86_decompose_address (x, &parts);
11786 gcc_assert (ok);
11788 if (parts.base && GET_CODE (parts.base) == SUBREG)
11789 parts.base = SUBREG_REG (parts.base);
11790 if (parts.index && GET_CODE (parts.index) == SUBREG)
11791 parts.index = SUBREG_REG (parts.index);
11793 /* Attempt to minimize number of registers in the address. */
11794 if ((parts.base
11795 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11796 || (parts.index
11797 && (!REG_P (parts.index)
11798 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11799 cost++;
11801 if (parts.base
11802 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11803 && parts.index
11804 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11805 && parts.base != parts.index)
11806 cost++;
11808 /* The AMD K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
11809 since its predecode logic can't detect the length of instructions
11810 and it degenerates to vector decoding. Increase the cost of such
11811 addresses here. The penalty is at least 2 cycles. It may be worthwhile
11812 to split such addresses or even refuse such addresses at all.
11814 Following addressing modes are affected:
11815 [base+scale*index]
11816 [scale*index+disp]
11817 [base+index]
11819 The first and last case may be avoidable by explicitly coding the zero in
11820 the memory address, but I don't have an AMD K6 machine handy to check this
11821 theory. */
11823 if (TARGET_K6
11824 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11825 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11826 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11827 cost += 10;
11829 return cost;
11832 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
11833 this is used to form addresses to local data when -fPIC is in
11834 use. */
11836 static bool
11837 darwin_local_data_pic (rtx disp)
11839 return (GET_CODE (disp) == UNSPEC
11840 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11843 /* Determine if a given RTX is a valid constant. We already know this
11844 satisfies CONSTANT_P. */
11846 static bool
11847 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11849 switch (GET_CODE (x))
11851 case CONST:
11852 x = XEXP (x, 0);
11854 if (GET_CODE (x) == PLUS)
11856 if (!CONST_INT_P (XEXP (x, 1)))
11857 return false;
11858 x = XEXP (x, 0);
11861 if (TARGET_MACHO && darwin_local_data_pic (x))
11862 return true;
11864 /* Only some unspecs are valid as "constants". */
11865 if (GET_CODE (x) == UNSPEC)
11866 switch (XINT (x, 1))
11868 case UNSPEC_GOT:
11869 case UNSPEC_GOTOFF:
11870 case UNSPEC_PLTOFF:
11871 return TARGET_64BIT;
11872 case UNSPEC_TPOFF:
11873 case UNSPEC_NTPOFF:
11874 x = XVECEXP (x, 0, 0);
11875 return (GET_CODE (x) == SYMBOL_REF
11876 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11877 case UNSPEC_DTPOFF:
11878 x = XVECEXP (x, 0, 0);
11879 return (GET_CODE (x) == SYMBOL_REF
11880 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11881 default:
11882 return false;
11885 /* We must have drilled down to a symbol. */
11886 if (GET_CODE (x) == LABEL_REF)
11887 return true;
11888 if (GET_CODE (x) != SYMBOL_REF)
11889 return false;
11890 /* FALLTHRU */
11892 case SYMBOL_REF:
11893 /* TLS symbols are never valid. */
11894 if (SYMBOL_REF_TLS_MODEL (x))
11895 return false;
11897 /* DLLIMPORT symbols are never valid. */
11898 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11899 && SYMBOL_REF_DLLIMPORT_P (x))
11900 return false;
11902 #if TARGET_MACHO
11903 /* mdynamic-no-pic */
11904 if (MACHO_DYNAMIC_NO_PIC_P)
11905 return machopic_symbol_defined_p (x);
11906 #endif
11907 break;
11909 case CONST_DOUBLE:
11910 if (GET_MODE (x) == TImode
11911 && x != CONST0_RTX (TImode)
11912 && !TARGET_64BIT)
11913 return false;
11914 break;
11916 case CONST_VECTOR:
11917 if (!standard_sse_constant_p (x))
11918 return false;
11920 default:
11921 break;
11924 /* Otherwise we handle everything else in the move patterns. */
11925 return true;
11928 /* Determine if it's legal to put X into the constant pool. This
11929 is not possible for the address of thread-local symbols, which
11930 is checked above. */
11932 static bool
11933 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11935 /* We can always put integral constants and vectors in memory. */
11936 switch (GET_CODE (x))
11938 case CONST_INT:
11939 case CONST_DOUBLE:
11940 case CONST_VECTOR:
11941 return false;
11943 default:
11944 break;
11946 return !ix86_legitimate_constant_p (mode, x);
11950 /* Nonzero if the constant value X is a legitimate general operand
11951 when generating PIC code. It is given that flag_pic is on and
11952 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11954 bool
11955 legitimate_pic_operand_p (rtx x)
11957 rtx inner;
11959 switch (GET_CODE (x))
11961 case CONST:
11962 inner = XEXP (x, 0);
11963 if (GET_CODE (inner) == PLUS
11964 && CONST_INT_P (XEXP (inner, 1)))
11965 inner = XEXP (inner, 0);
11967 /* Only some unspecs are valid as "constants". */
11968 if (GET_CODE (inner) == UNSPEC)
11969 switch (XINT (inner, 1))
11971 case UNSPEC_GOT:
11972 case UNSPEC_GOTOFF:
11973 case UNSPEC_PLTOFF:
11974 return TARGET_64BIT;
11975 case UNSPEC_TPOFF:
11976 x = XVECEXP (inner, 0, 0);
11977 return (GET_CODE (x) == SYMBOL_REF
11978 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11979 case UNSPEC_MACHOPIC_OFFSET:
11980 return legitimate_pic_address_disp_p (x);
11981 default:
11982 return false;
11984 /* FALLTHRU */
11986 case SYMBOL_REF:
11987 case LABEL_REF:
11988 return legitimate_pic_address_disp_p (x);
11990 default:
11991 return true;
11995 /* Determine if a given CONST RTX is a valid memory displacement
11996 in PIC mode. */
11998 bool
11999 legitimate_pic_address_disp_p (rtx disp)
12001 bool saw_plus;
12003 /* In 64bit mode we can allow direct addresses of symbols and labels
12004 when they are not dynamic symbols. */
12005 if (TARGET_64BIT)
12007 rtx op0 = disp, op1;
12009 switch (GET_CODE (disp))
12011 case LABEL_REF:
12012 return true;
12014 case CONST:
12015 if (GET_CODE (XEXP (disp, 0)) != PLUS)
12016 break;
12017 op0 = XEXP (XEXP (disp, 0), 0);
12018 op1 = XEXP (XEXP (disp, 0), 1);
12019 if (!CONST_INT_P (op1)
12020 || INTVAL (op1) >= 16*1024*1024
12021 || INTVAL (op1) < -16*1024*1024)
12022 break;
12023 if (GET_CODE (op0) == LABEL_REF)
12024 return true;
12025 if (GET_CODE (op0) == CONST
12026 && GET_CODE (XEXP (op0, 0)) == UNSPEC
12027 && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
12028 return true;
12029 if (GET_CODE (op0) == UNSPEC
12030 && XINT (op0, 1) == UNSPEC_PCREL)
12031 return true;
12032 if (GET_CODE (op0) != SYMBOL_REF)
12033 break;
12034 /* FALLTHRU */
12036 case SYMBOL_REF:
12037 /* TLS references should always be enclosed in UNSPEC. */
12038 if (SYMBOL_REF_TLS_MODEL (op0))
12039 return false;
12040 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
12041 && ix86_cmodel != CM_LARGE_PIC)
12042 return true;
12043 break;
12045 default:
12046 break;
12049 if (GET_CODE (disp) != CONST)
12050 return false;
12051 disp = XEXP (disp, 0);
12053 if (TARGET_64BIT)
12055 /* It is unsafe to allow PLUS expressions. This limits the allowed distance
12056 of GOT tables. We should not need these anyway. */
12057 if (GET_CODE (disp) != UNSPEC
12058 || (XINT (disp, 1) != UNSPEC_GOTPCREL
12059 && XINT (disp, 1) != UNSPEC_GOTOFF
12060 && XINT (disp, 1) != UNSPEC_PCREL
12061 && XINT (disp, 1) != UNSPEC_PLTOFF))
12062 return false;
12064 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
12065 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
12066 return false;
12067 return true;
12070 saw_plus = false;
12071 if (GET_CODE (disp) == PLUS)
12073 if (!CONST_INT_P (XEXP (disp, 1)))
12074 return false;
12075 disp = XEXP (disp, 0);
12076 saw_plus = true;
12079 if (TARGET_MACHO && darwin_local_data_pic (disp))
12080 return true;
12082 if (GET_CODE (disp) != UNSPEC)
12083 return false;
12085 switch (XINT (disp, 1))
12087 case UNSPEC_GOT:
12088 if (saw_plus)
12089 return false;
12090 /* We need to check for both symbols and labels because VxWorks loads
12091 text labels with @GOT rather than @GOTOFF. See gotoff_operand for
12092 details. */
12093 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12094 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
12095 case UNSPEC_GOTOFF:
12096 /* Refuse GOTOFF in 64-bit mode since it is always 64-bit when used.
12097 While the ABI also specifies a 32-bit relocation, we don't produce it in
12098 the small PIC model at all. */
12099 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
12100 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
12101 && !TARGET_64BIT)
12102 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
12103 return false;
12104 case UNSPEC_GOTTPOFF:
12105 case UNSPEC_GOTNTPOFF:
12106 case UNSPEC_INDNTPOFF:
12107 if (saw_plus)
12108 return false;
12109 disp = XVECEXP (disp, 0, 0);
12110 return (GET_CODE (disp) == SYMBOL_REF
12111 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
12112 case UNSPEC_NTPOFF:
12113 disp = XVECEXP (disp, 0, 0);
12114 return (GET_CODE (disp) == SYMBOL_REF
12115 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
12116 case UNSPEC_DTPOFF:
12117 disp = XVECEXP (disp, 0, 0);
12118 return (GET_CODE (disp) == SYMBOL_REF
12119 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
12122 return false;
12125 /* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to
12126 replace the input X, or the original X if no replacement is called for.
12127 The output parameter *WIN is 1 if the calling macro should goto WIN,
12128 0 if it should not. */
12130 bool
12131 ix86_legitimize_reload_address (rtx x,
12132 enum machine_mode mode ATTRIBUTE_UNUSED,
12133 int opnum, int type,
12134 int ind_levels ATTRIBUTE_UNUSED)
12136 /* Reload can generate:
12138 (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
12139 (reg:DI 97))
12140 (reg:DI 2 cx))
12142 This RTX is rejected from ix86_legitimate_address_p due to
12143 non-strictness of base register 97. Following this rejection,
12144 reload pushes all three components into separate registers,
12145 creating invalid memory address RTX.
12147 Following code reloads only the invalid part of the
12148 memory address RTX. */
12150 if (GET_CODE (x) == PLUS
12151 && REG_P (XEXP (x, 1))
12152 && GET_CODE (XEXP (x, 0)) == PLUS
12153 && REG_P (XEXP (XEXP (x, 0), 1)))
12155 rtx base, index;
12156 bool something_reloaded = false;
12158 base = XEXP (XEXP (x, 0), 1);
12159 if (!REG_OK_FOR_BASE_STRICT_P (base))
12161 push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
12162 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12163 opnum, (enum reload_type) type);
12164 something_reloaded = true;
12167 index = XEXP (x, 1);
12168 if (!REG_OK_FOR_INDEX_STRICT_P (index))
12170 push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
12171 INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
12172 opnum, (enum reload_type) type);
12173 something_reloaded = true;
12176 gcc_assert (something_reloaded);
12177 return true;
12180 return false;
12183 /* Determine if OP is a suitable RTX for an address register.
12184 Return the naked register if a register or a register subreg is
12185 found, otherwise return NULL_RTX. */
12187 static rtx
12188 ix86_validate_address_register (rtx op)
12190 enum machine_mode mode = GET_MODE (op);
12192 /* Only SImode or DImode registers can form the address. */
12193 if (mode != SImode && mode != DImode)
12194 return NULL_RTX;
12196 if (REG_P (op))
12197 return op;
12198 else if (GET_CODE (op) == SUBREG)
12200 rtx reg = SUBREG_REG (op);
12202 if (!REG_P (reg))
12203 return NULL_RTX;
12205 mode = GET_MODE (reg);
12207 /* Don't allow SUBREGs that span more than a word. It can
12208 lead to spill failures when the register is one word out
12209 of a two word structure. */
12210 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
12211 return NULL_RTX;
12213 /* Allow only SUBREGs of non-eliminable hard registers. */
12214 if (register_no_elim_operand (reg, mode))
12215 return reg;
12218 /* Op is not a register. */
12219 return NULL_RTX;
12222 /* Recognizes RTL expressions that are valid memory addresses for an
12223 instruction. The MODE argument is the machine mode for the MEM
12224 expression that wants to use this address.
12226 It only recognizes addresses in canonical form. LEGITIMIZE_ADDRESS should
12227 convert common non-canonical forms to canonical form so that they will
12228 be recognized. */
12230 static bool
12231 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
12232 rtx addr, bool strict)
12234 struct ix86_address parts;
12235 rtx base, index, disp;
12236 HOST_WIDE_INT scale;
12237 enum ix86_address_seg seg;
12239 if (ix86_decompose_address (addr, &parts) <= 0)
12240 /* Decomposition failed. */
12241 return false;
12243 base = parts.base;
12244 index = parts.index;
12245 disp = parts.disp;
12246 scale = parts.scale;
12247 seg = parts.seg;
12249 /* Validate base register. */
12250 if (base)
12252 rtx reg = ix86_validate_address_register (base);
12254 if (reg == NULL_RTX)
12255 return false;
12257 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
12258 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
12259 /* Base is not valid. */
12260 return false;
12263 /* Validate index register. */
12264 if (index)
12266 rtx reg = ix86_validate_address_register (index);
12268 if (reg == NULL_RTX)
12269 return false;
12271 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
12272 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
12273 /* Index is not valid. */
12274 return false;
12277 /* Index and base should have the same mode. */
12278 if (base && index
12279 && GET_MODE (base) != GET_MODE (index))
12280 return false;
12282 /* Address override works only on the (%reg) part of %fs:(%reg). */
12283 if (seg != SEG_DEFAULT
12284 && ((base && GET_MODE (base) != word_mode)
12285 || (index && GET_MODE (index) != word_mode)))
12286 return false;
12288 /* Validate scale factor. */
12289 if (scale != 1)
12291 if (!index)
12292 /* Scale without index. */
12293 return false;
12295 if (scale != 2 && scale != 4 && scale != 8)
12296 /* Scale is not a valid multiplier. */
12297 return false;
12300 /* Validate displacement. */
12301 if (disp)
12303 if (GET_CODE (disp) == CONST
12304 && GET_CODE (XEXP (disp, 0)) == UNSPEC
12305 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
12306 switch (XINT (XEXP (disp, 0), 1))
12308 /* Refuse GOTOFF and GOT in 64-bit mode since they are always 64-bit when
12309 used. While the ABI also specifies 32-bit relocations, we don't produce
12310 them at all and use IP-relative addressing instead. */
12311 case UNSPEC_GOT:
12312 case UNSPEC_GOTOFF:
12313 gcc_assert (flag_pic);
12314 if (!TARGET_64BIT)
12315 goto is_legitimate_pic;
12317 /* 64bit address unspec. */
12318 return false;
12320 case UNSPEC_GOTPCREL:
12321 case UNSPEC_PCREL:
12322 gcc_assert (flag_pic);
12323 goto is_legitimate_pic;
12325 case UNSPEC_GOTTPOFF:
12326 case UNSPEC_GOTNTPOFF:
12327 case UNSPEC_INDNTPOFF:
12328 case UNSPEC_NTPOFF:
12329 case UNSPEC_DTPOFF:
12330 break;
12332 case UNSPEC_STACK_CHECK:
12333 gcc_assert (flag_split_stack);
12334 break;
12336 default:
12337 /* Invalid address unspec. */
12338 return false;
12341 else if (SYMBOLIC_CONST (disp)
12342 && (flag_pic
12343 || (TARGET_MACHO
12344 #if TARGET_MACHO
12345 && MACHOPIC_INDIRECT
12346 && !machopic_operand_p (disp)
12347 #endif
12351 is_legitimate_pic:
12352 if (TARGET_64BIT && (index || base))
12354 /* foo@dtpoff(%rX) is ok. */
12355 if (GET_CODE (disp) != CONST
12356 || GET_CODE (XEXP (disp, 0)) != PLUS
12357 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
12358 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
12359 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
12360 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
12361 /* Non-constant pic memory reference. */
12362 return false;
12364 else if ((!TARGET_MACHO || flag_pic)
12365 && ! legitimate_pic_address_disp_p (disp))
12366 /* Displacement is an invalid pic construct. */
12367 return false;
12368 #if TARGET_MACHO
12369 else if (MACHO_DYNAMIC_NO_PIC_P
12370 && !ix86_legitimate_constant_p (Pmode, disp))
12371 /* displacement must be referenced via non_lazy_pointer */
12372 return false;
12373 #endif
12375 /* This code used to verify that a symbolic pic displacement
12376 includes the pic_offset_table_rtx register.
12378 While this is a good idea, unfortunately these constructs may
12379 be created by "adds using lea" optimization for incorrect
12380 code like:
12382 int a;
12383 int foo(int i)
12385 return *(&a+i);
12388 This code is nonsensical, but results in addressing
12389 GOT table with pic_offset_table_rtx base. We can't
12390 just refuse it easily, since it gets matched by
12391 "addsi3" pattern, that later gets split to lea in the
12392 case output register differs from input. While this
12393 can be handled by separate addsi pattern for this case
12394 that never results in lea, this seems to be easier and
12395 correct fix for crash to disable this test. */
12397 else if (GET_CODE (disp) != LABEL_REF
12398 && !CONST_INT_P (disp)
12399 && (GET_CODE (disp) != CONST
12400 || !ix86_legitimate_constant_p (Pmode, disp))
12401 && (GET_CODE (disp) != SYMBOL_REF
12402 || !ix86_legitimate_constant_p (Pmode, disp)))
12403 /* Displacement is not constant. */
12404 return false;
12405 else if (TARGET_64BIT
12406 && !x86_64_immediate_operand (disp, VOIDmode))
12407 /* Displacement is out of range. */
12408 return false;
12409 /* In x32 mode, constant addresses are sign extended to 64bit, so
12410 we have to prevent addresses from 0x80000000 to 0xffffffff. */
12411 else if (TARGET_X32 && !(index || base)
12412 && CONST_INT_P (disp)
12413 && val_signbit_known_set_p (SImode, INTVAL (disp)))
12414 return false;
12417 /* Everything looks valid. */
12418 return true;
12421 /* Determine if a given RTX is a valid constant address. */
12423 bool
12424 constant_address_p (rtx x)
12426 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
12429 /* Return a unique alias set for the GOT. */
12431 static alias_set_type
12432 ix86_GOT_alias_set (void)
12434 static alias_set_type set = -1;
12435 if (set == -1)
12436 set = new_alias_set ();
12437 return set;
12440 /* Return a legitimate reference for ORIG (an address) using the
12441 register REG. If REG is 0, a new pseudo is generated.
12443 There are two types of references that must be handled:
12445 1. Global data references must load the address from the GOT, via
12446 the PIC reg. An insn is emitted to do this load, and the reg is
12447 returned.
12449 2. Static data references, constant pool addresses, and code labels
12450 compute the address as an offset from the GOT, whose base is in
12451 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
12452 differentiate them from global data objects. The returned
12453 address is the PIC reg + an unspec constant.
12455 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
12456 reg also appears in the address. */
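/* For example (illustrative ia32 forms, with the PIC register in %ebx):
   a global data reference is loaded from the GOT as
       movl  sym@GOT(%ebx), %reg
   while a local/static reference is formed as an offset from the GOT base:
       leal  sym@GOTOFF(%ebx), %reg  */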
12458 static rtx
12459 legitimize_pic_address (rtx orig, rtx reg)
12461 rtx addr = orig;
12462 rtx new_rtx = orig;
12464 #if TARGET_MACHO
12465 if (TARGET_MACHO && !TARGET_64BIT)
12467 if (reg == 0)
12468 reg = gen_reg_rtx (Pmode);
12469 /* Use the generic Mach-O PIC machinery. */
12470 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
12472 #endif
12474 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
12475 new_rtx = addr;
12476 else if (TARGET_64BIT
12477 && ix86_cmodel != CM_SMALL_PIC
12478 && gotoff_operand (addr, Pmode))
12480 rtx tmpreg;
12481 /* This symbol may be referenced via a displacement from the PIC
12482 base address (@GOTOFF). */
12484 if (reload_in_progress)
12485 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12486 if (GET_CODE (addr) == CONST)
12487 addr = XEXP (addr, 0);
12488 if (GET_CODE (addr) == PLUS)
12490 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12491 UNSPEC_GOTOFF);
12492 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12494 else
12495 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12496 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12497 if (!reg)
12498 tmpreg = gen_reg_rtx (Pmode);
12499 else
12500 tmpreg = reg;
12501 emit_move_insn (tmpreg, new_rtx);
12503 if (reg != 0)
12505 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
12506 tmpreg, 1, OPTAB_DIRECT);
12507 new_rtx = reg;
12509 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
12511 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
12513 /* This symbol may be referenced via a displacement from the PIC
12514 base address (@GOTOFF). */
12516 if (reload_in_progress)
12517 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12518 if (GET_CODE (addr) == CONST)
12519 addr = XEXP (addr, 0);
12520 if (GET_CODE (addr) == PLUS)
12522 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
12523 UNSPEC_GOTOFF);
12524 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
12526 else
12527 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
12528 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12529 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12531 if (reg != 0)
12533 emit_move_insn (reg, new_rtx);
12534 new_rtx = reg;
12537 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
12538 /* We can't use @GOTOFF for text labels on VxWorks;
12539 see gotoff_operand. */
12540 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
12542 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12544 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
12545 return legitimize_dllimport_symbol (addr, true);
12546 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
12547 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
12548 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
12550 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
12551 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
12555 /* For x64 PE-COFF there is no GOT table. So we use address
12556 directly. */
12557 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12559 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
12560 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12562 if (reg == 0)
12563 reg = gen_reg_rtx (Pmode);
12564 emit_move_insn (reg, new_rtx);
12565 new_rtx = reg;
12567 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
12569 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
12570 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12571 new_rtx = gen_const_mem (Pmode, new_rtx);
12572 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12574 if (reg == 0)
12575 reg = gen_reg_rtx (Pmode);
12576 /* Use gen_movsi directly; otherwise the address is loaded
12577 into a register for CSE.  We don't want to CSE these addresses;
12578 instead we CSE addresses loaded from the GOT table, so skip this. */
12579 emit_insn (gen_movsi (reg, new_rtx));
12580 new_rtx = reg;
12582 else
12584 /* This symbol must be referenced via a load from the
12585 Global Offset Table (@GOT). */
12587 if (reload_in_progress)
12588 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12589 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
12590 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12591 if (TARGET_64BIT)
12592 new_rtx = force_reg (Pmode, new_rtx);
12593 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12594 new_rtx = gen_const_mem (Pmode, new_rtx);
12595 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
12597 if (reg == 0)
12598 reg = gen_reg_rtx (Pmode);
12599 emit_move_insn (reg, new_rtx);
12600 new_rtx = reg;
12603 else
12605 if (CONST_INT_P (addr)
12606 && !x86_64_immediate_operand (addr, VOIDmode))
12608 if (reg)
12610 emit_move_insn (reg, addr);
12611 new_rtx = reg;
12613 else
12614 new_rtx = force_reg (Pmode, addr);
12616 else if (GET_CODE (addr) == CONST)
12618 addr = XEXP (addr, 0);
12620 /* We must match stuff we generate before. Assume the only
12621 unspecs that can get here are ours. Not that we could do
12622 anything with them anyway.... */
12623 if (GET_CODE (addr) == UNSPEC
12624 || (GET_CODE (addr) == PLUS
12625 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12626 return orig;
12627 gcc_assert (GET_CODE (addr) == PLUS);
12629 if (GET_CODE (addr) == PLUS)
12631 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12633 /* Check first to see if this is a constant offset from a @GOTOFF
12634 symbol reference. */
12635 if (gotoff_operand (op0, Pmode)
12636 && CONST_INT_P (op1))
12638 if (!TARGET_64BIT)
12640 if (reload_in_progress)
12641 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12642 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12643 UNSPEC_GOTOFF);
12644 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12645 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12646 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12648 if (reg != 0)
12650 emit_move_insn (reg, new_rtx);
12651 new_rtx = reg;
12654 else
12656 if (INTVAL (op1) < -16*1024*1024
12657 || INTVAL (op1) >= 16*1024*1024)
12659 if (!x86_64_immediate_operand (op1, Pmode))
12660 op1 = force_reg (Pmode, op1);
12661 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12665 else
12667 rtx base = legitimize_pic_address (op0, reg);
12668 enum machine_mode mode = GET_MODE (base);
12669 new_rtx
12670 = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg);
12672 if (CONST_INT_P (new_rtx))
12674 if (INTVAL (new_rtx) < -16*1024*1024
12675 || INTVAL (new_rtx) >= 16*1024*1024)
12677 if (!x86_64_immediate_operand (new_rtx, mode))
12678 new_rtx = force_reg (mode, new_rtx);
12679 new_rtx
12680 = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx);
12682 else
12683 new_rtx = plus_constant (mode, base, INTVAL (new_rtx));
12685 else
12687 if (GET_CODE (new_rtx) == PLUS
12688 && CONSTANT_P (XEXP (new_rtx, 1)))
12690 base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0));
12691 new_rtx = XEXP (new_rtx, 1);
12693 new_rtx = gen_rtx_PLUS (mode, base, new_rtx);
12698 return new_rtx;
12701 /* Load the thread pointer. If TO_REG is true, force it into a register. */
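/* Orientation note (an assumption about the usual ELF TLS layout, not
   stated in this file): the UNSPEC_TP built below is ultimately printed
   through the '@' operand code as a segment override, typically %fs:0 on
   64-bit targets and %gs:0 on 32-bit ones, so the load looks like
   "movq %fs:0, %reg" or "movl %gs:0, %reg".  */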
12703 static rtx
12704 get_thread_pointer (enum machine_mode tp_mode, bool to_reg)
12706 rtx tp = gen_rtx_UNSPEC (ptr_mode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12708 if (GET_MODE (tp) != tp_mode)
12710 gcc_assert (GET_MODE (tp) == SImode);
12711 gcc_assert (tp_mode == DImode);
12713 tp = gen_rtx_ZERO_EXTEND (tp_mode, tp);
12716 if (to_reg)
12717 tp = copy_to_mode_reg (tp_mode, tp);
12719 return tp;
12722 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12724 static GTY(()) rtx ix86_tls_symbol;
12726 static rtx
12727 ix86_tls_get_addr (void)
12729 if (!ix86_tls_symbol)
12731 const char *sym
12732 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12733 ? "___tls_get_addr" : "__tls_get_addr");
12735 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12738 return ix86_tls_symbol;
12741 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12743 static GTY(()) rtx ix86_tls_module_base_symbol;
12745 static rtx
12746 ix86_tls_module_base (void)
12748 if (!ix86_tls_module_base_symbol)
12750 ix86_tls_module_base_symbol
12751 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12753 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12754 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12757 return ix86_tls_module_base_symbol;
12760 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12761 false if we expect this to be used for a memory address and true if
12762 we expect to load the address into a register. */
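/* Rough shape of the access sequences the four TLS models below expand
   to (a summary for orientation; the exact insns depend on target flags):
     GLOBAL_DYNAMIC: call __tls_get_addr with the symbol's tlsgd slot.
     LOCAL_DYNAMIC:  one __tls_get_addr call for the module base, then
                     add the symbol's @dtpoff offset.
     INITIAL_EXEC:   load the symbol's tp-offset from the GOT (@gottpoff)
                     and add it to the thread pointer.
     LOCAL_EXEC:     add the known @tpoff/@ntpoff constant to the thread
                     pointer directly.  */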
12764 static rtx
12765 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12767 rtx dest, base, off;
12768 rtx pic = NULL_RTX, tp = NULL_RTX;
12769 enum machine_mode tp_mode = Pmode;
12770 int type;
12772 switch (model)
12774 case TLS_MODEL_GLOBAL_DYNAMIC:
12775 dest = gen_reg_rtx (Pmode);
12777 if (!TARGET_64BIT)
12779 if (flag_pic)
12780 pic = pic_offset_table_rtx;
12781 else
12783 pic = gen_reg_rtx (Pmode);
12784 emit_insn (gen_set_got (pic));
12788 if (TARGET_GNU2_TLS)
12790 if (TARGET_64BIT)
12791 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12792 else
12793 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12795 tp = get_thread_pointer (Pmode, true);
12796 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12798 if (GET_MODE (x) != Pmode)
12799 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12801 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12803 else
12805 rtx caddr = ix86_tls_get_addr ();
12807 if (TARGET_64BIT)
12809 rtx rax = gen_rtx_REG (Pmode, AX_REG);
12810 rtx insns;
12812 start_sequence ();
12813 emit_call_insn
12814 (ix86_gen_tls_global_dynamic_64 (rax, x, caddr));
12815 insns = get_insns ();
12816 end_sequence ();
12818 if (GET_MODE (x) != Pmode)
12819 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12821 RTL_CONST_CALL_P (insns) = 1;
12822 emit_libcall_block (insns, dest, rax, x);
12824 else
12825 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12827 break;
12829 case TLS_MODEL_LOCAL_DYNAMIC:
12830 base = gen_reg_rtx (Pmode);
12832 if (!TARGET_64BIT)
12834 if (flag_pic)
12835 pic = pic_offset_table_rtx;
12836 else
12838 pic = gen_reg_rtx (Pmode);
12839 emit_insn (gen_set_got (pic));
12843 if (TARGET_GNU2_TLS)
12845 rtx tmp = ix86_tls_module_base ();
12847 if (TARGET_64BIT)
12848 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12849 else
12850 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12852 tp = get_thread_pointer (Pmode, true);
12853 set_unique_reg_note (get_last_insn (), REG_EQUAL,
12854 gen_rtx_MINUS (Pmode, tmp, tp));
12856 else
12858 rtx caddr = ix86_tls_get_addr ();
12860 if (TARGET_64BIT)
12862 rtx rax = gen_rtx_REG (Pmode, AX_REG);
12863 rtx insns, eqv;
12865 start_sequence ();
12866 emit_call_insn
12867 (ix86_gen_tls_local_dynamic_base_64 (rax, caddr));
12868 insns = get_insns ();
12869 end_sequence ();
12871 /* Attach a unique REG_EQUAL, to allow the RTL optimizers to
12872 share the LD_BASE result with other LD model accesses. */
12873 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12874 UNSPEC_TLS_LD_BASE);
12876 RTL_CONST_CALL_P (insns) = 1;
12877 emit_libcall_block (insns, base, rax, eqv);
12879 else
12880 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12883 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12884 off = gen_rtx_CONST (Pmode, off);
12886 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12888 if (TARGET_GNU2_TLS)
12890 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12892 if (GET_MODE (x) != Pmode)
12893 x = gen_rtx_ZERO_EXTEND (Pmode, x);
12895 set_unique_reg_note (get_last_insn (), REG_EQUAL, x);
12897 break;
12899 case TLS_MODEL_INITIAL_EXEC:
12900 if (TARGET_64BIT)
12902 if (TARGET_SUN_TLS && !TARGET_X32)
12904 /* The Sun linker took the AMD64 TLS spec literally
12905 and can only handle %rax as the destination of the
12906 initial-exec code sequence. */
12908 dest = gen_reg_rtx (DImode);
12909 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12910 return dest;
12913 /* Generate DImode references to avoid %fs:(%reg32)
12914 problems and a linker IE->LE relaxation bug. */
12915 tp_mode = DImode;
12916 pic = NULL;
12917 type = UNSPEC_GOTNTPOFF;
12919 else if (flag_pic)
12921 if (reload_in_progress)
12922 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12923 pic = pic_offset_table_rtx;
12924 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12926 else if (!TARGET_ANY_GNU_TLS)
12928 pic = gen_reg_rtx (Pmode);
12929 emit_insn (gen_set_got (pic));
12930 type = UNSPEC_GOTTPOFF;
12932 else
12934 pic = NULL;
12935 type = UNSPEC_INDNTPOFF;
12938 off = gen_rtx_UNSPEC (tp_mode, gen_rtvec (1, x), type);
12939 off = gen_rtx_CONST (tp_mode, off);
12940 if (pic)
12941 off = gen_rtx_PLUS (tp_mode, pic, off);
12942 off = gen_const_mem (tp_mode, off);
12943 set_mem_alias_set (off, ix86_GOT_alias_set ());
12945 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12947 base = get_thread_pointer (tp_mode,
12948 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12949 off = force_reg (tp_mode, off);
12950 return gen_rtx_PLUS (tp_mode, base, off);
12952 else
12954 base = get_thread_pointer (Pmode, true);
12955 dest = gen_reg_rtx (Pmode);
12956 emit_insn (ix86_gen_sub3 (dest, base, off));
12958 break;
12960 case TLS_MODEL_LOCAL_EXEC:
12961 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12962 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12963 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12964 off = gen_rtx_CONST (Pmode, off);
12966 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12968 base = get_thread_pointer (Pmode,
12969 for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12970 return gen_rtx_PLUS (Pmode, base, off);
12972 else
12974 base = get_thread_pointer (Pmode, true);
12975 dest = gen_reg_rtx (Pmode);
12976 emit_insn (ix86_gen_sub3 (dest, base, off));
12978 break;
12980 default:
12981 gcc_unreachable ();
12984 return dest;
12987 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12988 to symbol DECL. */
12990 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12991 htab_t dllimport_map;
12993 static tree
12994 get_dllimport_decl (tree decl)
12996 struct tree_map *h, in;
12997 void **loc;
12998 const char *name;
12999 const char *prefix;
13000 size_t namelen, prefixlen;
13001 char *imp_name;
13002 tree to;
13003 rtx rtl;
13005 if (!dllimport_map)
13006 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
13008 in.hash = htab_hash_pointer (decl);
13009 in.base.from = decl;
13010 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
13011 h = (struct tree_map *) *loc;
13012 if (h)
13013 return h->to;
13015 *loc = h = ggc_alloc_tree_map ();
13016 h->hash = in.hash;
13017 h->base.from = decl;
13018 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
13019 VAR_DECL, NULL, ptr_type_node);
13020 DECL_ARTIFICIAL (to) = 1;
13021 DECL_IGNORED_P (to) = 1;
13022 DECL_EXTERNAL (to) = 1;
13023 TREE_READONLY (to) = 1;
13025 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
13026 name = targetm.strip_name_encoding (name);
13027 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
13028 ? "*__imp_" : "*__imp__";
13029 namelen = strlen (name);
13030 prefixlen = strlen (prefix);
13031 imp_name = (char *) alloca (namelen + prefixlen + 1);
13032 memcpy (imp_name, prefix, prefixlen);
13033 memcpy (imp_name + prefixlen, name, namelen + 1);
13035 name = ggc_alloc_string (imp_name, namelen + prefixlen);
13036 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
13037 SET_SYMBOL_REF_DECL (rtl, to);
13038 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
13040 rtl = gen_const_mem (Pmode, rtl);
13041 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
13043 SET_DECL_RTL (to, rtl);
13044 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
13046 return to;
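/* For illustration (hypothetical identifier "foo"): a reference to a
   dllimport variable foo is rewritten into a load through its
   import-table slot, i.e. the decl built above stands for *__imp__foo
   (or *__imp_foo on targets without a user label prefix, and for
   fastcall-prefixed names).  */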
13049 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
13050 true if we require the result be a register. */
13052 static rtx
13053 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
13055 tree imp_decl;
13056 rtx x;
13058 gcc_assert (SYMBOL_REF_DECL (symbol));
13059 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
13061 x = DECL_RTL (imp_decl);
13062 if (want_reg)
13063 x = force_reg (Pmode, x);
13064 return x;
13067 /* Try machine-dependent ways of modifying an illegitimate address
13068 to be legitimate. If we find one, return the new, valid address.
13069 This macro is used in only one place: `memory_address' in explow.c.
13071 OLDX is the address as it was before break_out_memory_refs was called.
13072 In some cases it is useful to look at this to decide what needs to be done.
13074 It is always safe for this macro to do nothing. It exists to recognize
13075 opportunities to optimize the output.
13077 For the 80386, we handle X+REG by loading X into a register R and
13078 using R+REG. R will go in a general reg and indexing will be used.
13079 However, if REG is a broken-out memory address or multiplication,
13080 nothing needs to be done because REG can certainly go in a general reg.
13082 When -fpic is used, special handling is needed for symbolic references.
13083 See comments by legitimize_pic_address in i386.c for details. */
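/* Illustrative example (hypothetical symbol "tbl"): an address such as
   (plus (symbol_ref "tbl") (reg)) is rewritten by loading "tbl" into a
   fresh register R and using (plus R (reg)), which ordinary base+index
   addressing can handle directly.  */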
13085 static rtx
13086 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
13087 enum machine_mode mode)
13089 int changed = 0;
13090 unsigned log;
13092 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
13093 if (log)
13094 return legitimize_tls_address (x, (enum tls_model) log, false);
13095 if (GET_CODE (x) == CONST
13096 && GET_CODE (XEXP (x, 0)) == PLUS
13097 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13098 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
13100 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
13101 (enum tls_model) log, false);
13102 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13105 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
13107 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
13108 return legitimize_dllimport_symbol (x, true);
13109 if (GET_CODE (x) == CONST
13110 && GET_CODE (XEXP (x, 0)) == PLUS
13111 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
13112 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
13114 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
13115 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
13119 if (flag_pic && SYMBOLIC_CONST (x))
13120 return legitimize_pic_address (x, 0);
13122 #if TARGET_MACHO
13123 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
13124 return machopic_indirect_data_reference (x, 0);
13125 #endif
13127 /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
13128 if (GET_CODE (x) == ASHIFT
13129 && CONST_INT_P (XEXP (x, 1))
13130 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
13132 changed = 1;
13133 log = INTVAL (XEXP (x, 1));
13134 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
13135 GEN_INT (1 << log));
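	  /* Example (illustrative): (ashift (reg) (const_int 2)) becomes
	     (mult (reg) (const_int 4)), which can then be matched as the
	     scaled-index part of an address such as (%base,%reg,4).  */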
13138 if (GET_CODE (x) == PLUS)
13140 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
13142 if (GET_CODE (XEXP (x, 0)) == ASHIFT
13143 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13144 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
13146 changed = 1;
13147 log = INTVAL (XEXP (XEXP (x, 0), 1));
13148 XEXP (x, 0) = gen_rtx_MULT (Pmode,
13149 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
13150 GEN_INT (1 << log));
13153 if (GET_CODE (XEXP (x, 1)) == ASHIFT
13154 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
13155 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
13157 changed = 1;
13158 log = INTVAL (XEXP (XEXP (x, 1), 1));
13159 XEXP (x, 1) = gen_rtx_MULT (Pmode,
13160 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
13161 GEN_INT (1 << log));
13164 /* Put multiply first if it isn't already. */
13165 if (GET_CODE (XEXP (x, 1)) == MULT)
13167 rtx tmp = XEXP (x, 0);
13168 XEXP (x, 0) = XEXP (x, 1);
13169 XEXP (x, 1) = tmp;
13170 changed = 1;
13173 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
13174 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
13175 created by virtual register instantiation, register elimination, and
13176 similar optimizations. */
13177 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
13179 changed = 1;
13180 x = gen_rtx_PLUS (Pmode,
13181 gen_rtx_PLUS (Pmode, XEXP (x, 0),
13182 XEXP (XEXP (x, 1), 0)),
13183 XEXP (XEXP (x, 1), 1));
13186 /* Canonicalize
13187 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
13188 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
13189 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
13190 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13191 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
13192 && CONSTANT_P (XEXP (x, 1)))
13194 rtx constant;
13195 rtx other = NULL_RTX;
13197 if (CONST_INT_P (XEXP (x, 1)))
13199 constant = XEXP (x, 1);
13200 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
13202 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
13204 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
13205 other = XEXP (x, 1);
13207 else
13208 constant = 0;
13210 if (constant)
13212 changed = 1;
13213 x = gen_rtx_PLUS (Pmode,
13214 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
13215 XEXP (XEXP (XEXP (x, 0), 1), 0)),
13216 plus_constant (Pmode, other,
13217 INTVAL (constant)));
13221 if (changed && ix86_legitimate_address_p (mode, x, false))
13222 return x;
13224 if (GET_CODE (XEXP (x, 0)) == MULT)
13226 changed = 1;
13227 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
13230 if (GET_CODE (XEXP (x, 1)) == MULT)
13232 changed = 1;
13233 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
13236 if (changed
13237 && REG_P (XEXP (x, 1))
13238 && REG_P (XEXP (x, 0)))
13239 return x;
13241 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
13243 changed = 1;
13244 x = legitimize_pic_address (x, 0);
13247 if (changed && ix86_legitimate_address_p (mode, x, false))
13248 return x;
13250 if (REG_P (XEXP (x, 0)))
13252 rtx temp = gen_reg_rtx (Pmode);
13253 rtx val = force_operand (XEXP (x, 1), temp);
13254 if (val != temp)
13256 val = convert_to_mode (Pmode, val, 1);
13257 emit_move_insn (temp, val);
13260 XEXP (x, 1) = temp;
13261 return x;
13264 else if (REG_P (XEXP (x, 1)))
13266 rtx temp = gen_reg_rtx (Pmode);
13267 rtx val = force_operand (XEXP (x, 0), temp);
13268 if (val != temp)
13270 val = convert_to_mode (Pmode, val, 1);
13271 emit_move_insn (temp, val);
13274 XEXP (x, 0) = temp;
13275 return x;
13279 return x;
13282 /* Print an integer constant expression in assembler syntax. Addition
13283 and subtraction are the only arithmetic that may appear in these
13284 expressions. FILE is the stdio stream to write to, X is the rtx, and
13285 CODE is the operand print code from the output string. */
13287 static void
13288 output_pic_addr_const (FILE *file, rtx x, int code)
13290 char buf[256];
13292 switch (GET_CODE (x))
13294 case PC:
13295 gcc_assert (flag_pic);
13296 putc ('.', file);
13297 break;
13299 case SYMBOL_REF:
13300 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
13301 output_addr_const (file, x);
13302 else
13304 const char *name = XSTR (x, 0);
13306 /* Mark the decl as referenced so that cgraph will
13307 output the function. */
13308 if (SYMBOL_REF_DECL (x))
13309 mark_decl_referenced (SYMBOL_REF_DECL (x));
13311 #if TARGET_MACHO
13312 if (MACHOPIC_INDIRECT
13313 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
13314 name = machopic_indirection_name (x, /*stub_p=*/true);
13315 #endif
13316 assemble_name (file, name);
13318 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
13319 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
13320 fputs ("@PLT", file);
13321 break;
13323 case LABEL_REF:
13324 x = XEXP (x, 0);
13325 /* FALLTHRU */
13326 case CODE_LABEL:
13327 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
13328 assemble_name (asm_out_file, buf);
13329 break;
13331 case CONST_INT:
13332 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
13333 break;
13335 case CONST:
13336 /* This used to output parentheses around the expression,
13337 but that does not work on the 386 (either ATT or BSD assembler). */
13338 output_pic_addr_const (file, XEXP (x, 0), code);
13339 break;
13341 case CONST_DOUBLE:
13342 if (GET_MODE (x) == VOIDmode)
13344 /* We can use %d if the number is <32 bits and positive. */
13345 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
13346 fprintf (file, "0x%lx%08lx",
13347 (unsigned long) CONST_DOUBLE_HIGH (x),
13348 (unsigned long) CONST_DOUBLE_LOW (x));
13349 else
13350 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
13352 else
13353 /* We can't handle floating point constants;
13354 TARGET_PRINT_OPERAND must handle them. */
13355 output_operand_lossage ("floating constant misused");
13356 break;
13358 case PLUS:
13359 /* Some assemblers need integer constants to appear first. */
13360 if (CONST_INT_P (XEXP (x, 0)))
13362 output_pic_addr_const (file, XEXP (x, 0), code);
13363 putc ('+', file);
13364 output_pic_addr_const (file, XEXP (x, 1), code);
13366 else
13368 gcc_assert (CONST_INT_P (XEXP (x, 1)));
13369 output_pic_addr_const (file, XEXP (x, 1), code);
13370 putc ('+', file);
13371 output_pic_addr_const (file, XEXP (x, 0), code);
13373 break;
13375 case MINUS:
13376 if (!TARGET_MACHO)
13377 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
13378 output_pic_addr_const (file, XEXP (x, 0), code);
13379 putc ('-', file);
13380 output_pic_addr_const (file, XEXP (x, 1), code);
13381 if (!TARGET_MACHO)
13382 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
13383 break;
13385 case UNSPEC:
13386 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
13388 bool f = i386_asm_output_addr_const_extra (file, x);
13389 gcc_assert (f);
13390 break;
13393 gcc_assert (XVECLEN (x, 0) == 1);
13394 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
13395 switch (XINT (x, 1))
13397 case UNSPEC_GOT:
13398 fputs ("@GOT", file);
13399 break;
13400 case UNSPEC_GOTOFF:
13401 fputs ("@GOTOFF", file);
13402 break;
13403 case UNSPEC_PLTOFF:
13404 fputs ("@PLTOFF", file);
13405 break;
13406 case UNSPEC_PCREL:
13407 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13408 "(%rip)" : "[rip]", file);
13409 break;
13410 case UNSPEC_GOTPCREL:
13411 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13412 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
13413 break;
13414 case UNSPEC_GOTTPOFF:
13415 /* FIXME: This might be @TPOFF in Sun ld too. */
13416 fputs ("@gottpoff", file);
13417 break;
13418 case UNSPEC_TPOFF:
13419 fputs ("@tpoff", file);
13420 break;
13421 case UNSPEC_NTPOFF:
13422 if (TARGET_64BIT)
13423 fputs ("@tpoff", file);
13424 else
13425 fputs ("@ntpoff", file);
13426 break;
13427 case UNSPEC_DTPOFF:
13428 fputs ("@dtpoff", file);
13429 break;
13430 case UNSPEC_GOTNTPOFF:
13431 if (TARGET_64BIT)
13432 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
13433 "@gottpoff(%rip)": "@gottpoff[rip]", file);
13434 else
13435 fputs ("@gotntpoff", file);
13436 break;
13437 case UNSPEC_INDNTPOFF:
13438 fputs ("@indntpoff", file);
13439 break;
13440 #if TARGET_MACHO
13441 case UNSPEC_MACHOPIC_OFFSET:
13442 putc ('-', file);
13443 machopic_output_function_base_name (file);
13444 break;
13445 #endif
13446 default:
13447 output_operand_lossage ("invalid UNSPEC as operand");
13448 break;
13450 break;
13452 default:
13453 output_operand_lossage ("invalid expression as operand");
13457 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
13458 We need to emit DTP-relative relocations. */
13460 static void ATTRIBUTE_UNUSED
13461 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
13463 fputs (ASM_LONG, file);
13464 output_addr_const (file, x);
13465 fputs ("@dtpoff", file);
13466 switch (size)
13468 case 4:
13469 break;
13470 case 8:
13471 fputs (", 0", file);
13472 break;
13473 default:
13474 gcc_unreachable ();
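/* Illustrative output (assuming ASM_LONG is ".long" and a symbol "foo"):
     .long foo@dtpoff        for size 4
     .long foo@dtpoff, 0     for size 8  */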
13478 /* Return true if X is a representation of the PIC register. This copes
13479 with calls from ix86_find_base_term, where the register might have
13480 been replaced by a cselib value. */
13482 static bool
13483 ix86_pic_register_p (rtx x)
13485 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
13486 return (pic_offset_table_rtx
13487 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
13488 else
13489 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
13492 /* Helper function for ix86_delegitimize_address.
13493 Attempt to delegitimize TLS local-exec accesses. */
13495 static rtx
13496 ix86_delegitimize_tls_address (rtx orig_x)
13498 rtx x = orig_x, unspec;
13499 struct ix86_address addr;
13501 if (!TARGET_TLS_DIRECT_SEG_REFS)
13502 return orig_x;
13503 if (MEM_P (x))
13504 x = XEXP (x, 0);
13505 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
13506 return orig_x;
13507 if (ix86_decompose_address (x, &addr) == 0
13508 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
13509 || addr.disp == NULL_RTX
13510 || GET_CODE (addr.disp) != CONST)
13511 return orig_x;
13512 unspec = XEXP (addr.disp, 0);
13513 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
13514 unspec = XEXP (unspec, 0);
13515 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
13516 return orig_x;
13517 x = XVECEXP (unspec, 0, 0);
13518 gcc_assert (GET_CODE (x) == SYMBOL_REF);
13519 if (unspec != XEXP (addr.disp, 0))
13520 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
13521 if (addr.index)
13523 rtx idx = addr.index;
13524 if (addr.scale != 1)
13525 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
13526 x = gen_rtx_PLUS (Pmode, idx, x);
13528 if (addr.base)
13529 x = gen_rtx_PLUS (Pmode, addr.base, x);
13530 if (MEM_P (orig_x))
13531 x = replace_equiv_address_nv (orig_x, x);
13532 return x;
13535 /* In the name of slightly smaller debug output, and to cater to
13536 general assembler lossage, recognize PIC+GOTOFF and turn it back
13537 into a direct symbol reference.
13539 On Darwin, this is necessary to avoid a crash, because Darwin
13540 has a different PIC label for each routine but the DWARF debugging
13541 information is not associated with any particular routine, so it's
13542 necessary to remove references to the PIC label from RTL stored by
13543 the DWARF output code. */
13545 static rtx
13546 ix86_delegitimize_address (rtx x)
13548 rtx orig_x = delegitimize_mem_from_attrs (x);
13549 /* addend is NULL or some rtx if x is something+GOTOFF where
13550 something doesn't include the PIC register. */
13551 rtx addend = NULL_RTX;
13552 /* reg_addend is NULL or a multiple of some register. */
13553 rtx reg_addend = NULL_RTX;
13554 /* const_addend is NULL or a const_int. */
13555 rtx const_addend = NULL_RTX;
13556 /* This is the result, or NULL. */
13557 rtx result = NULL_RTX;
13559 x = orig_x;
13561 if (MEM_P (x))
13562 x = XEXP (x, 0);
13564 if (TARGET_64BIT)
13566 if (GET_CODE (x) == CONST
13567 && GET_CODE (XEXP (x, 0)) == PLUS
13568 && GET_MODE (XEXP (x, 0)) == Pmode
13569 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13570 && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
13571 && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
13573 rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
13574 x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
13575 if (MEM_P (orig_x))
13576 x = replace_equiv_address_nv (orig_x, x);
13577 return x;
13580 if (GET_CODE (x) == CONST
13581 && GET_CODE (XEXP (x, 0)) == UNSPEC
13582 && (XINT (XEXP (x, 0), 1) == UNSPEC_GOTPCREL
13583 || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL)
13584 && (MEM_P (orig_x) || XINT (XEXP (x, 0), 1) == UNSPEC_PCREL))
13586 x = XVECEXP (XEXP (x, 0), 0, 0);
13587 if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
13589 x = simplify_gen_subreg (GET_MODE (orig_x), x,
13590 GET_MODE (x), 0);
13591 if (x == NULL_RTX)
13592 return orig_x;
13594 return x;
13597 if (ix86_cmodel != CM_MEDIUM_PIC && ix86_cmodel != CM_LARGE_PIC)
13598 return ix86_delegitimize_tls_address (orig_x);
13600 /* Fall thru into the code shared with -m32 for -mcmodel=large -fpic
13601 and -mcmodel=medium -fpic. */
13604 if (GET_CODE (x) != PLUS
13605 || GET_CODE (XEXP (x, 1)) != CONST)
13606 return ix86_delegitimize_tls_address (orig_x);
13608 if (ix86_pic_register_p (XEXP (x, 0)))
13609 /* %ebx + GOT/GOTOFF */
13611 else if (GET_CODE (XEXP (x, 0)) == PLUS)
13613 /* %ebx + %reg * scale + GOT/GOTOFF */
13614 reg_addend = XEXP (x, 0);
13615 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
13616 reg_addend = XEXP (reg_addend, 1);
13617 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
13618 reg_addend = XEXP (reg_addend, 0);
13619 else
13621 reg_addend = NULL_RTX;
13622 addend = XEXP (x, 0);
13625 else
13626 addend = XEXP (x, 0);
13628 x = XEXP (XEXP (x, 1), 0);
13629 if (GET_CODE (x) == PLUS
13630 && CONST_INT_P (XEXP (x, 1)))
13632 const_addend = XEXP (x, 1);
13633 x = XEXP (x, 0);
13636 if (GET_CODE (x) == UNSPEC
13637 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
13638 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))
13639 || (XINT (x, 1) == UNSPEC_PLTOFF && ix86_cmodel == CM_LARGE_PIC
13640 && !MEM_P (orig_x) && !addend)))
13641 result = XVECEXP (x, 0, 0);
13643 if (!TARGET_64BIT && TARGET_MACHO && darwin_local_data_pic (x)
13644 && !MEM_P (orig_x))
13645 result = XVECEXP (x, 0, 0);
13647 if (! result)
13648 return ix86_delegitimize_tls_address (orig_x);
13650 if (const_addend)
13651 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
13652 if (reg_addend)
13653 result = gen_rtx_PLUS (Pmode, reg_addend, result);
13654 if (addend)
13656 /* If the rest of original X doesn't involve the PIC register, add
13657 addend and subtract pic_offset_table_rtx. This can happen e.g.
13658 for code like:
13659 leal (%ebx, %ecx, 4), %ecx
13661 movl foo@GOTOFF(%ecx), %edx
13662 in which case we return (%ecx - %ebx) + foo. */
13663 if (pic_offset_table_rtx)
13664 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
13665 pic_offset_table_rtx),
13666 result);
13667 else
13668 return orig_x;
13670 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13672 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13673 if (result == NULL_RTX)
13674 return orig_x;
13676 return result;
13679 /* If X is a machine specific address (i.e. a symbol or label being
13680 referenced as a displacement from the GOT implemented using an
13681 UNSPEC), then return the base term. Otherwise return X. */
13683 rtx
13684 ix86_find_base_term (rtx x)
13686 rtx term;
13688 if (TARGET_64BIT)
13690 if (GET_CODE (x) != CONST)
13691 return x;
13692 term = XEXP (x, 0);
13693 if (GET_CODE (term) == PLUS
13694 && (CONST_INT_P (XEXP (term, 1))
13695 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13696 term = XEXP (term, 0);
13697 if (GET_CODE (term) != UNSPEC
13698 || (XINT (term, 1) != UNSPEC_GOTPCREL
13699 && XINT (term, 1) != UNSPEC_PCREL))
13700 return x;
13702 return XVECEXP (term, 0, 0);
13705 return ix86_delegitimize_address (x);
13708 static void
13709 put_condition_code (enum rtx_code code, enum machine_mode mode, bool reverse,
13710 bool fp, FILE *file)
13712 const char *suffix;
13714 if (mode == CCFPmode || mode == CCFPUmode)
13716 code = ix86_fp_compare_code_to_integer (code);
13717 mode = CCmode;
13719 if (reverse)
13720 code = reverse_condition (code);
13722 switch (code)
13724 case EQ:
13725 switch (mode)
13727 case CCAmode:
13728 suffix = "a";
13729 break;
13731 case CCCmode:
13732 suffix = "c";
13733 break;
13735 case CCOmode:
13736 suffix = "o";
13737 break;
13739 case CCSmode:
13740 suffix = "s";
13741 break;
13743 default:
13744 suffix = "e";
13746 break;
13747 case NE:
13748 switch (mode)
13750 case CCAmode:
13751 suffix = "na";
13752 break;
13754 case CCCmode:
13755 suffix = "nc";
13756 break;
13758 case CCOmode:
13759 suffix = "no";
13760 break;
13762 case CCSmode:
13763 suffix = "ns";
13764 break;
13766 default:
13767 suffix = "ne";
13769 break;
13770 case GT:
13771 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13772 suffix = "g";
13773 break;
13774 case GTU:
13775 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13776 Those same assemblers have the same but opposite lossage on cmov. */
13777 if (mode == CCmode)
13778 suffix = fp ? "nbe" : "a";
13779 else
13780 gcc_unreachable ();
13781 break;
13782 case LT:
13783 switch (mode)
13785 case CCNOmode:
13786 case CCGOCmode:
13787 suffix = "s";
13788 break;
13790 case CCmode:
13791 case CCGCmode:
13792 suffix = "l";
13793 break;
13795 default:
13796 gcc_unreachable ();
13798 break;
13799 case LTU:
13800 if (mode == CCmode)
13801 suffix = "b";
13802 else if (mode == CCCmode)
13803 suffix = "c";
13804 else
13805 gcc_unreachable ();
13806 break;
13807 case GE:
13808 switch (mode)
13810 case CCNOmode:
13811 case CCGOCmode:
13812 suffix = "ns";
13813 break;
13815 case CCmode:
13816 case CCGCmode:
13817 suffix = "ge";
13818 break;
13820 default:
13821 gcc_unreachable ();
13823 break;
13824 case GEU:
13825 if (mode == CCmode)
13826 suffix = fp ? "nb" : "ae";
13827 else if (mode == CCCmode)
13828 suffix = "nc";
13829 else
13830 gcc_unreachable ();
13831 break;
13832 case LE:
13833 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13834 suffix = "le";
13835 break;
13836 case LEU:
13837 if (mode == CCmode)
13838 suffix = "be";
13839 else
13840 gcc_unreachable ();
13841 break;
13842 case UNORDERED:
13843 suffix = fp ? "u" : "p";
13844 break;
13845 case ORDERED:
13846 suffix = fp ? "nu" : "np";
13847 break;
13848 default:
13849 gcc_unreachable ();
13851 fputs (suffix, file);
13854 /* Print the name of register X to FILE based on its machine mode and number.
13855 If CODE is 'w', pretend the mode is HImode.
13856 If CODE is 'b', pretend the mode is QImode.
13857 If CODE is 'k', pretend the mode is SImode.
13858 If CODE is 'q', pretend the mode is DImode.
13859 If CODE is 'x', pretend the mode is V4SFmode.
13860 If CODE is 't', pretend the mode is V8SFmode.
13861 If CODE is 'h', pretend the reg is the 'high' byte register.
13862 If CODE is 'y', print "st(0)" instead of "st", if the reg is a stack op.
13863 If CODE is 'd', duplicate the operand for AVX instruction.
13866 void
13867 print_reg (rtx x, int code, FILE *file)
13869 const char *reg;
13870 unsigned int regno;
13871 bool duplicated = code == 'd' && TARGET_AVX;
13873 if (ASSEMBLER_DIALECT == ASM_ATT)
13874 putc ('%', file);
13876 if (x == pc_rtx)
13878 gcc_assert (TARGET_64BIT);
13879 fputs ("rip", file);
13880 return;
13883 regno = true_regnum (x);
13884 gcc_assert (regno != ARG_POINTER_REGNUM
13885 && regno != FRAME_POINTER_REGNUM
13886 && regno != FLAGS_REG
13887 && regno != FPSR_REG
13888 && regno != FPCR_REG);
13890 if (code == 'w' || MMX_REG_P (x))
13891 code = 2;
13892 else if (code == 'b')
13893 code = 1;
13894 else if (code == 'k')
13895 code = 4;
13896 else if (code == 'q')
13897 code = 8;
13898 else if (code == 'y')
13899 code = 3;
13900 else if (code == 'h')
13901 code = 0;
13902 else if (code == 'x')
13903 code = 16;
13904 else if (code == 't')
13905 code = 32;
13906 else
13907 code = GET_MODE_SIZE (GET_MODE (x));
13909 /* Irritatingly, AMD extended registers use a different naming convention
13910 from the normal registers: "r%d[bwd]". */
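  /* For example (illustrative), the register numbered r8 prints as
     "r8" for 8-byte, "r8d" for 4-byte, "r8w" for 2-byte and "r8b"
     for 1-byte operands, matching the suffix switch below.  */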
13911 if (REX_INT_REGNO_P (regno))
13913 gcc_assert (TARGET_64BIT);
13914 putc ('r', file);
13915 fprint_ul (file, regno - FIRST_REX_INT_REG + 8);
13916 switch (code)
13918 case 0:
13919 error ("extended registers have no high halves");
13920 break;
13921 case 1:
13922 putc ('b', file);
13923 break;
13924 case 2:
13925 putc ('w', file);
13926 break;
13927 case 4:
13928 putc ('d', file);
13929 break;
13930 case 8:
13931 /* no suffix */
13932 break;
13933 default:
13934 error ("unsupported operand size for extended register");
13935 break;
13937 return;
13940 reg = NULL;
13941 switch (code)
13943 case 3:
13944 if (STACK_TOP_P (x))
13946 reg = "st(0)";
13947 break;
13949 /* FALLTHRU */
13950 case 8:
13951 case 4:
13952 case 12:
13953 if (! ANY_FP_REG_P (x))
13954 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13955 /* FALLTHRU */
13956 case 16:
13957 case 2:
13958 normal:
13959 reg = hi_reg_name[regno];
13960 break;
13961 case 1:
13962 if (regno >= ARRAY_SIZE (qi_reg_name))
13963 goto normal;
13964 reg = qi_reg_name[regno];
13965 break;
13966 case 0:
13967 if (regno >= ARRAY_SIZE (qi_high_reg_name))
13968 goto normal;
13969 reg = qi_high_reg_name[regno];
13970 break;
13971 case 32:
13972 if (SSE_REG_P (x))
13974 gcc_assert (!duplicated);
13975 putc ('y', file);
13976 fputs (hi_reg_name[regno] + 1, file);
13977 return;
13979 break;
13980 default:
13981 gcc_unreachable ();
13984 fputs (reg, file);
13985 if (duplicated)
13987 if (ASSEMBLER_DIALECT == ASM_ATT)
13988 fprintf (file, ", %%%s", reg);
13989 else
13990 fprintf (file, ", %s", reg);
13994 /* Locate some local-dynamic symbol still in use by this function
13995 so that we can print its name in some tls_local_dynamic_base
13996 pattern. */
13998 static int
13999 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
14001 rtx x = *px;
14003 if (GET_CODE (x) == SYMBOL_REF
14004 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
14006 cfun->machine->some_ld_name = XSTR (x, 0);
14007 return 1;
14010 return 0;
14013 static const char *
14014 get_some_local_dynamic_name (void)
14016 rtx insn;
14018 if (cfun->machine->some_ld_name)
14019 return cfun->machine->some_ld_name;
14021 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
14022 if (NONDEBUG_INSN_P (insn)
14023 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
14024 return cfun->machine->some_ld_name;
14026 return NULL;
14029 /* Meaning of CODE:
14030 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
14031 C -- print opcode suffix for set/cmov insn.
14032 c -- like C, but print reversed condition
14033 F,f -- likewise, but for floating-point.
14034 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
14035 otherwise nothing
14036 R -- print the prefix for register names.
14037 z -- print the opcode suffix for the size of the current operand.
14038 Z -- likewise, with special suffixes for x87 instructions.
14039 * -- print a star (in certain assembler syntax)
14040 A -- print an absolute memory reference.
14041 E -- print address with DImode register names if TARGET_64BIT.
14042 w -- print the operand as if it's a "word" (HImode) even if it isn't.
14043 s -- print a shift double count, followed by the assembler's argument
14044 delimiter.
14045 b -- print the QImode name of the register for the indicated operand.
14046 %b0 would print %al if operands[0] is reg 0.
14047 w -- likewise, print the HImode name of the register.
14048 k -- likewise, print the SImode name of the register.
14049 q -- likewise, print the DImode name of the register.
14050 x -- likewise, print the V4SFmode name of the register.
14051 t -- likewise, print the V8SFmode name of the register.
14052 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
14053 y -- print "st(0)" instead of "st" as a register.
14054 d -- print duplicated register operand for AVX instruction.
14055 D -- print condition for SSE cmp instruction.
14056 P -- if PIC, print an @PLT suffix.
14057 p -- print raw symbol name.
14058 X -- don't print any sort of PIC '@' suffix for a symbol.
14059 & -- print some in-use local-dynamic symbol name.
14060 H -- print a memory address offset by 8; used for sse high-parts
14061 Y -- print condition for XOP pcom* instruction.
14062 + -- print a branch hint as 'cs' or 'ds' prefix
14063 ; -- print a semicolon (after prefixes due to bug in older gas).
14064 ~ -- print "i" if TARGET_AVX2, "f" otherwise.
14065 @ -- print a segment register of thread base pointer load
14066 ^ -- print addr32 prefix if TARGET_64BIT and Pmode != word_mode
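/* Illustrative use in an asm template (hypothetical operand 0 living in
   %rax): "%b0" prints "%al", "%w0" prints "%ax", "%k0" prints "%eax",
   "%q0" prints "%rax", and "%z0" emits the size suffix for the
   operand's mode.  */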
14069 void
14070 ix86_print_operand (FILE *file, rtx x, int code)
14072 if (code)
14074 switch (code)
14076 case 'A':
14077 switch (ASSEMBLER_DIALECT)
14079 case ASM_ATT:
14080 putc ('*', file);
14081 break;
14083 case ASM_INTEL:
14084 /* Intel syntax. For absolute addresses, registers should not
14085 be surrounded by brackets. */
14086 if (!REG_P (x))
14088 putc ('[', file);
14089 ix86_print_operand (file, x, 0);
14090 putc (']', file);
14091 return;
14093 break;
14095 default:
14096 gcc_unreachable ();
14099 ix86_print_operand (file, x, 0);
14100 return;
14102 case 'E':
14103 /* Wrap address in an UNSPEC to declare special handling. */
14104 if (TARGET_64BIT)
14105 x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR);
14107 output_address (x);
14108 return;
14110 case 'L':
14111 if (ASSEMBLER_DIALECT == ASM_ATT)
14112 putc ('l', file);
14113 return;
14115 case 'W':
14116 if (ASSEMBLER_DIALECT == ASM_ATT)
14117 putc ('w', file);
14118 return;
14120 case 'B':
14121 if (ASSEMBLER_DIALECT == ASM_ATT)
14122 putc ('b', file);
14123 return;
14125 case 'Q':
14126 if (ASSEMBLER_DIALECT == ASM_ATT)
14127 putc ('l', file);
14128 return;
14130 case 'S':
14131 if (ASSEMBLER_DIALECT == ASM_ATT)
14132 putc ('s', file);
14133 return;
14135 case 'T':
14136 if (ASSEMBLER_DIALECT == ASM_ATT)
14137 putc ('t', file);
14138 return;
14140 case 'O':
14141 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14142 if (ASSEMBLER_DIALECT != ASM_ATT)
14143 return;
14145 switch (GET_MODE_SIZE (GET_MODE (x)))
14147 case 2:
14148 putc ('w', file);
14149 break;
14151 case 4:
14152 putc ('l', file);
14153 break;
14155 case 8:
14156 putc ('q', file);
14157 break;
14159 default:
14160 output_operand_lossage
14161 ("invalid operand size for operand code 'O'");
14162 return;
14165 putc ('.', file);
14166 #endif
14167 return;
14169 case 'z':
14170 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14172 /* Opcodes don't get size suffixes when using Intel syntax. */
14173 if (ASSEMBLER_DIALECT == ASM_INTEL)
14174 return;
14176 switch (GET_MODE_SIZE (GET_MODE (x)))
14178 case 1:
14179 putc ('b', file);
14180 return;
14182 case 2:
14183 putc ('w', file);
14184 return;
14186 case 4:
14187 putc ('l', file);
14188 return;
14190 case 8:
14191 putc ('q', file);
14192 return;
14194 default:
14195 output_operand_lossage
14196 ("invalid operand size for operand code 'z'");
14197 return;
14201 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14202 warning
14203 (0, "non-integer operand used with operand code 'z'");
14204 /* FALLTHRU */
14206 case 'Z':
14207 /* 387 opcodes don't get size suffixes when using Intel syntax. */
14208 if (ASSEMBLER_DIALECT == ASM_INTEL)
14209 return;
14211 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
14213 switch (GET_MODE_SIZE (GET_MODE (x)))
14215 case 2:
14216 #ifdef HAVE_AS_IX86_FILDS
14217 putc ('s', file);
14218 #endif
14219 return;
14221 case 4:
14222 putc ('l', file);
14223 return;
14225 case 8:
14226 #ifdef HAVE_AS_IX86_FILDQ
14227 putc ('q', file);
14228 #else
14229 fputs ("ll", file);
14230 #endif
14231 return;
14233 default:
14234 break;
14237 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
14239 /* 387 opcodes don't get size suffixes
14240 if the operands are registers. */
14241 if (STACK_REG_P (x))
14242 return;
14244 switch (GET_MODE_SIZE (GET_MODE (x)))
14246 case 4:
14247 putc ('s', file);
14248 return;
14250 case 8:
14251 putc ('l', file);
14252 return;
14254 case 12:
14255 case 16:
14256 putc ('t', file);
14257 return;
14259 default:
14260 break;
14263 else
14265 output_operand_lossage
14266 ("invalid operand type used with operand code 'Z'");
14267 return;
14270 output_operand_lossage
14271 ("invalid operand size for operand code 'Z'");
14272 return;
14274 case 'd':
14275 case 'b':
14276 case 'w':
14277 case 'k':
14278 case 'q':
14279 case 'h':
14280 case 't':
14281 case 'y':
14282 case 'x':
14283 case 'X':
14284 case 'P':
14285 case 'p':
14286 break;
14288 case 's':
14289 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
14291 ix86_print_operand (file, x, 0);
14292 fputs (", ", file);
14294 return;
14296 case 'Y':
14297 switch (GET_CODE (x))
14299 case NE:
14300 fputs ("neq", file);
14301 break;
14302 case EQ:
14303 fputs ("eq", file);
14304 break;
14305 case GE:
14306 case GEU:
14307 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
14308 break;
14309 case GT:
14310 case GTU:
14311 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
14312 break;
14313 case LE:
14314 case LEU:
14315 fputs ("le", file);
14316 break;
14317 case LT:
14318 case LTU:
14319 fputs ("lt", file);
14320 break;
14321 case UNORDERED:
14322 fputs ("unord", file);
14323 break;
14324 case ORDERED:
14325 fputs ("ord", file);
14326 break;
14327 case UNEQ:
14328 fputs ("ueq", file);
14329 break;
14330 case UNGE:
14331 fputs ("nlt", file);
14332 break;
14333 case UNGT:
14334 fputs ("nle", file);
14335 break;
14336 case UNLE:
14337 fputs ("ule", file);
14338 break;
14339 case UNLT:
14340 fputs ("ult", file);
14341 break;
14342 case LTGT:
14343 fputs ("une", file);
14344 break;
14345 default:
14346 output_operand_lossage ("operand is not a condition code, "
14347 "invalid operand code 'Y'");
14348 return;
14350 return;
14352 case 'D':
14353 /* A little bit of braindamage here.  The SSE compare instructions
14354 use completely different names for the comparisons than the
14355 fp conditional moves do. */
14356 switch (GET_CODE (x))
14358 case UNEQ:
14359 if (TARGET_AVX)
14361 fputs ("eq_us", file);
14362 break;
14364 case EQ:
14365 fputs ("eq", file);
14366 break;
14367 case UNLT:
14368 if (TARGET_AVX)
14370 fputs ("nge", file);
14371 break;
14373 case LT:
14374 fputs ("lt", file);
14375 break;
14376 case UNLE:
14377 if (TARGET_AVX)
14379 fputs ("ngt", file);
14380 break;
14382 case LE:
14383 fputs ("le", file);
14384 break;
14385 case UNORDERED:
14386 fputs ("unord", file);
14387 break;
14388 case LTGT:
14389 if (TARGET_AVX)
14391 fputs ("neq_oq", file);
14392 break;
14394 case NE:
14395 fputs ("neq", file);
14396 break;
14397 case GE:
14398 if (TARGET_AVX)
14400 fputs ("ge", file);
14401 break;
14403 case UNGE:
14404 fputs ("nlt", file);
14405 break;
14406 case GT:
14407 if (TARGET_AVX)
14409 fputs ("gt", file);
14410 break;
14412 case UNGT:
14413 fputs ("nle", file);
14414 break;
14415 case ORDERED:
14416 fputs ("ord", file);
14417 break;
14418 default:
14419 output_operand_lossage ("operand is not a condition code, "
14420 "invalid operand code 'D'");
14421 return;
14423 return;
14425 case 'F':
14426 case 'f':
14427 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
14428 if (ASSEMBLER_DIALECT == ASM_ATT)
14429 putc ('.', file);
14430 #endif
14432 case 'C':
14433 case 'c':
14434 if (!COMPARISON_P (x))
14436 output_operand_lossage ("operand is not a condition code, "
14437 "invalid operand code '%c'", code);
14438 return;
14440 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)),
14441 code == 'c' || code == 'f',
14442 code == 'F' || code == 'f',
14443 file);
14444 return;
14446 case 'H':
14447 if (!offsettable_memref_p (x))
14449 output_operand_lossage ("operand is not an offsettable memory "
14450 "reference, invalid operand code 'H'");
14451 return;
14453 /* It doesn't actually matter what mode we use here, as we're
14454 only going to use this for printing. */
14455 x = adjust_address_nv (x, DImode, 8);
14456 break;
14458 case 'K':
14459 gcc_assert (CONST_INT_P (x));
14461 if (INTVAL (x) & IX86_HLE_ACQUIRE)
14462 #ifdef HAVE_AS_IX86_HLE
14463 fputs ("xacquire ", file);
14464 #else
14465 fputs ("\n" ASM_BYTE "0xf2\n\t", file);
14466 #endif
14467 else if (INTVAL (x) & IX86_HLE_RELEASE)
14468 #ifdef HAVE_AS_IX86_HLE
14469 fputs ("xrelease ", file);
14470 #else
14471 fputs ("\n" ASM_BYTE "0xf3\n\t", file);
14472 #endif
14473 /* We do not want to print the value of the operand. */
14474 return;
14476 case '*':
14477 if (ASSEMBLER_DIALECT == ASM_ATT)
14478 putc ('*', file);
14479 return;
14481 case '&':
14483 const char *name = get_some_local_dynamic_name ();
14484 if (name == NULL)
14485 output_operand_lossage ("'%%&' used without any "
14486 "local dynamic TLS references");
14487 else
14488 assemble_name (file, name);
14489 return;
14492 case '+':
14494 rtx x;
14496 if (!optimize
14497 || optimize_function_for_size_p (cfun)
14498 || !TARGET_BRANCH_PREDICTION_HINTS)
14499 return;
14501 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
14502 if (x)
14504 int pred_val = INTVAL (XEXP (x, 0));
14506 if (pred_val < REG_BR_PROB_BASE * 45 / 100
14507 || pred_val > REG_BR_PROB_BASE * 55 / 100)
14509 bool taken = pred_val > REG_BR_PROB_BASE / 2;
14510 bool cputaken
14511 = final_forward_branch_p (current_output_insn) == 0;
14513 /* Emit hints only in the case where the default branch prediction
14514 heuristics would fail. */
14515 if (taken != cputaken)
14517 /* We use 3e (DS) prefix for taken branches and
14518 2e (CS) prefix for not taken branches. */
14519 if (taken)
14520 fputs ("ds ; ", file);
14521 else
14522 fputs ("cs ; ", file);
14526 return;
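	/* Illustrative result (hypothetical template "%+ jne %l0"): a branch
	   predicted taken is emitted as "ds ; jne .L2", one predicted not
	   taken as "cs ; jne .L2", but only when the static forward/backward
	   heuristic would guess wrong.  */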
14529 case ';':
14530 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
14531 putc (';', file);
14532 #endif
14533 return;
14535 case '@':
14536 if (ASSEMBLER_DIALECT == ASM_ATT)
14537 putc ('%', file);
14539 /* The kernel uses a different segment register for performance
14540 reasons; a system call would not have to trash the userspace
14541 segment register, which would be expensive. */
14542 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
14543 fputs ("fs", file);
14544 else
14545 fputs ("gs", file);
14546 return;
14548 case '~':
14549 putc (TARGET_AVX2 ? 'i' : 'f', file);
14550 return;
14552 case '^':
14553 if (TARGET_64BIT && Pmode != word_mode)
14554 fputs ("addr32 ", file);
14555 return;
14557 default:
14558 output_operand_lossage ("invalid operand code '%c'", code);
14562 if (REG_P (x))
14563 print_reg (x, code, file);
14565 else if (MEM_P (x))
14567 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
14568 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
14569 && GET_MODE (x) != BLKmode)
14571 const char * size;
14572 switch (GET_MODE_SIZE (GET_MODE (x)))
14574 case 1: size = "BYTE"; break;
14575 case 2: size = "WORD"; break;
14576 case 4: size = "DWORD"; break;
14577 case 8: size = "QWORD"; break;
14578 case 12: size = "TBYTE"; break;
14579 case 16:
14580 if (GET_MODE (x) == XFmode)
14581 size = "TBYTE";
14582 else
14583 size = "XMMWORD";
14584 break;
14585 case 32: size = "YMMWORD"; break;
14586 default:
14587 gcc_unreachable ();
14590 /* Check for explicit size override (codes 'b', 'w', 'k',
14591 'q' and 'x') */
14592 if (code == 'b')
14593 size = "BYTE";
14594 else if (code == 'w')
14595 size = "WORD";
14596 else if (code == 'k')
14597 size = "DWORD";
14598 else if (code == 'q')
14599 size = "QWORD";
14600 else if (code == 'x')
14601 size = "XMMWORD";
14603 fputs (size, file);
14604 fputs (" PTR ", file);
14607 x = XEXP (x, 0);
14608 /* Avoid (%rip) for call operands. */
14609 if (CONSTANT_ADDRESS_P (x) && code == 'P'
14610 && !CONST_INT_P (x))
14611 output_addr_const (file, x);
14612 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
14613 output_operand_lossage ("invalid constraints for operand");
14614 else
14615 output_address (x);
14618 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
14620 REAL_VALUE_TYPE r;
14621 long l;
14623 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14624 REAL_VALUE_TO_TARGET_SINGLE (r, l);
14626 if (ASSEMBLER_DIALECT == ASM_ATT)
14627 putc ('$', file);
14628 /* Sign extend 32bit SFmode immediate to 8 bytes. */
14629 if (code == 'q')
14630 fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x",
14631 (unsigned long long) (int) l);
14632 else
14633 fprintf (file, "0x%08x", (unsigned int) l);
14636 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
14638 REAL_VALUE_TYPE r;
14639 long l[2];
14641 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
14642 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
14644 if (ASSEMBLER_DIALECT == ASM_ATT)
14645 putc ('$', file);
14646 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
14649 /* These float cases don't actually occur as immediate operands. */
14650 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
14652 char dstr[30];
14654 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
14655 fputs (dstr, file);
14658 else
14660 /* We have patterns that allow zero sets of memory, for instance.
14661 In 64-bit mode, we should probably support all 8-byte vectors,
14662 since we can in fact encode that into an immediate. */
14663 if (GET_CODE (x) == CONST_VECTOR)
14665 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
14666 x = const0_rtx;
14669 if (code != 'P' && code != 'p')
14671 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
14673 if (ASSEMBLER_DIALECT == ASM_ATT)
14674 putc ('$', file);
14676 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
14677 || GET_CODE (x) == LABEL_REF)
14679 if (ASSEMBLER_DIALECT == ASM_ATT)
14680 putc ('$', file);
14681 else
14682 fputs ("OFFSET FLAT:", file);
14685 if (CONST_INT_P (x))
14686 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14687 else if (flag_pic || MACHOPIC_INDIRECT)
14688 output_pic_addr_const (file, x, code);
14689 else
14690 output_addr_const (file, x);
14694 static bool
14695 ix86_print_operand_punct_valid_p (unsigned char code)
14697 return (code == '@' || code == '*' || code == '+' || code == '&'
14698 || code == ';' || code == '~' || code == '^');
14701 /* Print a memory operand whose address is ADDR. */
14703 static void
14704 ix86_print_operand_address (FILE *file, rtx addr)
14706 struct ix86_address parts;
14707 rtx base, index, disp;
14708 int scale;
14709 int ok;
14710 bool vsib = false;
14711 int code = 0;
14713 if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
14715 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14716 gcc_assert (parts.index == NULL_RTX);
14717 parts.index = XVECEXP (addr, 0, 1);
14718 parts.scale = INTVAL (XVECEXP (addr, 0, 2));
14719 addr = XVECEXP (addr, 0, 0);
14720 vsib = true;
14722 else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR)
14724 gcc_assert (TARGET_64BIT);
14725 ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
14726 code = 'q';
14728 else
14729 ok = ix86_decompose_address (addr, &parts);
14731 gcc_assert (ok);
14733 base = parts.base;
14734 index = parts.index;
14735 disp = parts.disp;
14736 scale = parts.scale;
14738 switch (parts.seg)
14740 case SEG_DEFAULT:
14741 break;
14742 case SEG_FS:
14743 case SEG_GS:
14744 if (ASSEMBLER_DIALECT == ASM_ATT)
14745 putc ('%', file);
14746 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14747 break;
14748 default:
14749 gcc_unreachable ();
14752 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14753 if (TARGET_64BIT && !base && !index)
14755 rtx symbol = disp;
14757 if (GET_CODE (disp) == CONST
14758 && GET_CODE (XEXP (disp, 0)) == PLUS
14759 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14760 symbol = XEXP (XEXP (disp, 0), 0);
14762 if (GET_CODE (symbol) == LABEL_REF
14763 || (GET_CODE (symbol) == SYMBOL_REF
14764 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14765 	      base = pc_rtx;
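	      /* With BASE set to pc_rtx the operand is later printed as a
		 RIP-relative reference, e.g. "sym(%rip)" in AT&T syntax or
		 "sym[rip]" in Intel syntax.  */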
14767 if (!base && !index)
14769 	      /* A displacement-only address requires special attention.  */
14771 if (CONST_INT_P (disp))
14773 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14774 fputs ("ds:", file);
14775 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14777 else if (flag_pic)
14778 output_pic_addr_const (file, disp, 0);
14779 else
14780 output_addr_const (file, disp);
14782 else
14784 /* Print SImode register names to force addr32 prefix. */
14785 if (SImode_address_operand (addr, VOIDmode))
14787 #ifdef ENABLE_CHECKING
14788 gcc_assert (TARGET_64BIT);
14789 switch (GET_CODE (addr))
14791 case SUBREG:
14792 gcc_assert (GET_MODE (addr) == SImode);
14793 gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode);
14794 break;
14795 case ZERO_EXTEND:
14796 case AND:
14797 gcc_assert (GET_MODE (addr) == DImode);
14798 break;
14799 default:
14800 gcc_unreachable ();
14802 #endif
14803 gcc_assert (!code);
14804 code = 'k';
14806 else if (code == 0
14807 && TARGET_X32
14808 && disp
14809 && CONST_INT_P (disp)
14810 && INTVAL (disp) < -16*1024*1024)
14812 /* X32 runs in 64-bit mode, where displacement, DISP, in
14813 address DISP(%r64), is encoded as 32-bit immediate sign-
14814 extended from 32-bit to 64-bit. For -0x40000300(%r64),
14815 address is %r64 + 0xffffffffbffffd00. When %r64 <
14816 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64,
14817 which is invalid for x32. The correct address is %r64
14818 - 0x40000300 == 0xf7ffdd64. To properly encode
14819 -0x40000300(%r64) for x32, we zero-extend negative
14820 displacement by forcing addr32 prefix which truncates
14821 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should
14822 zero-extend all negative displacements, including -1(%rsp).
14823 However, for small negative displacements, sign-extension
14824 		 won't cause overflow.  We only zero-extend negative
14825 		 displacements if they are less than -16*1024*1024, which is
14826 		 also the bound used to check legitimate address displacements
		 for PIC.  */
14827 code = 'k';
14830 if (ASSEMBLER_DIALECT == ASM_ATT)
14832 if (disp)
14834 if (flag_pic)
14835 output_pic_addr_const (file, disp, 0);
14836 else if (GET_CODE (disp) == LABEL_REF)
14837 output_asm_label (disp);
14838 else
14839 output_addr_const (file, disp);
14842 putc ('(', file);
14843 if (base)
14844 print_reg (base, code, file);
14845 if (index)
14847 putc (',', file);
14848 print_reg (index, vsib ? 0 : code, file);
14849 if (scale != 1 || vsib)
14850 fprintf (file, ",%d", scale);
14852 putc (')', file);
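	  /* The AT&T form printed above is "disp(base,index,scale)",
	     e.g. "8(%rax,%rbx,4)" for base rax, index rbx, scale 4, disp 8.  */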
14854 else
14856 rtx offset = NULL_RTX;
14858 if (disp)
14860 /* Pull out the offset of a symbol; print any symbol itself. */
14861 if (GET_CODE (disp) == CONST
14862 && GET_CODE (XEXP (disp, 0)) == PLUS
14863 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14865 offset = XEXP (XEXP (disp, 0), 1);
14866 disp = gen_rtx_CONST (VOIDmode,
14867 XEXP (XEXP (disp, 0), 0));
14870 if (flag_pic)
14871 output_pic_addr_const (file, disp, 0);
14872 else if (GET_CODE (disp) == LABEL_REF)
14873 output_asm_label (disp);
14874 else if (CONST_INT_P (disp))
14875 offset = disp;
14876 else
14877 output_addr_const (file, disp);
14880 putc ('[', file);
14881 if (base)
14883 print_reg (base, code, file);
14884 if (offset)
14886 if (INTVAL (offset) >= 0)
14887 putc ('+', file);
14888 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14891 else if (offset)
14892 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14893 else
14894 putc ('0', file);
14896 if (index)
14898 putc ('+', file);
14899 print_reg (index, vsib ? 0 : code, file);
14900 if (scale != 1 || vsib)
14901 fprintf (file, "*%d", scale);
14903 putc (']', file);
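	  /* The Intel form printed above is "[base+disp+index*scale]",
	     e.g. "[rax+8+rbx*4]" for the same address.  */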
14908 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14910 static bool
14911 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14913 rtx op;
14915 if (GET_CODE (x) != UNSPEC)
14916 return false;
14918 op = XVECEXP (x, 0, 0);
14919 switch (XINT (x, 1))
14921 case UNSPEC_GOTTPOFF:
14922 output_addr_const (file, op);
14923 /* FIXME: This might be @TPOFF in Sun ld. */
14924 fputs ("@gottpoff", file);
14925 break;
14926 case UNSPEC_TPOFF:
14927 output_addr_const (file, op);
14928 fputs ("@tpoff", file);
14929 break;
14930 case UNSPEC_NTPOFF:
14931 output_addr_const (file, op);
14932 if (TARGET_64BIT)
14933 fputs ("@tpoff", file);
14934 else
14935 fputs ("@ntpoff", file);
14936 break;
14937 case UNSPEC_DTPOFF:
14938 output_addr_const (file, op);
14939 fputs ("@dtpoff", file);
14940 break;
14941 case UNSPEC_GOTNTPOFF:
14942 output_addr_const (file, op);
14943 if (TARGET_64BIT)
14944 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14945 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14946 else
14947 fputs ("@gotntpoff", file);
14948 break;
14949 case UNSPEC_INDNTPOFF:
14950 output_addr_const (file, op);
14951 fputs ("@indntpoff", file);
14952 break;
14953 #if TARGET_MACHO
14954 case UNSPEC_MACHOPIC_OFFSET:
14955 output_addr_const (file, op);
14956 putc ('-', file);
14957 machopic_output_function_base_name (file);
14958 break;
14959 #endif
14961 case UNSPEC_STACK_CHECK:
14963 int offset;
14965 gcc_assert (flag_split_stack);
14967 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14968 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14969 #else
14970 gcc_unreachable ();
14971 #endif
14973 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14975 break;
14977 default:
14978 return false;
14981 return true;
14984 /* Split one or more double-mode RTL references into pairs of half-mode
14985 references. The RTL can be REG, offsettable MEM, integer constant, or
14986 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14987 split and "num" is its length. lo_half and hi_half are output arrays
14988 that parallel "operands". */
14990 void
14991 split_double_mode (enum machine_mode mode, rtx operands[],
14992 int num, rtx lo_half[], rtx hi_half[])
14994 enum machine_mode half_mode;
14995 unsigned int byte;
14997 switch (mode)
14999 case TImode:
15000 half_mode = DImode;
15001 break;
15002 case DImode:
15003 half_mode = SImode;
15004 break;
15005 default:
15006 gcc_unreachable ();
15009 byte = GET_MODE_SIZE (half_mode);
15011 while (num--)
15013 rtx op = operands[num];
15015       /* simplify_subreg refuses to split volatile memory addresses,
15016 	 but we still have to handle them.  */
15017 if (MEM_P (op))
15019 lo_half[num] = adjust_address (op, half_mode, 0);
15020 hi_half[num] = adjust_address (op, half_mode, byte);
15022 else
15024 lo_half[num] = simplify_gen_subreg (half_mode, op,
15025 GET_MODE (op) == VOIDmode
15026 ? mode : GET_MODE (op), 0);
15027 hi_half[num] = simplify_gen_subreg (half_mode, op,
15028 GET_MODE (op) == VOIDmode
15029 ? mode : GET_MODE (op), byte);
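      /* In either case the low half lives at byte offset 0 and the high half
	 at offset GET_MODE_SIZE (half_mode), e.g. 4 for a DImode->SImode
	 split, matching the little-endian layout of this target.  */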
15034 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
15035 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
15036 is the expression of the binary operation. The output may either be
15037 emitted here, or returned to the caller, like all output_* functions.
15039 There is no guarantee that the operands are the same mode, as they
15040 might be within FLOAT or FLOAT_EXTEND expressions. */
15042 #ifndef SYSV386_COMPAT
15043 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
15044 wants to fix the assemblers because that causes incompatibility
15045 with gcc. No-one wants to fix gcc because that causes
15046 incompatibility with assemblers... You can use the option of
15047 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
15048 #define SYSV386_COMPAT 1
15049 #endif
15051 const char *
15052 output_387_binary_op (rtx insn, rtx *operands)
15054 static char buf[40];
15055 const char *p;
15056 const char *ssep;
15057 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
15059 #ifdef ENABLE_CHECKING
15060   /* Even if we do not want to check the inputs, this documents the input
15061      constraints, which helps in understanding the following code.  */
15062 if (STACK_REG_P (operands[0])
15063 && ((REG_P (operands[1])
15064 && REGNO (operands[0]) == REGNO (operands[1])
15065 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
15066 || (REG_P (operands[2])
15067 && REGNO (operands[0]) == REGNO (operands[2])
15068 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
15069 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
15070 ; /* ok */
15071 else
15072 gcc_assert (is_sse);
15073 #endif
15075 switch (GET_CODE (operands[3]))
15077 case PLUS:
15078 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15079 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15080 p = "fiadd";
15081 else
15082 p = "fadd";
15083 ssep = "vadd";
15084 break;
15086 case MINUS:
15087 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15088 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15089 p = "fisub";
15090 else
15091 p = "fsub";
15092 ssep = "vsub";
15093 break;
15095 case MULT:
15096 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15097 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15098 p = "fimul";
15099 else
15100 p = "fmul";
15101 ssep = "vmul";
15102 break;
15104 case DIV:
15105 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
15106 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
15107 p = "fidiv";
15108 else
15109 p = "fdiv";
15110 ssep = "vdiv";
15111 break;
15113 default:
15114 gcc_unreachable ();
15117 if (is_sse)
15119 if (TARGET_AVX)
15121 strcpy (buf, ssep);
15122 if (GET_MODE (operands[0]) == SFmode)
15123 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
15124 else
15125 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
15127 else
15129 strcpy (buf, ssep + 1);
15130 if (GET_MODE (operands[0]) == SFmode)
15131 strcat (buf, "ss\t{%2, %0|%0, %2}");
15132 else
15133 strcat (buf, "sd\t{%2, %0|%0, %2}");
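      /* E.g. an SFmode PLUS yields "vaddss\t{%2, %1, %0|%0, %1, %2}" with
	 AVX and "addss\t{%2, %0|%0, %2}" without it.  */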
15135 return buf;
15137 strcpy (buf, p);
15139 switch (GET_CODE (operands[3]))
15141 case MULT:
15142 case PLUS:
15143 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
15145 rtx temp = operands[2];
15146 operands[2] = operands[1];
15147 operands[1] = temp;
15150       /* We know operands[0] == operands[1].  */
15152 if (MEM_P (operands[2]))
15154 p = "%Z2\t%2";
15155 break;
15158 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15160 if (STACK_TOP_P (operands[0]))
15161 /* How is it that we are storing to a dead operand[2]?
15162 Well, presumably operands[1] is dead too. We can't
15163 store the result to st(0) as st(0) gets popped on this
15164 instruction. Instead store to operands[2] (which I
15165 think has to be st(1)). st(1) will be popped later.
15166 gcc <= 2.8.1 didn't have this check and generated
15167 assembly code that the Unixware assembler rejected. */
15168 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15169 else
15170 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15171 break;
15174 if (STACK_TOP_P (operands[0]))
15175 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15176 else
15177 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15178 break;
15180 case MINUS:
15181 case DIV:
15182 if (MEM_P (operands[1]))
15184 p = "r%Z1\t%1";
15185 break;
15188 if (MEM_P (operands[2]))
15190 p = "%Z2\t%2";
15191 break;
15194 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
15196 #if SYSV386_COMPAT
15197 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
15198 derived assemblers, confusingly reverse the direction of
15199 the operation for fsub{r} and fdiv{r} when the
15200 destination register is not st(0). The Intel assembler
15201 doesn't have this brain damage. Read !SYSV386_COMPAT to
15202 figure out what the hardware really does. */
15203 if (STACK_TOP_P (operands[0]))
15204 p = "{p\t%0, %2|rp\t%2, %0}";
15205 else
15206 p = "{rp\t%2, %0|p\t%0, %2}";
15207 #else
15208 if (STACK_TOP_P (operands[0]))
15209 /* As above for fmul/fadd, we can't store to st(0). */
15210 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
15211 else
15212 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
15213 #endif
15214 break;
15217 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
15219 #if SYSV386_COMPAT
15220 if (STACK_TOP_P (operands[0]))
15221 p = "{rp\t%0, %1|p\t%1, %0}";
15222 else
15223 p = "{p\t%1, %0|rp\t%0, %1}";
15224 #else
15225 if (STACK_TOP_P (operands[0]))
15226 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
15227 else
15228 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
15229 #endif
15230 break;
15233 if (STACK_TOP_P (operands[0]))
15235 if (STACK_TOP_P (operands[1]))
15236 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
15237 else
15238 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
15239 break;
15241 else if (STACK_TOP_P (operands[1]))
15243 #if SYSV386_COMPAT
15244 p = "{\t%1, %0|r\t%0, %1}";
15245 #else
15246 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
15247 #endif
15249 else
15251 #if SYSV386_COMPAT
15252 p = "{r\t%2, %0|\t%0, %2}";
15253 #else
15254 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
15255 #endif
15257 break;
15259 default:
15260 gcc_unreachable ();
15263 strcat (buf, p);
15264 return buf;
15267 /* Check if a 256bit AVX register is referenced inside EXP.  */
15269 static int
15270 ix86_check_avx256_register (rtx *pexp, void *data ATTRIBUTE_UNUSED)
15272 rtx exp = *pexp;
15274 if (GET_CODE (exp) == SUBREG)
15275 exp = SUBREG_REG (exp);
15277 if (REG_P (exp)
15278 && VALID_AVX256_REG_OR_OI_MODE (GET_MODE (exp)))
15279 return 1;
15281 return 0;
15284 /* Return needed mode for entity in optimize_mode_switching pass. */
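/* For the AVX_U128 entity, AVX_U128_DIRTY means the upper 128 bits of the
   ymm registers may hold nonzero data (so a vzeroupper is wanted before
   legacy SSE code or calls), while AVX_U128_CLEAN means they are known to
   be zero.  */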
15286 static int
15287 ix86_avx_u128_mode_needed (rtx insn)
15289 if (CALL_P (insn))
15291 rtx link;
15293 /* Needed mode is set to AVX_U128_CLEAN if there are
15294 no 256bit modes used in function arguments. */
15295 for (link = CALL_INSN_FUNCTION_USAGE (insn);
15296 link;
15297 link = XEXP (link, 1))
15299 if (GET_CODE (XEXP (link, 0)) == USE)
15301 rtx arg = XEXP (XEXP (link, 0), 0);
15303 if (ix86_check_avx256_register (&arg, NULL))
15304 return AVX_U128_DIRTY;
15308 return AVX_U128_CLEAN;
15311   /* Require DIRTY mode if a 256bit AVX register is referenced.  The hardware
15312      changes state only when a 256bit register is written to, but we need
15313      to prevent the compiler from moving the optimal insertion point above
15314      an eventual read from a 256bit register.  */
15315 if (for_each_rtx (&PATTERN (insn), ix86_check_avx256_register, NULL))
15316 return AVX_U128_DIRTY;
15318 return AVX_U128_ANY;
15321 /* Return mode that i387 must be switched into
15322 prior to the execution of insn. */
15324 static int
15325 ix86_i387_mode_needed (int entity, rtx insn)
15327 enum attr_i387_cw mode;
15329   /* The mode UNINITIALIZED is used to store the control word after a
15330      function call or ASM pattern.  The mode ANY specifies that the function
15331      has no requirements on the control word and makes no changes in the
15332      bits we are interested in.  */
15334 if (CALL_P (insn)
15335 || (NONJUMP_INSN_P (insn)
15336 && (asm_noperands (PATTERN (insn)) >= 0
15337 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
15338 return I387_CW_UNINITIALIZED;
15340 if (recog_memoized (insn) < 0)
15341 return I387_CW_ANY;
15343 mode = get_attr_i387_cw (insn);
15345 switch (entity)
15347 case I387_TRUNC:
15348 if (mode == I387_CW_TRUNC)
15349 return mode;
15350 break;
15352 case I387_FLOOR:
15353 if (mode == I387_CW_FLOOR)
15354 return mode;
15355 break;
15357 case I387_CEIL:
15358 if (mode == I387_CW_CEIL)
15359 return mode;
15360 break;
15362 case I387_MASK_PM:
15363 if (mode == I387_CW_MASK_PM)
15364 return mode;
15365 break;
15367 default:
15368 gcc_unreachable ();
15371 return I387_CW_ANY;
15374 /* Return mode that entity must be switched into
15375 prior to the execution of insn. */
15378 ix86_mode_needed (int entity, rtx insn)
15380 switch (entity)
15382 case AVX_U128:
15383 return ix86_avx_u128_mode_needed (insn);
15384 case I387_TRUNC:
15385 case I387_FLOOR:
15386 case I387_CEIL:
15387 case I387_MASK_PM:
15388 return ix86_i387_mode_needed (entity, insn);
15389 default:
15390 gcc_unreachable ();
15392 return 0;
15395 /* Check if a 256bit AVX register is referenced in stores. */
15397 static void
15398 ix86_check_avx256_stores (rtx dest, const_rtx set ATTRIBUTE_UNUSED, void *data)
15400 if (ix86_check_avx256_register (&dest, NULL))
15402 bool *used = (bool *) data;
15403 *used = true;
15407 /* Calculate mode of upper 128bit AVX registers after the insn. */
15409 static int
15410 ix86_avx_u128_mode_after (int mode, rtx insn)
15412 rtx pat = PATTERN (insn);
15414 if (vzeroupper_operation (pat, VOIDmode)
15415 || vzeroall_operation (pat, VOIDmode))
15416 return AVX_U128_CLEAN;
15418   /* We know that the state is clean after a CALL insn if the function
15419      return value does not use a 256bit register.  */
15420 if (CALL_P (insn))
15422 bool avx_reg256_found = false;
15423 note_stores (pat, ix86_check_avx256_stores, &avx_reg256_found);
15425 return avx_reg256_found ? AVX_U128_DIRTY : AVX_U128_CLEAN;
15428 /* Otherwise, return current mode. Remember that if insn
15429 references AVX 256bit registers, the mode was already changed
15430 to DIRTY from MODE_NEEDED. */
15431 return mode;
15434 /* Return the mode that an insn results in. */
15437 ix86_mode_after (int entity, int mode, rtx insn)
15439 switch (entity)
15441 case AVX_U128:
15442 return ix86_avx_u128_mode_after (mode, insn);
15443 case I387_TRUNC:
15444 case I387_FLOOR:
15445 case I387_CEIL:
15446 case I387_MASK_PM:
15447 return mode;
15448 default:
15449 gcc_unreachable ();
15453 static int
15454 ix86_avx_u128_mode_entry (void)
15456 tree arg;
15458 /* Entry mode is set to AVX_U128_DIRTY if there are
15459 256bit modes used in function arguments. */
15460 for (arg = DECL_ARGUMENTS (current_function_decl); arg;
15461 arg = TREE_CHAIN (arg))
15463 rtx incoming = DECL_INCOMING_RTL (arg);
15465 if (incoming && ix86_check_avx256_register (&incoming, NULL))
15466 return AVX_U128_DIRTY;
15469 return AVX_U128_CLEAN;
15472 /* Return a mode that ENTITY is assumed to be
15473 switched to at function entry. */
15476 ix86_mode_entry (int entity)
15478 switch (entity)
15480 case AVX_U128:
15481 return ix86_avx_u128_mode_entry ();
15482 case I387_TRUNC:
15483 case I387_FLOOR:
15484 case I387_CEIL:
15485 case I387_MASK_PM:
15486 return I387_CW_ANY;
15487 default:
15488 gcc_unreachable ();
15492 static int
15493 ix86_avx_u128_mode_exit (void)
15495 rtx reg = crtl->return_rtx;
15497 /* Exit mode is set to AVX_U128_DIRTY if there are
15498 256bit modes used in the function return register. */
15499 if (reg && ix86_check_avx256_register (&reg, NULL))
15500 return AVX_U128_DIRTY;
15502 return AVX_U128_CLEAN;
15505 /* Return a mode that ENTITY is assumed to be
15506 switched to at function exit. */
15509 ix86_mode_exit (int entity)
15511 switch (entity)
15513 case AVX_U128:
15514 return ix86_avx_u128_mode_exit ();
15515 case I387_TRUNC:
15516 case I387_FLOOR:
15517 case I387_CEIL:
15518 case I387_MASK_PM:
15519 return I387_CW_ANY;
15520 default:
15521 gcc_unreachable ();
15525 /* Output code to initialize control word copies used by trunc?f?i and
15526 rounding patterns. CURRENT_MODE is set to current control word,
15527 while NEW_MODE is set to new control word. */
15529 static void
15530 emit_i387_cw_initialization (int mode)
15532 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
15533 rtx new_mode;
15535 enum ix86_stack_slot slot;
15537 rtx reg = gen_reg_rtx (HImode);
15539 emit_insn (gen_x86_fnstcw_1 (stored_mode));
15540 emit_move_insn (reg, copy_rtx (stored_mode));
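  /* The x87 control word rounding field is bits 10-11 (mask 0x0c00):
     00 = nearest, 01 (0x0400) = down, 10 (0x0800) = up, 11 (0x0c00) =
     truncate.  Bit 5 (0x0020) masks the precision exception, which is
     what nearbyint() needs.  */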
15542 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
15543 || optimize_function_for_size_p (cfun))
15545 switch (mode)
15547 case I387_CW_TRUNC:
15548 /* round toward zero (truncate) */
15549 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
15550 slot = SLOT_CW_TRUNC;
15551 break;
15553 case I387_CW_FLOOR:
15554 /* round down toward -oo */
15555 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15556 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
15557 slot = SLOT_CW_FLOOR;
15558 break;
15560 case I387_CW_CEIL:
15561 /* round up toward +oo */
15562 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
15563 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
15564 slot = SLOT_CW_CEIL;
15565 break;
15567 case I387_CW_MASK_PM:
15568 /* mask precision exception for nearbyint() */
15569 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15570 slot = SLOT_CW_MASK_PM;
15571 break;
15573 default:
15574 gcc_unreachable ();
15577 else
15579 switch (mode)
15581 case I387_CW_TRUNC:
15582 /* round toward zero (truncate) */
15583 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
15584 slot = SLOT_CW_TRUNC;
15585 break;
15587 case I387_CW_FLOOR:
15588 /* round down toward -oo */
15589 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
15590 slot = SLOT_CW_FLOOR;
15591 break;
15593 case I387_CW_CEIL:
15594 /* round up toward +oo */
15595 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
15596 slot = SLOT_CW_CEIL;
15597 break;
15599 case I387_CW_MASK_PM:
15600 /* mask precision exception for nearbyint() */
15601 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
15602 slot = SLOT_CW_MASK_PM;
15603 break;
15605 default:
15606 gcc_unreachable ();
15610 gcc_assert (slot < MAX_386_STACK_LOCALS);
15612 new_mode = assign_386_stack_local (HImode, slot);
15613 emit_move_insn (new_mode, reg);
15616 /* Emit vzeroupper. */
15618 void
15619 ix86_avx_emit_vzeroupper (HARD_REG_SET regs_live)
15621 int i;
15623 /* Cancel automatic vzeroupper insertion if there are
15624 live call-saved SSE registers at the insertion point. */
15626 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
15627 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15628 return;
15630 if (TARGET_64BIT)
15631 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
15632 if (TEST_HARD_REG_BIT (regs_live, i) && !call_used_regs[i])
15633 return;
15635 emit_insn (gen_avx_vzeroupper ());
15638 /* Generate one or more insns to set ENTITY to MODE. */
15640 void
15641 ix86_emit_mode_set (int entity, int mode, HARD_REG_SET regs_live)
15643 switch (entity)
15645 case AVX_U128:
15646 if (mode == AVX_U128_CLEAN)
15647 ix86_avx_emit_vzeroupper (regs_live);
15648 break;
15649 case I387_TRUNC:
15650 case I387_FLOOR:
15651 case I387_CEIL:
15652 case I387_MASK_PM:
15653 if (mode != I387_CW_ANY
15654 && mode != I387_CW_UNINITIALIZED)
15655 emit_i387_cw_initialization (mode);
15656 break;
15657 default:
15658 gcc_unreachable ();
15662 /* Output code for INSN to convert a float to a signed int. OPERANDS
15663 are the insn operands. The output may be [HSD]Imode and the input
15664 operand may be [SDX]Fmode. */
15666 const char *
15667 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
15669 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15670 int dimode_p = GET_MODE (operands[0]) == DImode;
15671 int round_mode = get_attr_i387_cw (insn);
15673 /* Jump through a hoop or two for DImode, since the hardware has no
15674 non-popping instruction. We used to do this a different way, but
15675 that was somewhat fragile and broke with post-reload splitters. */
15676 if ((dimode_p || fisttp) && !stack_top_dies)
15677 output_asm_insn ("fld\t%y1", operands);
15679 gcc_assert (STACK_TOP_P (operands[1]));
15680 gcc_assert (MEM_P (operands[0]));
15681 gcc_assert (GET_MODE (operands[1]) != TFmode);
15683 if (fisttp)
15684 output_asm_insn ("fisttp%Z0\t%0", operands);
15685 else
15687 if (round_mode != I387_CW_ANY)
15688 output_asm_insn ("fldcw\t%3", operands);
15689 if (stack_top_dies || dimode_p)
15690 output_asm_insn ("fistp%Z0\t%0", operands);
15691 else
15692 output_asm_insn ("fist%Z0\t%0", operands);
15693 if (round_mode != I387_CW_ANY)
15694 output_asm_insn ("fldcw\t%2", operands);
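      /* For an SImode destination this emits, when the rounding mode must
	 be changed, a sequence like "fldcw %3" / "fistpl %0" / "fldcw %2":
	 load the required control word, store the integer, then restore
	 the saved control word.  */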
15697 return "";
15700 /* Output code for x87 ffreep insn. The OPNO argument, which may only
15701 have the values zero or one, indicates the ffreep insn's operand
15702 from the OPERANDS array. */
15704 static const char *
15705 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
15707 if (TARGET_USE_FFREEP)
15708 #ifdef HAVE_AS_IX86_FFREEP
15709 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
15710 #else
15712 static char retval[32];
15713 int regno = REGNO (operands[opno]);
15715 gcc_assert (STACK_REGNO_P (regno));
15717 regno -= FIRST_STACK_REG;
15719 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
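  /* This emits the raw two-byte opcode: e.g. regno 0 gives ASM_SHORT
     "0xc0df", whose little-endian bytes 0xdf 0xc0 encode "ffreep %st(0)"
     for assemblers that lack the mnemonic.  */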
15720 return retval;
15722 #endif
15724 return opno ? "fstp\t%y1" : "fstp\t%y0";
15728 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
15729 should be used. UNORDERED_P is true when fucom should be used. */
15731 const char *
15732 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
15734 int stack_top_dies;
15735 rtx cmp_op0, cmp_op1;
15736 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
15738 if (eflags_p)
15740 cmp_op0 = operands[0];
15741 cmp_op1 = operands[1];
15743 else
15745 cmp_op0 = operands[1];
15746 cmp_op1 = operands[2];
15749 if (is_sse)
15751 if (GET_MODE (operands[0]) == SFmode)
15752 if (unordered_p)
15753 return "%vucomiss\t{%1, %0|%0, %1}";
15754 else
15755 return "%vcomiss\t{%1, %0|%0, %1}";
15756 else
15757 if (unordered_p)
15758 return "%vucomisd\t{%1, %0|%0, %1}";
15759 else
15760 return "%vcomisd\t{%1, %0|%0, %1}";
15763 gcc_assert (STACK_TOP_P (cmp_op0));
15765 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
15767 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
15769 if (stack_top_dies)
15771 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
15772 return output_387_ffreep (operands, 1);
15774 else
15775 return "ftst\n\tfnstsw\t%0";
15778 if (STACK_REG_P (cmp_op1)
15779 && stack_top_dies
15780 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
15781 && REGNO (cmp_op1) != FIRST_STACK_REG)
15783       /* If the top of the 387 stack dies, and the other operand
15784 	 is also a stack register that dies, then this must be a
15785 	 `fcompp' float compare.  */
15787 if (eflags_p)
15789 /* There is no double popping fcomi variant. Fortunately,
15790 eflags is immune from the fstp's cc clobbering. */
15791 if (unordered_p)
15792 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
15793 else
15794 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
15795 return output_387_ffreep (operands, 0);
15797 else
15799 if (unordered_p)
15800 return "fucompp\n\tfnstsw\t%0";
15801 else
15802 return "fcompp\n\tfnstsw\t%0";
15805 else
15807 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
15809 static const char * const alt[16] =
15811 "fcom%Z2\t%y2\n\tfnstsw\t%0",
15812 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
15813 "fucom%Z2\t%y2\n\tfnstsw\t%0",
15814 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
15816 "ficom%Z2\t%y2\n\tfnstsw\t%0",
15817 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
15818 NULL,
15819 NULL,
15821 "fcomi\t{%y1, %0|%0, %y1}",
15822 "fcomip\t{%y1, %0|%0, %y1}",
15823 "fucomi\t{%y1, %0|%0, %y1}",
15824 "fucomip\t{%y1, %0|%0, %y1}",
15826 NULL,
15827 NULL,
15828 NULL,
15829 NULL
15832 int mask;
15833 const char *ret;
15835 mask = eflags_p << 3;
15836 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
15837 mask |= unordered_p << 1;
15838 mask |= stack_top_dies;
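      /* E.g. eflags_p = 1, an FP operand, unordered_p = 1 and a dying
	 stack top give mask 0b1011 = 11, selecting "fucomip".  */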
15840 gcc_assert (mask < 16);
15841 ret = alt[mask];
15842 gcc_assert (ret);
15844 return ret;
15848 void
15849 ix86_output_addr_vec_elt (FILE *file, int value)
15851 const char *directive = ASM_LONG;
15853 #ifdef ASM_QUAD
15854 if (TARGET_LP64)
15855 directive = ASM_QUAD;
15856 #else
15857 gcc_assert (!TARGET_64BIT);
15858 #endif
15860 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
15863 void
15864 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
15866 const char *directive = ASM_LONG;
15868 #ifdef ASM_QUAD
15869 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
15870 directive = ASM_QUAD;
15871 #else
15872 gcc_assert (!TARGET_64BIT);
15873 #endif
15874 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
15875 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
15876 fprintf (file, "%s%s%d-%s%d\n",
15877 directive, LPREFIX, value, LPREFIX, rel);
15878 else if (HAVE_AS_GOTOFF_IN_DATA)
15879 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
15880 #if TARGET_MACHO
15881 else if (TARGET_MACHO)
15883 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
15884 machopic_output_function_base_name (file);
15885 putc ('\n', file);
15887 #endif
15888 else
15889 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
15890 GOT_SYMBOL_NAME, LPREFIX, value);
15893 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
15894 for the target. */
15896 void
15897 ix86_expand_clear (rtx dest)
15899 rtx tmp;
15901 /* We play register width games, which are only valid after reload. */
15902 gcc_assert (reload_completed);
15904 /* Avoid HImode and its attendant prefix byte. */
15905 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
15906 dest = gen_rtx_REG (SImode, REGNO (dest));
15907 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
15909 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
15910 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
15912 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15913 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
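      /* The xor form (e.g. "xorl %eax, %eax") is shorter than
	 "movl $0, %eax" but clobbers the flags, hence the explicit
	 clobber of FLAGS_REG.  */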
15916 emit_insn (tmp);
15919 /* X is an unchanging MEM. If it is a constant pool reference, return
15920 the constant pool rtx, else NULL. */
15923 maybe_get_pool_constant (rtx x)
15925 x = ix86_delegitimize_address (XEXP (x, 0));
15927 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
15928 return get_pool_constant (x);
15930 return NULL_RTX;
15933 void
15934 ix86_expand_move (enum machine_mode mode, rtx operands[])
15936 rtx op0, op1;
15937 enum tls_model model;
15939 op0 = operands[0];
15940 op1 = operands[1];
15942 if (GET_CODE (op1) == SYMBOL_REF)
15944 model = SYMBOL_REF_TLS_MODEL (op1);
15945 if (model)
15947 op1 = legitimize_tls_address (op1, model, true);
15948 op1 = force_operand (op1, op0);
15949 if (op1 == op0)
15950 return;
15951 op1 = convert_to_mode (mode, op1, 1);
15953 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15954 && SYMBOL_REF_DLLIMPORT_P (op1))
15955 op1 = legitimize_dllimport_symbol (op1, false);
15957 else if (GET_CODE (op1) == CONST
15958 && GET_CODE (XEXP (op1, 0)) == PLUS
15959 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
15961 rtx addend = XEXP (XEXP (op1, 0), 1);
15962 rtx symbol = XEXP (XEXP (op1, 0), 0);
15963 rtx tmp = NULL;
15965 model = SYMBOL_REF_TLS_MODEL (symbol);
15966 if (model)
15967 tmp = legitimize_tls_address (symbol, model, true);
15968 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
15969 && SYMBOL_REF_DLLIMPORT_P (symbol))
15970 tmp = legitimize_dllimport_symbol (symbol, true);
15972 if (tmp)
15974 tmp = force_operand (tmp, NULL);
15975 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
15976 op0, 1, OPTAB_DIRECT);
15977 if (tmp == op0)
15978 return;
15979 op1 = convert_to_mode (mode, tmp, 1);
15983 if ((flag_pic || MACHOPIC_INDIRECT)
15984 && symbolic_operand (op1, mode))
15986 if (TARGET_MACHO && !TARGET_64BIT)
15988 #if TARGET_MACHO
15989 /* dynamic-no-pic */
15990 if (MACHOPIC_INDIRECT)
15992 rtx temp = ((reload_in_progress
15993 || ((op0 && REG_P (op0))
15994 && mode == Pmode))
15995 ? op0 : gen_reg_rtx (Pmode));
15996 op1 = machopic_indirect_data_reference (op1, temp);
15997 if (MACHOPIC_PURE)
15998 op1 = machopic_legitimize_pic_address (op1, mode,
15999 temp == op1 ? 0 : temp);
16001 if (op0 != op1 && GET_CODE (op0) != MEM)
16003 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
16004 emit_insn (insn);
16005 return;
16007 if (GET_CODE (op0) == MEM)
16008 op1 = force_reg (Pmode, op1);
16009 else
16011 rtx temp = op0;
16012 if (GET_CODE (temp) != REG)
16013 temp = gen_reg_rtx (Pmode);
16014 temp = legitimize_pic_address (op1, temp);
16015 if (temp == op0)
16016 return;
16017 op1 = temp;
16019 /* dynamic-no-pic */
16020 #endif
16022 else
16024 if (MEM_P (op0))
16025 op1 = force_reg (mode, op1);
16026 else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
16028 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
16029 op1 = legitimize_pic_address (op1, reg);
16030 if (op0 == op1)
16031 return;
16032 op1 = convert_to_mode (mode, op1, 1);
16036 else
16038 if (MEM_P (op0)
16039 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
16040 || !push_operand (op0, mode))
16041 && MEM_P (op1))
16042 op1 = force_reg (mode, op1);
16044 if (push_operand (op0, mode)
16045 && ! general_no_elim_operand (op1, mode))
16046 op1 = copy_to_mode_reg (mode, op1);
16048       /* Force large constants in 64bit compilation into a register
16049 	 to get them CSEed.  */
16050 if (can_create_pseudo_p ()
16051 && (mode == DImode) && TARGET_64BIT
16052 && immediate_operand (op1, mode)
16053 && !x86_64_zext_immediate_operand (op1, VOIDmode)
16054 && !register_operand (op0, mode)
16055 && optimize)
16056 op1 = copy_to_mode_reg (mode, op1);
16058 if (can_create_pseudo_p ()
16059 && FLOAT_MODE_P (mode)
16060 && GET_CODE (op1) == CONST_DOUBLE)
16062 /* If we are loading a floating point constant to a register,
16063 force the value to memory now, since we'll get better code
16064 out the back end. */
16066 op1 = validize_mem (force_const_mem (mode, op1));
16067 if (!register_operand (op0, mode))
16069 rtx temp = gen_reg_rtx (mode);
16070 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
16071 emit_move_insn (op0, temp);
16072 return;
16077 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16080 void
16081 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
16083 rtx op0 = operands[0], op1 = operands[1];
16084 unsigned int align = GET_MODE_ALIGNMENT (mode);
16086   /* Force constants other than zero into memory.  We do not know how
16087      the instructions used to build constants modify the upper 64 bits
16088      of the register; once we have that information we may be able
16089      to handle some of them more efficiently.  */
16090 if (can_create_pseudo_p ()
16091 && register_operand (op0, mode)
16092 && (CONSTANT_P (op1)
16093 || (GET_CODE (op1) == SUBREG
16094 && CONSTANT_P (SUBREG_REG (op1))))
16095 && !standard_sse_constant_p (op1))
16096 op1 = validize_mem (force_const_mem (mode, op1));
16098   /* We need to check memory alignment for SSE modes since attributes
16099      can make operands unaligned.  */
16100 if (can_create_pseudo_p ()
16101 && SSE_REG_MODE_P (mode)
16102 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
16103 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
16105 rtx tmp[2];
16107 /* ix86_expand_vector_move_misalign() does not like constants ... */
16108 if (CONSTANT_P (op1)
16109 || (GET_CODE (op1) == SUBREG
16110 && CONSTANT_P (SUBREG_REG (op1))))
16111 op1 = validize_mem (force_const_mem (mode, op1));
16113 /* ... nor both arguments in memory. */
16114 if (!register_operand (op0, mode)
16115 && !register_operand (op1, mode))
16116 op1 = force_reg (mode, op1);
16118 tmp[0] = op0; tmp[1] = op1;
16119 ix86_expand_vector_move_misalign (mode, tmp);
16120 return;
16123 /* Make operand1 a register if it isn't already. */
16124 if (can_create_pseudo_p ()
16125 && !register_operand (op0, mode)
16126 && !register_operand (op1, mode))
16128 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
16129 return;
16132 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
16135 /* Split 32-byte AVX unaligned load and store if needed. */
16137 static void
16138 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
16140 rtx m;
16141 rtx (*extract) (rtx, rtx, rtx);
16142 rtx (*load_unaligned) (rtx, rtx);
16143 rtx (*store_unaligned) (rtx, rtx);
16144 enum machine_mode mode;
16146 switch (GET_MODE (op0))
16148 default:
16149 gcc_unreachable ();
16150 case V32QImode:
16151 extract = gen_avx_vextractf128v32qi;
16152 load_unaligned = gen_avx_loaddqu256;
16153 store_unaligned = gen_avx_storedqu256;
16154 mode = V16QImode;
16155 break;
16156 case V8SFmode:
16157 extract = gen_avx_vextractf128v8sf;
16158 load_unaligned = gen_avx_loadups256;
16159 store_unaligned = gen_avx_storeups256;
16160 mode = V4SFmode;
16161 break;
16162 case V4DFmode:
16163 extract = gen_avx_vextractf128v4df;
16164 load_unaligned = gen_avx_loadupd256;
16165 store_unaligned = gen_avx_storeupd256;
16166 mode = V2DFmode;
16167 break;
16170 if (MEM_P (op1))
16172 if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
16174 rtx r = gen_reg_rtx (mode);
16175 m = adjust_address (op1, mode, 0);
16176 emit_move_insn (r, m);
16177 m = adjust_address (op1, mode, 16);
16178 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
16179 emit_move_insn (op0, r);
16181 else
16182 emit_insn (load_unaligned (op0, op1));
16184 else if (MEM_P (op0))
16186 if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
16188 m = adjust_address (op0, mode, 0);
16189 emit_insn (extract (m, op1, const0_rtx));
16190 m = adjust_address (op0, mode, 16);
16191 emit_insn (extract (m, op1, const1_rtx));
16193 else
16194 emit_insn (store_unaligned (op0, op1));
16196 else
16197 gcc_unreachable ();
16200 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
16201 straight to ix86_expand_vector_move. */
16202 /* Code generation for scalar reg-reg moves of single and double precision data:
16203 if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
16204 movaps reg, reg
16205 else
16206 movss reg, reg
16207 if (x86_sse_partial_reg_dependency == true)
16208 movapd reg, reg
16209 else
16210 movsd reg, reg
16212 Code generation for scalar loads of double precision data:
16213 if (x86_sse_split_regs == true)
16214 movlpd mem, reg (gas syntax)
16215 else
16216 movsd mem, reg
16218 Code generation for unaligned packed loads of single precision data
16219 (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
16220 if (x86_sse_unaligned_move_optimal)
16221 movups mem, reg
16223 if (x86_sse_partial_reg_dependency == true)
16225 xorps reg, reg
16226 movlps mem, reg
16227 movhps mem+8, reg
16229 else
16231 movlps mem, reg
16232 movhps mem+8, reg
16235 Code generation for unaligned packed loads of double precision data
16236 (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
16237 if (x86_sse_unaligned_move_optimal)
16238 movupd mem, reg
16240 if (x86_sse_split_regs == true)
16242 movlpd mem, reg
16243 movhpd mem+8, reg
16245 else
16247 movsd mem, reg
16248 movhpd mem+8, reg
16252 void
16253 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
16255 rtx op0, op1, m;
16257 op0 = operands[0];
16258 op1 = operands[1];
16260 if (TARGET_AVX
16261 && GET_MODE_SIZE (mode) == 32)
16263 switch (GET_MODE_CLASS (mode))
16265 case MODE_VECTOR_INT:
16266 case MODE_INT:
16267 op0 = gen_lowpart (V32QImode, op0);
16268 op1 = gen_lowpart (V32QImode, op1);
16269 /* FALLTHRU */
16271 case MODE_VECTOR_FLOAT:
16272 ix86_avx256_split_vector_move_misalign (op0, op1);
16273 break;
16275 default:
16276 gcc_unreachable ();
16279 return;
16282 if (MEM_P (op1))
16284 /* ??? If we have typed data, then it would appear that using
16285 movdqu is the only way to get unaligned data loaded with
16286 integer type. */
16287 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16289 op0 = gen_lowpart (V16QImode, op0);
16290 op1 = gen_lowpart (V16QImode, op1);
16291 /* We will eventually emit movups based on insn attributes. */
16292 emit_insn (gen_sse2_loaddqu (op0, op1));
16294 else if (TARGET_SSE2 && mode == V2DFmode)
16296 rtx zero;
16298 if (TARGET_AVX
16299 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16300 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16301 || optimize_function_for_size_p (cfun))
16303 /* We will eventually emit movups based on insn attributes. */
16304 emit_insn (gen_sse2_loadupd (op0, op1));
16305 return;
16308 /* When SSE registers are split into halves, we can avoid
16309 writing to the top half twice. */
16310 if (TARGET_SSE_SPLIT_REGS)
16312 emit_clobber (op0);
16313 zero = op0;
16315 else
16317 /* ??? Not sure about the best option for the Intel chips.
16318 The following would seem to satisfy; the register is
16319 entirely cleared, breaking the dependency chain. We
16320 then store to the upper half, with a dependency depth
16321 of one. A rumor has it that Intel recommends two movsd
16322 followed by an unpacklpd, but this is unconfirmed. And
16323 given that the dependency depth of the unpacklpd would
16324 still be one, I'm not sure why this would be better. */
16325 zero = CONST0_RTX (V2DFmode);
16328 m = adjust_address (op1, DFmode, 0);
16329 emit_insn (gen_sse2_loadlpd (op0, zero, m));
16330 m = adjust_address (op1, DFmode, 8);
16331 emit_insn (gen_sse2_loadhpd (op0, op0, m));
16333 else
16335 if (TARGET_AVX
16336 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
16337 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16338 || optimize_function_for_size_p (cfun))
16340 op0 = gen_lowpart (V4SFmode, op0);
16341 op1 = gen_lowpart (V4SFmode, op1);
16342 emit_insn (gen_sse_loadups (op0, op1));
16343 return;
16346 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
16347 emit_move_insn (op0, CONST0_RTX (mode));
16348 else
16349 emit_clobber (op0);
16351 if (mode != V4SFmode)
16352 op0 = gen_lowpart (V4SFmode, op0);
16354 m = adjust_address (op1, V2SFmode, 0);
16355 emit_insn (gen_sse_loadlps (op0, op0, m));
16356 m = adjust_address (op1, V2SFmode, 8);
16357 emit_insn (gen_sse_loadhps (op0, op0, m));
16360 else if (MEM_P (op0))
16362 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
16364 op0 = gen_lowpart (V16QImode, op0);
16365 op1 = gen_lowpart (V16QImode, op1);
16366 /* We will eventually emit movups based on insn attributes. */
16367 emit_insn (gen_sse2_storedqu (op0, op1));
16369 else if (TARGET_SSE2 && mode == V2DFmode)
16371 if (TARGET_AVX
16372 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16373 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16374 || optimize_function_for_size_p (cfun))
16375 /* We will eventually emit movups based on insn attributes. */
16376 emit_insn (gen_sse2_storeupd (op0, op1));
16377 else
16379 m = adjust_address (op0, DFmode, 0);
16380 emit_insn (gen_sse2_storelpd (m, op1));
16381 m = adjust_address (op0, DFmode, 8);
16382 emit_insn (gen_sse2_storehpd (m, op1));
16385 else
16387 if (mode != V4SFmode)
16388 op1 = gen_lowpart (V4SFmode, op1);
16390 if (TARGET_AVX
16391 || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
16392 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16393 || optimize_function_for_size_p (cfun))
16395 op0 = gen_lowpart (V4SFmode, op0);
16396 emit_insn (gen_sse_storeups (op0, op1));
16398 else
16400 m = adjust_address (op0, V2SFmode, 0);
16401 emit_insn (gen_sse_storelps (m, op1));
16402 m = adjust_address (op0, V2SFmode, 8);
16403 emit_insn (gen_sse_storehps (m, op1));
16407 else
16408 gcc_unreachable ();
16411 /* Expand a push in MODE. This is some mode for which we do not support
16412 proper push instructions, at least from the registers that we expect
16413 the value to live in. */
16415 void
16416 ix86_expand_push (enum machine_mode mode, rtx x)
16418 rtx tmp;
16420 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
16421 GEN_INT (-GET_MODE_SIZE (mode)),
16422 stack_pointer_rtx, 1, OPTAB_DIRECT);
16423 if (tmp != stack_pointer_rtx)
16424 emit_move_insn (stack_pointer_rtx, tmp);
16426 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
16428   /* When we push an operand onto the stack, it has to be aligned at least
16429      at the function argument boundary.  However, since we don't have
16430      the argument type, we can't determine the actual argument
16431      boundary.  */
16432 emit_move_insn (tmp, x);
16435 /* Helper function of ix86_fixup_binary_operands to canonicalize
16436 operand order. Returns true if the operands should be swapped. */
16438 static bool
16439 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
16440 rtx operands[])
16442 rtx dst = operands[0];
16443 rtx src1 = operands[1];
16444 rtx src2 = operands[2];
16446 /* If the operation is not commutative, we can't do anything. */
16447 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
16448 return false;
16450 /* Highest priority is that src1 should match dst. */
16451 if (rtx_equal_p (dst, src1))
16452 return false;
16453 if (rtx_equal_p (dst, src2))
16454 return true;
16456 /* Next highest priority is that immediate constants come second. */
16457 if (immediate_operand (src2, mode))
16458 return false;
16459 if (immediate_operand (src1, mode))
16460 return true;
16462 /* Lowest priority is that memory references should come second. */
16463 if (MEM_P (src2))
16464 return false;
16465 if (MEM_P (src1))
16466 return true;
16468 return false;
16472 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
16473 destination to use for the operation. If different from the true
16474 destination in operands[0], a copy operation will be required. */
16477 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
16478 rtx operands[])
16480 rtx dst = operands[0];
16481 rtx src1 = operands[1];
16482 rtx src2 = operands[2];
16484 /* Canonicalize operand order. */
16485 if (ix86_swap_binary_operands_p (code, mode, operands))
16487 rtx temp;
16489 /* It is invalid to swap operands of different modes. */
16490 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
16492 temp = src1;
16493 src1 = src2;
16494 src2 = temp;
16497 /* Both source operands cannot be in memory. */
16498 if (MEM_P (src1) && MEM_P (src2))
16500 /* Optimization: Only read from memory once. */
16501 if (rtx_equal_p (src1, src2))
16503 src2 = force_reg (mode, src2);
16504 src1 = src2;
16506 else
16507 src2 = force_reg (mode, src2);
16510 /* If the destination is memory, and we do not have matching source
16511 operands, do things in registers. */
16512 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16513 dst = gen_reg_rtx (mode);
16515 /* Source 1 cannot be a constant. */
16516 if (CONSTANT_P (src1))
16517 src1 = force_reg (mode, src1);
16519 /* Source 1 cannot be a non-matching memory. */
16520 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16521 src1 = force_reg (mode, src1);
16523 /* Improve address combine. */
16524 if (code == PLUS
16525 && GET_MODE_CLASS (mode) == MODE_INT
16526 && MEM_P (src2))
16527 src2 = force_reg (mode, src2);
16529 operands[1] = src1;
16530 operands[2] = src2;
16531 return dst;
16534 /* Similarly, but assume that the destination has already been
16535 set up properly. */
16537 void
16538 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
16539 enum machine_mode mode, rtx operands[])
16541 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
16542 gcc_assert (dst == operands[0]);
16545 /* Attempt to expand a binary operator.  Make the expansion closer to the
16546    actual machine than just general_operand, which would allow 3 separate
16547    memory references (one output, two inputs) in a single insn.  */
16549 void
16550 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
16551 rtx operands[])
16553 rtx src1, src2, dst, op, clob;
16555 dst = ix86_fixup_binary_operands (code, mode, operands);
16556 src1 = operands[1];
16557 src2 = operands[2];
16559 /* Emit the instruction. */
16561 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
16562 if (reload_in_progress)
16564 /* Reload doesn't know about the flags register, and doesn't know that
16565 it doesn't want to clobber it. We can only do this with PLUS. */
16566 gcc_assert (code == PLUS);
16567 emit_insn (op);
16569 else if (reload_completed
16570 && code == PLUS
16571 && !rtx_equal_p (dst, src1))
16573 /* This is going to be an LEA; avoid splitting it later. */
16574 emit_insn (op);
16576 else
16578 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16579 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16582 /* Fix up the destination if needed. */
16583 if (dst != operands[0])
16584 emit_move_insn (operands[0], dst);
16587 /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
16588 the given OPERANDS. */
16590 void
16591 ix86_expand_vector_logical_operator (enum rtx_code code, enum machine_mode mode,
16592 rtx operands[])
16594 rtx op1 = NULL_RTX, op2 = NULL_RTX;
16595 if (GET_CODE (operands[1]) == SUBREG)
16597 op1 = operands[1];
16598 op2 = operands[2];
16600 else if (GET_CODE (operands[2]) == SUBREG)
16602 op1 = operands[2];
16603 op2 = operands[1];
16605   /* Optimize (__m128i) d | (__m128i) e and similar code
16606      when d and e are float vectors into a float vector logical
16607      insn.  In C/C++ without using intrinsics there is no other way
16608      to express a vector logical operation on float vectors than
16609      to cast them temporarily to integer vectors.  */
16610 if (op1
16611 && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
16612 && ((GET_CODE (op2) == SUBREG || GET_CODE (op2) == CONST_VECTOR))
16613 && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
16614 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
16615 && SUBREG_BYTE (op1) == 0
16616 && (GET_CODE (op2) == CONST_VECTOR
16617 || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
16618 && SUBREG_BYTE (op2) == 0))
16619 && can_create_pseudo_p ())
16621 rtx dst;
16622 switch (GET_MODE (SUBREG_REG (op1)))
16624 case V4SFmode:
16625 case V8SFmode:
16626 case V2DFmode:
16627 case V4DFmode:
16628 dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
16629 if (GET_CODE (op2) == CONST_VECTOR)
16631 op2 = gen_lowpart (GET_MODE (dst), op2);
16632 op2 = force_reg (GET_MODE (dst), op2);
16634 else
16636 op1 = operands[1];
16637 op2 = SUBREG_REG (operands[2]);
16638 if (!nonimmediate_operand (op2, GET_MODE (dst)))
16639 op2 = force_reg (GET_MODE (dst), op2);
16641 op1 = SUBREG_REG (op1);
16642 if (!nonimmediate_operand (op1, GET_MODE (dst)))
16643 op1 = force_reg (GET_MODE (dst), op1);
16644 emit_insn (gen_rtx_SET (VOIDmode, dst,
16645 gen_rtx_fmt_ee (code, GET_MODE (dst),
16646 op1, op2)));
16647 emit_move_insn (operands[0], gen_lowpart (mode, dst));
16648 return;
16649 default:
16650 break;
16653 if (!nonimmediate_operand (operands[1], mode))
16654 operands[1] = force_reg (mode, operands[1]);
16655 if (!nonimmediate_operand (operands[2], mode))
16656 operands[2] = force_reg (mode, operands[2]);
16657 ix86_fixup_binary_operands_no_copy (code, mode, operands);
16658 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
16659 gen_rtx_fmt_ee (code, mode, operands[1],
16660 operands[2])));
16663 /* Return TRUE or FALSE depending on whether the binary operator meets the
16664 appropriate constraints. */
16666 bool
16667 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
16668 rtx operands[3])
16670 rtx dst = operands[0];
16671 rtx src1 = operands[1];
16672 rtx src2 = operands[2];
16674 /* Both source operands cannot be in memory. */
16675 if (MEM_P (src1) && MEM_P (src2))
16676 return false;
16678 /* Canonicalize operand order for commutative operators. */
16679 if (ix86_swap_binary_operands_p (code, mode, operands))
16681 rtx temp = src1;
16682 src1 = src2;
16683 src2 = temp;
16686 /* If the destination is memory, we must have a matching source operand. */
16687 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
16688 return false;
16690 /* Source 1 cannot be a constant. */
16691 if (CONSTANT_P (src1))
16692 return false;
16694 /* Source 1 cannot be a non-matching memory. */
16695 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
16696 /* Support "andhi/andsi/anddi" as a zero-extending move. */
16697 return (code == AND
16698 && (mode == HImode
16699 || mode == SImode
16700 || (TARGET_64BIT && mode == DImode))
16701 && satisfies_constraint_L (src2));
16703 return true;
16706 /* Attempt to expand a unary operator.  Make the expansion closer to the
16707    actual machine than just general_operand, which would allow 2 separate
16708    memory references (one output, one input) in a single insn.  */
16710 void
16711 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
16712 rtx operands[])
16714 int matching_memory;
16715 rtx src, dst, op, clob;
16717 dst = operands[0];
16718 src = operands[1];
16720 /* If the destination is memory, and we do not have matching source
16721 operands, do things in registers. */
16722 matching_memory = 0;
16723 if (MEM_P (dst))
16725 if (rtx_equal_p (dst, src))
16726 matching_memory = 1;
16727 else
16728 dst = gen_reg_rtx (mode);
16731   /* When the source operand is memory, the destination must match.  */
16732 if (MEM_P (src) && !matching_memory)
16733 src = force_reg (mode, src);
16735 /* Emit the instruction. */
16737 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
16738 if (reload_in_progress || code == NOT)
16740 /* Reload doesn't know about the flags register, and doesn't know that
16741 it doesn't want to clobber it. */
16742 gcc_assert (code == NOT);
16743 emit_insn (op);
16745 else
16747 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16748 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
16751 /* Fix up the destination if needed. */
16752 if (dst != operands[0])
16753 emit_move_insn (operands[0], dst);
16756 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
16757 divisor are within the range [0-255]. */
16759 void
16760 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
16761 bool signed_p)
16763 rtx end_label, qimode_label;
16764 rtx insn, div, mod;
16765 rtx scratch, tmp0, tmp1, tmp2;
16766 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
16767 rtx (*gen_zero_extend) (rtx, rtx);
16768 rtx (*gen_test_ccno_1) (rtx, rtx);
16770 switch (mode)
16772 case SImode:
16773 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
16774 gen_test_ccno_1 = gen_testsi_ccno_1;
16775 gen_zero_extend = gen_zero_extendqisi2;
16776 break;
16777 case DImode:
16778 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
16779 gen_test_ccno_1 = gen_testdi_ccno_1;
16780 gen_zero_extend = gen_zero_extendqidi2;
16781 break;
16782 default:
16783 gcc_unreachable ();
16786 end_label = gen_label_rtx ();
16787 qimode_label = gen_label_rtx ();
16789 scratch = gen_reg_rtx (mode);
16791   /* Use 8bit unsigned divmod if the dividend and divisor are within
16792      the range [0-255].  */
16793 emit_move_insn (scratch, operands[2]);
16794 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
16795 scratch, 1, OPTAB_DIRECT);
16796 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
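  /* scratch now holds dividend | divisor; testing it against -0x100
     (i.e. ~0xff) sets ZF iff both values fit in 8 bits, so the cheap
     8bit divide below can be used.  */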
16797 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
16798 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
16799 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
16800 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
16801 pc_rtx);
16802 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
16803 predict_jump (REG_BR_PROB_BASE * 50 / 100);
16804 JUMP_LABEL (insn) = qimode_label;
16806   /* Generate the original signed/unsigned divmod.  */
16807 div = gen_divmod4_1 (operands[0], operands[1],
16808 operands[2], operands[3]);
16809 emit_insn (div);
16811 /* Branch to the end. */
16812 emit_jump_insn (gen_jump (end_label));
16813 emit_barrier ();
16815 /* Generate 8bit unsigned divide. */
16816 emit_label (qimode_label);
16817 /* Don't use operands[0] for result of 8bit divide since not all
16818 registers support QImode ZERO_EXTRACT. */
16819 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
16820 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
16821 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
16822 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
16824 if (signed_p)
16826 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
16827 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
16829 else
16831 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
16832 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
16835 /* Extract remainder from AH. */
16836 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
16837 if (REG_P (operands[1]))
16838 insn = emit_move_insn (operands[1], tmp1);
16839 else
16841       /* Need a new scratch register since the old one holds the result
16842 	 of the 8bit divide.  */
16843 scratch = gen_reg_rtx (mode);
16844 emit_move_insn (scratch, tmp1);
16845 insn = emit_move_insn (operands[1], scratch);
16847 set_unique_reg_note (insn, REG_EQUAL, mod);
16849 /* Zero extend quotient from AL. */
16850 tmp1 = gen_lowpart (QImode, tmp0);
16851 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
16852 set_unique_reg_note (insn, REG_EQUAL, div);
16854 emit_label (end_label);
16857 #define LEA_MAX_STALL (3)
16858 #define LEA_SEARCH_THRESHOLD (LEA_MAX_STALL << 1)
16860 /* Increase given DISTANCE in half-cycles according to
16861 dependencies between PREV and NEXT instructions.
16862 Add 1 half-cycle if there is no dependency and
16863 go to the next cycle if there is some dependency. */
16865 static unsigned int
16866 increase_distance (rtx prev, rtx next, unsigned int distance)
16868 df_ref *use_rec;
16869 df_ref *def_rec;
16871 if (!prev || !next)
16872 return distance + (distance & 1) + 2;
16874 if (!DF_INSN_USES (next) || !DF_INSN_DEFS (prev))
16875 return distance + 1;
16877 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
16878 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
16879 if (!DF_REF_IS_ARTIFICIAL (*def_rec)
16880 && DF_REF_REGNO (*use_rec) == DF_REF_REGNO (*def_rec))
16881 return distance + (distance & 1) + 2;
16883 return distance + 1;
16886 /* Return true if instruction INSN defines register number
16887 REGNO1 or REGNO2. */
16889 static bool
16890 insn_defines_reg (unsigned int regno1, unsigned int regno2,
16891 rtx insn)
16893 df_ref *def_rec;
16895 for (def_rec = DF_INSN_DEFS (insn); *def_rec; def_rec++)
16896 if (DF_REF_REG_DEF_P (*def_rec)
16897 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16898 && (regno1 == DF_REF_REGNO (*def_rec)
16899 || regno2 == DF_REF_REGNO (*def_rec)))
16901 return true;
16904 return false;
16907 /* Return true if instruction INSN uses register number
16908 REGNO as part of an address expression. */
16910 static bool
16911 insn_uses_reg_mem (unsigned int regno, rtx insn)
16913 df_ref *use_rec;
16915 for (use_rec = DF_INSN_USES (insn); *use_rec; use_rec++)
16916 if (DF_REF_REG_MEM_P (*use_rec) && regno == DF_REF_REGNO (*use_rec))
16917 return true;
16919 return false;
16922 /* Search backward for non-agu definition of register number REGNO1
16923 or register number REGNO2 in basic block starting from instruction
16924 START up to head of basic block or instruction INSN.
16926 Put true into *FOUND if a definition was found
16927 and false otherwise.
16929 Distance in half-cycles between START and found instruction or head
16930 of BB is added to DISTANCE and returned. */
16932 static int
16933 distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
16934 rtx insn, int distance,
16935 rtx start, bool *found)
16937 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
16938 rtx prev = start;
16939 rtx next = NULL;
16941 *found = false;
16943 while (prev
16944 && prev != insn
16945 && distance < LEA_SEARCH_THRESHOLD)
16947 if (NONDEBUG_INSN_P (prev) && NONJUMP_INSN_P (prev))
16949 distance = increase_distance (prev, next, distance);
16950 if (insn_defines_reg (regno1, regno2, prev))
16952 if (recog_memoized (prev) < 0
16953 || get_attr_type (prev) != TYPE_LEA)
16955 *found = true;
16956 return distance;
16960 next = prev;
16962 if (prev == BB_HEAD (bb))
16963 break;
16965 prev = PREV_INSN (prev);
16968 return distance;
16971 /* Search backward for non-agu definition of register number REGNO1
16972 or register number REGNO2 in INSN's basic block until
16973 1. Pass LEA_SEARCH_THRESHOLD instructions, or
16974 2. Reach the neighbouring BB's boundary, or
16975 3. Reach agu definition.
16976 Returns the distance between the non-agu definition point and INSN.
16977 If no definition point, returns -1. */
16979 static int
16980 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
16981 rtx insn)
16983 basic_block bb = BLOCK_FOR_INSN (insn);
16984 int distance = 0;
16985 bool found = false;
16987 if (insn != BB_HEAD (bb))
16988 distance = distance_non_agu_define_in_bb (regno1, regno2, insn,
16989 distance, PREV_INSN (insn),
16990 &found);
16992 if (!found && distance < LEA_SEARCH_THRESHOLD)
16994 edge e;
16995 edge_iterator ei;
16996 bool simple_loop = false;
16998 FOR_EACH_EDGE (e, ei, bb->preds)
16999 if (e->src == bb)
17001 simple_loop = true;
17002 break;
17005 if (simple_loop)
17006 distance = distance_non_agu_define_in_bb (regno1, regno2,
17007 insn, distance,
17008 BB_END (bb), &found);
17009 else
17011 int shortest_dist = -1;
17012 bool found_in_bb = false;
17014 FOR_EACH_EDGE (e, ei, bb->preds)
17016 int bb_dist
17017 = distance_non_agu_define_in_bb (regno1, regno2,
17018 insn, distance,
17019 BB_END (e->src),
17020 &found_in_bb);
17021 if (found_in_bb)
17023 if (shortest_dist < 0)
17024 shortest_dist = bb_dist;
17025 else if (bb_dist > 0)
17026 shortest_dist = MIN (bb_dist, shortest_dist);
17028 found = true;
17032 distance = shortest_dist;
17036 /* get_attr_type may modify recog data. We want to make sure
17037 that recog data is valid for instruction INSN, on which
17038 distance_non_agu_define is called. INSN is unchanged here. */
17039 extract_insn_cached (insn);
17041 if (!found)
17042 return -1;
17044 return distance >> 1;
17047 /* Return the distance in half-cycles between INSN and the next
17048 insn that uses register number REGNO in a memory address, added
17049 to DISTANCE. Return -1 if REGNO is set.
17051 Put true value into *FOUND if register usage was found and
17052 false otherwise.
17053 Put true value into *REDEFINED if register redefinition was
17054 found and false otherwise. */
17056 static int
17057 distance_agu_use_in_bb (unsigned int regno,
17058 rtx insn, int distance, rtx start,
17059 bool *found, bool *redefined)
17061 basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
17062 rtx next = start;
17063 rtx prev = NULL;
17065 *found = false;
17066 *redefined = false;
17068 while (next
17069 && next != insn
17070 && distance < LEA_SEARCH_THRESHOLD)
17072 if (NONDEBUG_INSN_P (next) && NONJUMP_INSN_P (next))
17074 distance = increase_distance (prev, next, distance);
17075 if (insn_uses_reg_mem (regno, next))
17077 /* Return DISTANCE if OP0 is used in memory
17078 address in NEXT. */
17079 *found = true;
17080 return distance;
17083 if (insn_defines_reg (regno, INVALID_REGNUM, next))
17085 /* Return -1 if OP0 is set in NEXT. */
17086 *redefined = true;
17087 return -1;
17090 prev = next;
17093 if (next == BB_END (bb))
17094 break;
17096 next = NEXT_INSN (next);
17099 return distance;
17102 /* Return the distance between INSN and the next insn that uses
17103 register number REGNO0 in memory address. Return -1 if no such
17104 use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set. */
17106 static int
17107 distance_agu_use (unsigned int regno0, rtx insn)
17109 basic_block bb = BLOCK_FOR_INSN (insn);
17110 int distance = 0;
17111 bool found = false;
17112 bool redefined = false;
17114 if (insn != BB_END (bb))
17115 distance = distance_agu_use_in_bb (regno0, insn, distance,
17116 NEXT_INSN (insn),
17117 &found, &redefined);
17119 if (!found && !redefined && distance < LEA_SEARCH_THRESHOLD)
17121 edge e;
17122 edge_iterator ei;
17123 bool simple_loop = false;
17125 FOR_EACH_EDGE (e, ei, bb->succs)
17126 if (e->dest == bb)
17128 simple_loop = true;
17129 break;
17132 if (simple_loop)
17133 distance = distance_agu_use_in_bb (regno0, insn,
17134 distance, BB_HEAD (bb),
17135 &found, &redefined);
17136 else
17138 int shortest_dist = -1;
17139 bool found_in_bb = false;
17140 bool redefined_in_bb = false;
17142 FOR_EACH_EDGE (e, ei, bb->succs)
17144 int bb_dist
17145 = distance_agu_use_in_bb (regno0, insn,
17146 distance, BB_HEAD (e->dest),
17147 &found_in_bb, &redefined_in_bb);
17148 if (found_in_bb)
17150 if (shortest_dist < 0)
17151 shortest_dist = bb_dist;
17152 else if (bb_dist > 0)
17153 shortest_dist = MIN (bb_dist, shortest_dist);
17155 found = true;
17159 distance = shortest_dist;
17163 if (!found || redefined)
17164 return -1;
17166 return distance >> 1;
17169 /* Define this macro to tune LEA priority vs ADD; it takes effect when
17170 there is a dilemma of choosing LEA or ADD.
17171 Negative value: ADD is preferred over LEA
17172 Zero: Neutral
17173 Positive value: LEA is preferred over ADD. */
17174 #define IX86_LEA_PRIORITY 0
17176 /* Return true if usage of the lea INSN has a performance advantage
17177 over a sequence of instructions. The instruction sequence has
17178 SPLIT_COST cycles higher latency than the lea latency. */
17180 static bool
17181 ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1,
17182 unsigned int regno2, int split_cost)
17184 int dist_define, dist_use;
17186 dist_define = distance_non_agu_define (regno1, regno2, insn);
17187 dist_use = distance_agu_use (regno0, insn);
17189 if (dist_define < 0 || dist_define >= LEA_MAX_STALL)
17191 /* If there is no non-AGU operand definition, no AGU
17192 operand usage and the split cost is 0, then both the lea
17193 and non-lea variants have the same priority. Currently
17194 we prefer lea for 64-bit code and non-lea for 32-bit
17195 code. */
17196 if (dist_use < 0 && split_cost == 0)
17197 return TARGET_64BIT || IX86_LEA_PRIORITY;
17198 else
17199 return true;
17202 /* With a longer definition distance, lea is preferable.
17203 Here we adjust it to take into account the splitting cost and
17204 lea priority. */
17205 dist_define += split_cost + IX86_LEA_PRIORITY;
17207 /* If there is no use in a memory address then we just check
17208 that split cost exceeds AGU stall. */
17209 if (dist_use < 0)
17210 return dist_define > LEA_MAX_STALL;
17212 /* If this insn has both backward non-agu dependence and forward
17213 agu dependence, the one with the shorter distance takes effect. */
17214 return dist_define >= dist_use;
17217 /* Return true if it is legal to clobber flags by INSN and
17218 false otherwise. */
17220 static bool
17221 ix86_ok_to_clobber_flags (rtx insn)
17223 basic_block bb = BLOCK_FOR_INSN (insn);
17224 df_ref *use;
17225 bitmap live;
17227 while (insn)
17229 if (NONDEBUG_INSN_P (insn))
17231 for (use = DF_INSN_USES (insn); *use; use++)
17232 if (DF_REF_REG_USE_P (*use) && DF_REF_REGNO (*use) == FLAGS_REG)
17233 return false;
17235 if (insn_defines_reg (FLAGS_REG, INVALID_REGNUM, insn))
17236 return true;
17239 if (insn == BB_END (bb))
17240 break;
17242 insn = NEXT_INSN (insn);
17245 live = df_get_live_out (bb);
17246 return !REGNO_REG_SET_P (live, FLAGS_REG);
17249 /* Return true if we need to split op0 = op1 + op2 into a sequence of
17250 move and add to avoid AGU stalls. */
17252 bool
17253 ix86_avoid_lea_for_add (rtx insn, rtx operands[])
17255 unsigned int regno0, regno1, regno2;
17257 /* Check if we need to optimize. */
17258 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17259 return false;
17261 /* Check it is correct to split here. */
17262 if (!ix86_ok_to_clobber_flags (insn))
17263 return false;
17265 regno0 = true_regnum (operands[0]);
17266 regno1 = true_regnum (operands[1]);
17267 regno2 = true_regnum (operands[2]);
17269 /* We only need to split adds with a non-destructive
17270 destination operand. */
17271 if (regno0 == regno1 || regno0 == regno2)
17272 return false;
17273 else
17274 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
17277 /* Return true if we should emit lea instruction instead of mov
17278 instruction. */
17280 bool
17281 ix86_use_lea_for_mov (rtx insn, rtx operands[])
17283 unsigned int regno0, regno1;
17285 /* Check if we need to optimize. */
17286 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17287 return false;
17289 /* Use lea for reg to reg moves only. */
17290 if (!REG_P (operands[0]) || !REG_P (operands[1]))
17291 return false;
17293 regno0 = true_regnum (operands[0]);
17294 regno1 = true_regnum (operands[1]);
17296 return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0);
17299 /* Return true if we need to split lea into a sequence of
17300 instructions to avoid AGU stalls. */
17302 bool
17303 ix86_avoid_lea_for_addr (rtx insn, rtx operands[])
17305 unsigned int regno0, regno1, regno2;
17306 int split_cost;
17307 struct ix86_address parts;
17308 int ok;
17310 /* Check we need to optimize. */
17311 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17312 return false;
17314 /* The "at least two components" test below might not catch simple
17315 move or zero extension insns if parts.base is non-NULL and parts.disp
17316 is const0_rtx as the only components in the address, e.g. if the
17317 register is %rbp or %r13. As this test is much cheaper and moves or
17318 zero extensions are the common case, do this check first. */
17319 if (REG_P (operands[1])
17320 || (SImode_address_operand (operands[1], VOIDmode)
17321 && REG_P (XEXP (operands[1], 0))))
17322 return false;
17324 /* Check if it is OK to split here. */
17325 if (!ix86_ok_to_clobber_flags (insn))
17326 return false;
17328 ok = ix86_decompose_address (operands[1], &parts);
17329 gcc_assert (ok);
17331 /* There should be at least two components in the address. */
17332 if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX)
17333 + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2)
17334 return false;
17336 /* We should not split into add if a non-legitimate PIC
17337 operand is used as the displacement. */
17338 if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp))
17339 return false;
17341 regno0 = true_regnum (operands[0]);
17342 regno1 = INVALID_REGNUM;
17343 regno2 = INVALID_REGNUM;
17345 if (parts.base)
17346 regno1 = true_regnum (parts.base);
17347 if (parts.index)
17348 regno2 = true_regnum (parts.index);
17350 split_cost = 0;
17352 /* Compute how many cycles we will add to execution time
17353 if we split the lea into a sequence of instructions. */
17354 if (parts.base || parts.index)
17356 /* Have to use a mov instruction if the non-destructive
17357 destination form is used. */
17358 if (regno1 != regno0 && regno2 != regno0)
17359 split_cost += 1;
17361 /* Have to add index to base if both exist. */
17362 if (parts.base && parts.index)
17363 split_cost += 1;
17365 /* Have to use shift and adds if scale is 2 or greater. */
17366 if (parts.scale > 1)
17368 if (regno0 != regno1)
17369 split_cost += 1;
17370 else if (regno2 == regno0)
17371 split_cost += 4;
17372 else
17373 split_cost += parts.scale;
17376 /* Have to use add instruction with immediate if
17377 disp is non zero. */
17378 if (parts.disp && parts.disp != const0_rtx)
17379 split_cost += 1;
17381 /* Subtract the price of lea. */
17382 split_cost -= 1;
17385 return !ix86_lea_outperforms (insn, regno0, regno1, regno2, split_cost);
17388 /* Emit x86 binary operand CODE in mode MODE, where the first operand
17389 matches destination. RTX includes clobber of FLAGS_REG. */
17391 static void
17392 ix86_emit_binop (enum rtx_code code, enum machine_mode mode,
17393 rtx dst, rtx src)
17395 rtx op, clob;
17397 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, dst, src));
17398 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
17400 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
17403 /* Return true if the definition of regno1 is nearest to the insn. */
17405 static bool
17406 find_nearest_reg_def (rtx insn, int regno1, int regno2)
17408 rtx prev = insn;
17409 rtx start = BB_HEAD (BLOCK_FOR_INSN (insn));
17411 if (insn == start)
17412 return false;
17413 while (prev && prev != start)
17415 if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
17417 prev = PREV_INSN (prev);
17418 continue;
17420 if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
17421 return true;
17422 else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
17423 return false;
17424 prev = PREV_INSN (prev);
17427 /* None of the regs is defined in the bb. */
17428 return false;
17431 /* Split lea instructions into a sequence of instructions
17432 which are executed on the ALU to avoid AGU stalls.
17433 It is assumed that it is allowed to clobber the flags register
17434 at the lea position. */
17436 void
17437 ix86_split_lea_for_addr (rtx insn, rtx operands[], enum machine_mode mode)
17439 unsigned int regno0, regno1, regno2;
17440 struct ix86_address parts;
17441 rtx target, tmp;
17442 int ok, adds;
17444 ok = ix86_decompose_address (operands[1], &parts);
17445 gcc_assert (ok);
17447 target = gen_lowpart (mode, operands[0]);
17449 regno0 = true_regnum (target);
17450 regno1 = INVALID_REGNUM;
17451 regno2 = INVALID_REGNUM;
17453 if (parts.base)
17455 parts.base = gen_lowpart (mode, parts.base);
17456 regno1 = true_regnum (parts.base);
17459 if (parts.index)
17461 parts.index = gen_lowpart (mode, parts.index);
17462 regno2 = true_regnum (parts.index);
17465 if (parts.disp)
17466 parts.disp = gen_lowpart (mode, parts.disp);
17468 if (parts.scale > 1)
17470 /* Case r1 = r1 + ... */
17471 if (regno1 == regno0)
17473 /* If we have the case r1 = r1 + C * r1 then we
17474 should use multiplication, which is very
17475 expensive. Assume the cost model is wrong if we
17476 see such a case here. */
17477 gcc_assert (regno2 != regno0);
17479 for (adds = parts.scale; adds > 0; adds--)
17480 ix86_emit_binop (PLUS, mode, target, parts.index);
17482 else
17484 /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
17485 if (regno0 != regno2)
17486 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17488 /* Use shift for scaling. */
17489 ix86_emit_binop (ASHIFT, mode, target,
17490 GEN_INT (exact_log2 (parts.scale)));
17492 if (parts.base)
17493 ix86_emit_binop (PLUS, mode, target, parts.base);
17495 if (parts.disp && parts.disp != const0_rtx)
17496 ix86_emit_binop (PLUS, mode, target, parts.disp);
17499 else if (!parts.base && !parts.index)
17501 gcc_assert(parts.disp);
17502 emit_insn (gen_rtx_SET (VOIDmode, target, parts.disp));
17504 else
17506 if (!parts.base)
17508 if (regno0 != regno2)
17509 emit_insn (gen_rtx_SET (VOIDmode, target, parts.index));
17511 else if (!parts.index)
17513 if (regno0 != regno1)
17514 emit_insn (gen_rtx_SET (VOIDmode, target, parts.base));
17516 else
17518 if (regno0 == regno1)
17519 tmp = parts.index;
17520 else if (regno0 == regno2)
17521 tmp = parts.base;
17522 else
17524 rtx tmp1;
17526 /* Find better operand for SET instruction, depending
17527 on which definition is farther from the insn. */
17528 if (find_nearest_reg_def (insn, regno1, regno2))
17529 tmp = parts.index, tmp1 = parts.base;
17530 else
17531 tmp = parts.base, tmp1 = parts.index;
17533 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
17535 if (parts.disp && parts.disp != const0_rtx)
17536 ix86_emit_binop (PLUS, mode, target, parts.disp);
17538 ix86_emit_binop (PLUS, mode, target, tmp1);
17539 return;
17542 ix86_emit_binop (PLUS, mode, target, tmp);
17545 if (parts.disp && parts.disp != const0_rtx)
17546 ix86_emit_binop (PLUS, mode, target, parts.disp);
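/* An illustrative sketch, not part of the original source: the ALU
   sequence the splitter above produces for dst = base + index * scale
   + disp when dst, base and index are three distinct registers and
   scale is a power of two.  The helper name is hypothetical.  */
static long
example_split_lea (long base, long index, int log2_scale, long disp)
{
  long dst = index;		/* mov: copy the index into dst.      */
  dst <<= log2_scale;		/* shl: scale by a power of two.      */
  dst += base;			/* add: fold in the base register.    */
  dst += disp;			/* add: fold in the displacement.     */
  return dst;
}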
17550 /* Return true if it is ok to optimize an ADD operation into an LEA
17551 operation to avoid flag register consumption. For most processors,
17552 ADD is faster than LEA. For processors like ATOM, if the
17553 destination register of the LEA holds an actual address which will be
17554 used soon, LEA is better; otherwise ADD is better. */
17556 bool
17557 ix86_lea_for_add_ok (rtx insn, rtx operands[])
17559 unsigned int regno0 = true_regnum (operands[0]);
17560 unsigned int regno1 = true_regnum (operands[1]);
17561 unsigned int regno2 = true_regnum (operands[2]);
17563 /* If a = b + c, (a!=b && a!=c), must use lea form. */
17564 if (regno0 != regno1 && regno0 != regno2)
17565 return true;
17567 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
17568 return false;
17570 return ix86_lea_outperforms (insn, regno0, regno1, regno2, 0);
17573 /* Return true if destination reg of SET_BODY is shift count of
17574 USE_BODY. */
17576 static bool
17577 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
17579 rtx set_dest;
17580 rtx shift_rtx;
17581 int i;
17583 /* Retrieve destination of SET_BODY. */
17584 switch (GET_CODE (set_body))
17586 case SET:
17587 set_dest = SET_DEST (set_body);
17588 if (!set_dest || !REG_P (set_dest))
17589 return false;
17590 break;
17591 case PARALLEL:
17592 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
17593 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
17594 use_body))
17595 return true;
17596 default:
17597 return false;
17598 break;
17601 /* Retrieve shift count of USE_BODY. */
17602 switch (GET_CODE (use_body))
17604 case SET:
17605 shift_rtx = XEXP (use_body, 1);
17606 break;
17607 case PARALLEL:
17608 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
17609 if (ix86_dep_by_shift_count_body (set_body,
17610 XVECEXP (use_body, 0, i)))
17611 return true;
17612 default:
17613 return false;
17614 break;
17617 if (shift_rtx
17618 && (GET_CODE (shift_rtx) == ASHIFT
17619 || GET_CODE (shift_rtx) == LSHIFTRT
17620 || GET_CODE (shift_rtx) == ASHIFTRT
17621 || GET_CODE (shift_rtx) == ROTATE
17622 || GET_CODE (shift_rtx) == ROTATERT))
17624 rtx shift_count = XEXP (shift_rtx, 1);
17626 /* Return true if shift count is dest of SET_BODY. */
17627 if (REG_P (shift_count))
17629 /* Add this check since it can be invoked before register
17630 allocation during pre-reload scheduling. */
17631 if (reload_completed
17632 && true_regnum (set_dest) == true_regnum (shift_count))
17633 return true;
17634 else if (REGNO(set_dest) == REGNO(shift_count))
17635 return true;
17639 return false;
17642 /* Return true if destination reg of SET_INSN is shift count of
17643 USE_INSN. */
17645 bool
17646 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
17648 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
17649 PATTERN (use_insn));
17652 /* Return TRUE or FALSE depending on whether the unary operator meets the
17653 appropriate constraints. */
17655 bool
17656 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
17657 enum machine_mode mode ATTRIBUTE_UNUSED,
17658 rtx operands[2] ATTRIBUTE_UNUSED)
17660 /* If one of operands is memory, source and destination must match. */
17661 if ((MEM_P (operands[0])
17662 || MEM_P (operands[1]))
17663 && ! rtx_equal_p (operands[0], operands[1]))
17664 return false;
17665 return true;
17668 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
17669 are ok, keeping in mind the possible movddup alternative. */
17671 bool
17672 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
17674 if (MEM_P (operands[0]))
17675 return rtx_equal_p (operands[0], operands[1 + high]);
17676 if (MEM_P (operands[1]) && MEM_P (operands[2]))
17677 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
17678 return true;
17681 /* Post-reload splitter for converting an SF or DFmode value in an
17682 SSE register into an unsigned SImode. */
17684 void
17685 ix86_split_convert_uns_si_sse (rtx operands[])
17687 enum machine_mode vecmode;
17688 rtx value, large, zero_or_two31, input, two31, x;
17690 large = operands[1];
17691 zero_or_two31 = operands[2];
17692 input = operands[3];
17693 two31 = operands[4];
17694 vecmode = GET_MODE (large);
17695 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
17697 /* Load up the value into the low element. We must ensure that the other
17698 elements are valid floats -- zero is the easiest such value. */
17699 if (MEM_P (input))
17701 if (vecmode == V4SFmode)
17702 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
17703 else
17704 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
17706 else
17708 input = gen_rtx_REG (vecmode, REGNO (input));
17709 emit_move_insn (value, CONST0_RTX (vecmode));
17710 if (vecmode == V4SFmode)
17711 emit_insn (gen_sse_movss (value, value, input));
17712 else
17713 emit_insn (gen_sse2_movsd (value, value, input));
17716 emit_move_insn (large, two31);
17717 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
17719 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
17720 emit_insn (gen_rtx_SET (VOIDmode, large, x));
17722 x = gen_rtx_AND (vecmode, zero_or_two31, large);
17723 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
17725 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
17726 emit_insn (gen_rtx_SET (VOIDmode, value, x));
17728 large = gen_rtx_REG (V4SImode, REGNO (large));
17729 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
17731 x = gen_rtx_REG (V4SImode, REGNO (value));
17732 if (vecmode == V4SFmode)
17733 emit_insn (gen_fix_truncv4sfv4si2 (x, value));
17734 else
17735 emit_insn (gen_sse2_cvttpd2dq (x, value));
17736 value = x;
17738 emit_insn (gen_xorv4si3 (value, value, large));
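/* An illustrative scalar sketch, not part of the original source, of
   what the split above computes: values below 2^31 go through the
   ordinary signed conversion, larger values are reduced by 2^31 first
   and the sign bit is patched back in with an XOR.  The helper name is
   hypothetical.  */
static unsigned int
example_double_to_uns32 (double value)
{
  if (value >= 0x1.0p31)
    return (unsigned int) (int) (value - 0x1.0p31) ^ 0x80000000U;
  return (unsigned int) (int) value;
}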
17741 /* Convert an unsigned DImode value into a DFmode, using only SSE.
17742 Expects the 64-bit DImode to be supplied in a pair of integral
17743 registers. Requires SSE2; will use SSE3 if available. For x86_32,
17744 -mfpmath=sse, !optimize_size only. */
17746 void
17747 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
17749 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
17750 rtx int_xmm, fp_xmm;
17751 rtx biases, exponents;
17752 rtx x;
17754 int_xmm = gen_reg_rtx (V4SImode);
17755 if (TARGET_INTER_UNIT_MOVES)
17756 emit_insn (gen_movdi_to_sse (int_xmm, input));
17757 else if (TARGET_SSE_SPLIT_REGS)
17759 emit_clobber (int_xmm);
17760 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
17762 else
17764 x = gen_reg_rtx (V2DImode);
17765 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
17766 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
17769 x = gen_rtx_CONST_VECTOR (V4SImode,
17770 gen_rtvec (4, GEN_INT (0x43300000UL),
17771 GEN_INT (0x45300000UL),
17772 const0_rtx, const0_rtx));
17773 exponents = validize_mem (force_const_mem (V4SImode, x));
17775 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
17776 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
17778 /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_lo_xmm)
17779 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
17780 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
17781 (0x1.0p84 + double(fp_value_hi_xmm)).
17782 Note these exponents differ by 32. */
17784 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
17786 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
17787 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
17788 real_ldexp (&bias_lo_rvt, &dconst1, 52);
17789 real_ldexp (&bias_hi_rvt, &dconst1, 84);
17790 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
17791 x = const_double_from_real_value (bias_hi_rvt, DFmode);
17792 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
17793 biases = validize_mem (force_const_mem (V2DFmode, biases));
17794 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
17796 /* Add the upper and lower DFmode values together. */
17797 if (TARGET_SSE3)
17798 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
17799 else
17801 x = copy_to_mode_reg (V2DFmode, fp_xmm);
17802 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
17803 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
17806 ix86_expand_vector_extract (false, target, fp_xmm, 0);
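/* An illustrative plain-C sketch, not part of the original source, of
   the bias trick used above: each 32-bit half is dropped into the
   mantissa of a double with a fixed exponent (2^52 for the low half,
   2^84 for the high half), the biases are subtracted exactly, and the
   two pieces are summed.  The helper name and the union type-pun are
   assumptions for illustration only.  */
static double
example_uns64_to_double (unsigned long long x)
{
  union { unsigned long long u; double d; } lo, hi;

  lo.u = 0x4330000000000000ULL | (x & 0xffffffffULL); /* 2^52 + low32 */
  hi.u = 0x4530000000000000ULL | (x >> 32);	/* 2^84 + high32 * 2^32 */

  return (hi.d - 0x1.0p84) + (lo.d - 0x1.0p52);
}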
17809 /* Not used, but eases macroization of patterns. */
17810 void
17811 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
17812 rtx input ATTRIBUTE_UNUSED)
17814 gcc_unreachable ();
17817 /* Convert an unsigned SImode value into a DFmode. Only currently used
17818 for SSE, but applicable anywhere. */
17820 void
17821 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
17823 REAL_VALUE_TYPE TWO31r;
17824 rtx x, fp;
17826 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
17827 NULL, 1, OPTAB_DIRECT);
17829 fp = gen_reg_rtx (DFmode);
17830 emit_insn (gen_floatsidf2 (fp, x));
17832 real_ldexp (&TWO31r, &dconst1, 31);
17833 x = const_double_from_real_value (TWO31r, DFmode);
17835 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
17836 if (x != target)
17837 emit_move_insn (target, x);
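/* An illustrative scalar sketch, not part of the original source, of
   the sequence above: re-bias the unsigned value into the signed range,
   use the signed int-to-double conversion, then add 2^31 back.  The
   cast relies on two's complement wrap-around, as the emitted code
   does.  The helper name is hypothetical.  */
static double
example_uns32_to_double (unsigned int x)
{
  int biased = (int) (x + 0x80000000U);	/* x - 2^31 as a signed value.  */
  return (double) biased + 0x1.0p31;
}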
17840 /* Convert a signed DImode value into a DFmode. Only used for SSE in
17841 32-bit mode; otherwise we have a direct convert instruction. */
17843 void
17844 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
17846 REAL_VALUE_TYPE TWO32r;
17847 rtx fp_lo, fp_hi, x;
17849 fp_lo = gen_reg_rtx (DFmode);
17850 fp_hi = gen_reg_rtx (DFmode);
17852 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
17854 real_ldexp (&TWO32r, &dconst1, 32);
17855 x = const_double_from_real_value (TWO32r, DFmode);
17856 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
17858 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
17860 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
17861 0, OPTAB_DIRECT);
17862 if (x != target)
17863 emit_move_insn (target, x);
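/* An illustrative plain-C sketch, not part of the original source, of
   the split above: the high word converts as a signed value and is
   scaled by 2^32, the low word converts as an unsigned value, and the
   two doubles are summed.  Assumes an arithmetic right shift, as on the
   target.  The helper name is hypothetical.  */
static double
example_int64_to_double (long long x)
{
  double hi = (double) (int) (x >> 32);			/* Signed high word.   */
  double lo = (double) (unsigned int) (x & 0xffffffffULL); /* Unsigned low word.  */
  return hi * 0x1.0p32 + lo;
}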
17866 /* Convert an unsigned SImode value into a SFmode, using only SSE.
17867 For x86_32, -mfpmath=sse, !optimize_size only. */
17868 void
17869 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
17871 REAL_VALUE_TYPE ONE16r;
17872 rtx fp_hi, fp_lo, int_hi, int_lo, x;
17874 real_ldexp (&ONE16r, &dconst1, 16);
17875 x = const_double_from_real_value (ONE16r, SFmode);
17876 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
17877 NULL, 0, OPTAB_DIRECT);
17878 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
17879 NULL, 0, OPTAB_DIRECT);
17880 fp_hi = gen_reg_rtx (SFmode);
17881 fp_lo = gen_reg_rtx (SFmode);
17882 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
17883 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
17884 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
17885 0, OPTAB_DIRECT);
17886 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
17887 0, OPTAB_DIRECT);
17888 if (!rtx_equal_p (target, fp_hi))
17889 emit_move_insn (target, fp_hi);
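/* An illustrative scalar sketch, not part of the original source, of
   the sequence above: each 16-bit half converts to float exactly, and
   the halves are recombined with one multiply and one add, so only the
   final addition rounds.  The helper name is hypothetical.  */
static float
example_uns32_to_float (unsigned int x)
{
  float hi = (float) (x >> 16);
  float lo = (float) (x & 0xffff);
  return hi * 0x1.0p16f + lo;
}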
17892 /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
17893 a vector of unsigned ints VAL to vector of floats TARGET. */
17895 void
17896 ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
17898 rtx tmp[8];
17899 REAL_VALUE_TYPE TWO16r;
17900 enum machine_mode intmode = GET_MODE (val);
17901 enum machine_mode fltmode = GET_MODE (target);
17902 rtx (*cvt) (rtx, rtx);
17904 if (intmode == V4SImode)
17905 cvt = gen_floatv4siv4sf2;
17906 else
17907 cvt = gen_floatv8siv8sf2;
17908 tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
17909 tmp[0] = force_reg (intmode, tmp[0]);
17910 tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
17911 OPTAB_DIRECT);
17912 tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
17913 NULL_RTX, 1, OPTAB_DIRECT);
17914 tmp[3] = gen_reg_rtx (fltmode);
17915 emit_insn (cvt (tmp[3], tmp[1]));
17916 tmp[4] = gen_reg_rtx (fltmode);
17917 emit_insn (cvt (tmp[4], tmp[2]));
17918 real_ldexp (&TWO16r, &dconst1, 16);
17919 tmp[5] = const_double_from_real_value (TWO16r, SFmode);
17920 tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
17921 tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
17922 OPTAB_DIRECT);
17923 tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
17924 OPTAB_DIRECT);
17925 if (tmp[7] != target)
17926 emit_move_insn (target, tmp[7]);
17929 /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
17930 pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
17931 This is done by doing just signed conversion if < 0x1p31, and otherwise by
17932 subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
17935 ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
17937 REAL_VALUE_TYPE TWO31r;
17938 rtx two31r, tmp[4];
17939 enum machine_mode mode = GET_MODE (val);
17940 enum machine_mode scalarmode = GET_MODE_INNER (mode);
17941 enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
17942 rtx (*cmp) (rtx, rtx, rtx, rtx);
17943 int i;
17945 for (i = 0; i < 3; i++)
17946 tmp[i] = gen_reg_rtx (mode);
17947 real_ldexp (&TWO31r, &dconst1, 31);
17948 two31r = const_double_from_real_value (TWO31r, scalarmode);
17949 two31r = ix86_build_const_vector (mode, 1, two31r);
17950 two31r = force_reg (mode, two31r);
17951 switch (mode)
17953 case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
17954 case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
17955 case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
17956 case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
17957 default: gcc_unreachable ();
17959 tmp[3] = gen_rtx_LE (mode, two31r, val);
17960 emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
17961 tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
17962 0, OPTAB_DIRECT);
17963 if (intmode == V4SImode || TARGET_AVX2)
17964 *xorp = expand_simple_binop (intmode, ASHIFT,
17965 gen_lowpart (intmode, tmp[0]),
17966 GEN_INT (31), NULL_RTX, 0,
17967 OPTAB_DIRECT);
17968 else
17970 rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
17971 two31 = ix86_build_const_vector (intmode, 1, two31);
17972 *xorp = expand_simple_binop (intmode, AND,
17973 gen_lowpart (intmode, tmp[0]),
17974 two31, NULL_RTX, 0,
17975 OPTAB_DIRECT);
17977 return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
17978 0, OPTAB_DIRECT);
17981 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
17982 then replicate the value for all elements of the vector
17983 register. */
17986 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
17988 int i, n_elt;
17989 rtvec v;
17990 enum machine_mode scalar_mode;
17992 switch (mode)
17994 case V32QImode:
17995 case V16QImode:
17996 case V16HImode:
17997 case V8HImode:
17998 case V8SImode:
17999 case V4SImode:
18000 case V4DImode:
18001 case V2DImode:
18002 gcc_assert (vect);
18003 case V8SFmode:
18004 case V4SFmode:
18005 case V4DFmode:
18006 case V2DFmode:
18007 n_elt = GET_MODE_NUNITS (mode);
18008 v = rtvec_alloc (n_elt);
18009 scalar_mode = GET_MODE_INNER (mode);
18011 RTVEC_ELT (v, 0) = value;
18013 for (i = 1; i < n_elt; ++i)
18014 RTVEC_ELT (v, i) = vect ? value : CONST0_RTX (scalar_mode);
18016 return gen_rtx_CONST_VECTOR (mode, v);
18018 default:
18019 gcc_unreachable ();
18023 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
18024 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
18025 for an SSE register. If VECT is true, then replicate the mask for
18026 all elements of the vector register. If INVERT is true, then create
18027 a mask excluding the sign bit. */
18030 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
18032 enum machine_mode vec_mode, imode;
18033 HOST_WIDE_INT hi, lo;
18034 int shift = 63;
18035 rtx v;
18036 rtx mask;
18038 /* Find the sign bit, sign extended to 2*HWI. */
18039 switch (mode)
18041 case V8SImode:
18042 case V4SImode:
18043 case V8SFmode:
18044 case V4SFmode:
18045 vec_mode = mode;
18046 mode = GET_MODE_INNER (mode);
18047 imode = SImode;
18048 lo = 0x80000000, hi = lo < 0;
18049 break;
18051 case V4DImode:
18052 case V2DImode:
18053 case V4DFmode:
18054 case V2DFmode:
18055 vec_mode = mode;
18056 mode = GET_MODE_INNER (mode);
18057 imode = DImode;
18058 if (HOST_BITS_PER_WIDE_INT >= 64)
18059 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
18060 else
18061 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18062 break;
18064 case TImode:
18065 case TFmode:
18066 vec_mode = VOIDmode;
18067 if (HOST_BITS_PER_WIDE_INT >= 64)
18069 imode = TImode;
18070 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
18072 else
18074 rtvec vec;
18076 imode = DImode;
18077 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
18079 if (invert)
18081 lo = ~lo, hi = ~hi;
18082 v = constm1_rtx;
18084 else
18085 v = const0_rtx;
18087 mask = immed_double_const (lo, hi, imode);
18089 vec = gen_rtvec (2, v, mask);
18090 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
18091 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
18093 return v;
18095 break;
18097 default:
18098 gcc_unreachable ();
18101 if (invert)
18102 lo = ~lo, hi = ~hi;
18104 /* Force this value into the low part of a fp vector constant. */
18105 mask = immed_double_const (lo, hi, imode);
18106 mask = gen_lowpart (mode, mask);
18108 if (vec_mode == VOIDmode)
18109 return force_reg (mode, mask);
18111 v = ix86_build_const_vector (vec_mode, vect, mask);
18112 return force_reg (vec_mode, v);
18115 /* Generate code for floating point ABS or NEG. */
18117 void
18118 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
18119 rtx operands[])
18121 rtx mask, set, dst, src;
18122 bool use_sse = false;
18123 bool vector_mode = VECTOR_MODE_P (mode);
18124 enum machine_mode vmode = mode;
18126 if (vector_mode)
18127 use_sse = true;
18128 else if (mode == TFmode)
18129 use_sse = true;
18130 else if (TARGET_SSE_MATH)
18132 use_sse = SSE_FLOAT_MODE_P (mode);
18133 if (mode == SFmode)
18134 vmode = V4SFmode;
18135 else if (mode == DFmode)
18136 vmode = V2DFmode;
18139 /* NEG and ABS performed with SSE use bitwise mask operations.
18140 Create the appropriate mask now. */
18141 if (use_sse)
18142 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
18143 else
18144 mask = NULL_RTX;
18146 dst = operands[0];
18147 src = operands[1];
18149 set = gen_rtx_fmt_e (code, mode, src);
18150 set = gen_rtx_SET (VOIDmode, dst, set);
18152 if (mask)
18154 rtx use, clob;
18155 rtvec par;
18157 use = gen_rtx_USE (VOIDmode, mask);
18158 if (vector_mode)
18159 par = gen_rtvec (2, set, use);
18160 else
18162 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
18163 par = gen_rtvec (3, set, use, clob);
18165 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
18167 else
18168 emit_insn (set);
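/* An illustrative scalar sketch, not part of the original source, of
   the bit-mask approach used above: ABS clears the sign bit with an
   AND, while NEG would flip it with an XOR.  The union type-pun stands
   in for the SSE bitwise operation on the register; the helper name is
   hypothetical.  */
static double
example_sse_fabs (double x)
{
  union { double d; unsigned long long u; } v;

  v.d = x;
  v.u &= ~0x8000000000000000ULL;	/* Clear the sign bit.  */
  return v.d;
}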
18171 /* Expand a copysign operation. Special case operand 0 being a constant. */
18173 void
18174 ix86_expand_copysign (rtx operands[])
18176 enum machine_mode mode, vmode;
18177 rtx dest, op0, op1, mask, nmask;
18179 dest = operands[0];
18180 op0 = operands[1];
18181 op1 = operands[2];
18183 mode = GET_MODE (dest);
18185 if (mode == SFmode)
18186 vmode = V4SFmode;
18187 else if (mode == DFmode)
18188 vmode = V2DFmode;
18189 else
18190 vmode = mode;
18192 if (GET_CODE (op0) == CONST_DOUBLE)
18194 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
18196 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
18197 op0 = simplify_unary_operation (ABS, mode, op0, mode);
18199 if (mode == SFmode || mode == DFmode)
18201 if (op0 == CONST0_RTX (mode))
18202 op0 = CONST0_RTX (vmode);
18203 else
18205 rtx v = ix86_build_const_vector (vmode, false, op0);
18207 op0 = force_reg (vmode, v);
18210 else if (op0 != CONST0_RTX (mode))
18211 op0 = force_reg (mode, op0);
18213 mask = ix86_build_signbit_mask (vmode, 0, 0);
18215 if (mode == SFmode)
18216 copysign_insn = gen_copysignsf3_const;
18217 else if (mode == DFmode)
18218 copysign_insn = gen_copysigndf3_const;
18219 else
18220 copysign_insn = gen_copysigntf3_const;
18222 emit_insn (copysign_insn (dest, op0, op1, mask));
18224 else
18226 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
18228 nmask = ix86_build_signbit_mask (vmode, 0, 1);
18229 mask = ix86_build_signbit_mask (vmode, 0, 0);
18231 if (mode == SFmode)
18232 copysign_insn = gen_copysignsf3_var;
18233 else if (mode == DFmode)
18234 copysign_insn = gen_copysigndf3_var;
18235 else
18236 copysign_insn = gen_copysigntf3_var;
18238 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
18242 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
18243 be a constant, and so has already been expanded into a vector constant. */
18245 void
18246 ix86_split_copysign_const (rtx operands[])
18248 enum machine_mode mode, vmode;
18249 rtx dest, op0, mask, x;
18251 dest = operands[0];
18252 op0 = operands[1];
18253 mask = operands[3];
18255 mode = GET_MODE (dest);
18256 vmode = GET_MODE (mask);
18258 dest = simplify_gen_subreg (vmode, dest, mode, 0);
18259 x = gen_rtx_AND (vmode, dest, mask);
18260 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18262 if (op0 != CONST0_RTX (vmode))
18264 x = gen_rtx_IOR (vmode, dest, op0);
18265 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18269 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
18270 so we have to do two masks. */
18272 void
18273 ix86_split_copysign_var (rtx operands[])
18275 enum machine_mode mode, vmode;
18276 rtx dest, scratch, op0, op1, mask, nmask, x;
18278 dest = operands[0];
18279 scratch = operands[1];
18280 op0 = operands[2];
18281 op1 = operands[3];
18282 nmask = operands[4];
18283 mask = operands[5];
18285 mode = GET_MODE (dest);
18286 vmode = GET_MODE (mask);
18288 if (rtx_equal_p (op0, op1))
18290 /* Shouldn't happen often (it's useless, obviously), but when it does
18291 we'd generate incorrect code if we continue below. */
18292 emit_move_insn (dest, op0);
18293 return;
18296 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
18298 gcc_assert (REGNO (op1) == REGNO (scratch));
18300 x = gen_rtx_AND (vmode, scratch, mask);
18301 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18303 dest = mask;
18304 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18305 x = gen_rtx_NOT (vmode, dest);
18306 x = gen_rtx_AND (vmode, x, op0);
18307 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18309 else
18311 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
18313 x = gen_rtx_AND (vmode, scratch, mask);
18315 else /* alternative 2,4 */
18317 gcc_assert (REGNO (mask) == REGNO (scratch));
18318 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
18319 x = gen_rtx_AND (vmode, scratch, op1);
18321 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
18323 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
18325 dest = simplify_gen_subreg (vmode, op0, mode, 0);
18326 x = gen_rtx_AND (vmode, dest, nmask);
18328 else /* alternative 3,4 */
18330 gcc_assert (REGNO (nmask) == REGNO (dest));
18331 dest = nmask;
18332 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
18333 x = gen_rtx_AND (vmode, dest, op0);
18335 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18338 x = gen_rtx_IOR (vmode, dest, scratch);
18339 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
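/* An illustrative scalar sketch, not part of the original source, of
   the two-mask copysign split above: the magnitude is taken from the
   first operand with the inverted sign mask, the sign from the second
   operand with the sign mask, and the two are ORed together.  The
   helper name is hypothetical.  */
static double
example_copysign (double mag, double sgn)
{
  const unsigned long long sign_bit = 0x8000000000000000ULL;
  union { double d; unsigned long long u; } m, s, r;

  m.d = mag;
  s.d = sgn;
  r.u = (m.u & ~sign_bit) | (s.u & sign_bit);
  return r.d;
}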
18342 /* Return TRUE or FALSE depending on whether the first SET in INSN
18343 has source and destination with matching CC modes, and whether the
18344 CC mode is at least as constrained as REQ_MODE. */
18346 bool
18347 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
18349 rtx set;
18350 enum machine_mode set_mode;
18352 set = PATTERN (insn);
18353 if (GET_CODE (set) == PARALLEL)
18354 set = XVECEXP (set, 0, 0);
18355 gcc_assert (GET_CODE (set) == SET);
18356 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
18358 set_mode = GET_MODE (SET_DEST (set));
18359 switch (set_mode)
18361 case CCNOmode:
18362 if (req_mode != CCNOmode
18363 && (req_mode != CCmode
18364 || XEXP (SET_SRC (set), 1) != const0_rtx))
18365 return false;
18366 break;
18367 case CCmode:
18368 if (req_mode == CCGCmode)
18369 return false;
18370 /* FALLTHRU */
18371 case CCGCmode:
18372 if (req_mode == CCGOCmode || req_mode == CCNOmode)
18373 return false;
18374 /* FALLTHRU */
18375 case CCGOCmode:
18376 if (req_mode == CCZmode)
18377 return false;
18378 /* FALLTHRU */
18379 case CCZmode:
18380 break;
18382 case CCAmode:
18383 case CCCmode:
18384 case CCOmode:
18385 case CCSmode:
18386 if (set_mode != req_mode)
18387 return false;
18388 break;
18390 default:
18391 gcc_unreachable ();
18394 return GET_MODE (SET_SRC (set)) == set_mode;
18397 /* Generate insn patterns to do an integer compare of OPERANDS. */
18399 static rtx
18400 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
18402 enum machine_mode cmpmode;
18403 rtx tmp, flags;
18405 cmpmode = SELECT_CC_MODE (code, op0, op1);
18406 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
18408 /* This is very simple, but making the interface the same as in the
18409 FP case makes the rest of the code easier. */
18410 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
18411 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
18413 /* Return the test that should be put into the flags user, i.e.
18414 the bcc, scc, or cmov instruction. */
18415 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
18418 /* Figure out whether to use ordered or unordered fp comparisons.
18419 Return the appropriate mode to use. */
18421 enum machine_mode
18422 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
18424 /* ??? In order to make all comparisons reversible, we do all comparisons
18425 non-trapping when compiling for IEEE. Once gcc is able to distinguish
18426 all forms of trapping and nontrapping comparisons, we can make inequality
18427 comparisons trapping again, since it results in better code when using
18428 FCOM based compares. */
18429 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
18432 enum machine_mode
18433 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
18435 enum machine_mode mode = GET_MODE (op0);
18437 if (SCALAR_FLOAT_MODE_P (mode))
18439 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
18440 return ix86_fp_compare_mode (code);
18443 switch (code)
18445 /* Only zero flag is needed. */
18446 case EQ: /* ZF=0 */
18447 case NE: /* ZF!=0 */
18448 return CCZmode;
18449 /* Codes needing carry flag. */
18450 case GEU: /* CF=0 */
18451 case LTU: /* CF=1 */
18452 /* Detect overflow checks. They need just the carry flag. */
18453 if (GET_CODE (op0) == PLUS
18454 && rtx_equal_p (op1, XEXP (op0, 0)))
18455 return CCCmode;
18456 else
18457 return CCmode;
18458 case GTU: /* CF=0 & ZF=0 */
18459 case LEU: /* CF=1 | ZF=1 */
18460 return CCmode;
18461 /* Codes possibly doable only with sign flag when
18462 comparing against zero. */
18463 case GE: /* SF=OF or SF=0 */
18464 case LT: /* SF<>OF or SF=1 */
18465 if (op1 == const0_rtx)
18466 return CCGOCmode;
18467 else
18468 /* For other cases Carry flag is not required. */
18469 return CCGCmode;
18470 /* Codes doable only with the sign flag when comparing
18471 against zero, but we lack a jump instruction for it,
18472 so we need to use relational tests against overflow,
18473 which thus needs to be zero. */
18474 case GT: /* ZF=0 & SF=OF */
18475 case LE: /* ZF=1 | SF<>OF */
18476 if (op1 == const0_rtx)
18477 return CCNOmode;
18478 else
18479 return CCGCmode;
18480 /* The strcmp pattern does (use flags) and combine may ask us for the
18481 proper mode. */
18482 case USE:
18483 return CCmode;
18484 default:
18485 gcc_unreachable ();
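/* An illustrative example, not part of the original source, of the
   overflow-check shape recognized above: comparing a PLUS against one
   of its own operands with LTU/GEU needs only the carry flag, hence
   CCCmode.  The helper name is hypothetical.  */
static int
example_overflow_check (unsigned int a, unsigned int b)
{
  /* True iff the addition wrapped around, i.e. iff it set the carry
     flag; this is the (a + b) LTU a comparison the code above detects.  */
  return a + b < a;
}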
18489 /* Return the fixed registers used for condition codes. */
18491 static bool
18492 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
18494 *p1 = FLAGS_REG;
18495 *p2 = FPSR_REG;
18496 return true;
18499 /* If two condition code modes are compatible, return a condition code
18500 mode which is compatible with both. Otherwise, return
18501 VOIDmode. */
18503 static enum machine_mode
18504 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
18506 if (m1 == m2)
18507 return m1;
18509 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
18510 return VOIDmode;
18512 if ((m1 == CCGCmode && m2 == CCGOCmode)
18513 || (m1 == CCGOCmode && m2 == CCGCmode))
18514 return CCGCmode;
18516 if (m1 == CCZmode && (m2 == CCGCmode || m2 == CCGOCmode))
18517 return m2;
18518 else if (m2 == CCZmode && (m1 == CCGCmode || m1 == CCGOCmode))
18519 return m1;
18521 switch (m1)
18523 default:
18524 gcc_unreachable ();
18526 case CCmode:
18527 case CCGCmode:
18528 case CCGOCmode:
18529 case CCNOmode:
18530 case CCAmode:
18531 case CCCmode:
18532 case CCOmode:
18533 case CCSmode:
18534 case CCZmode:
18535 switch (m2)
18537 default:
18538 return VOIDmode;
18540 case CCmode:
18541 case CCGCmode:
18542 case CCGOCmode:
18543 case CCNOmode:
18544 case CCAmode:
18545 case CCCmode:
18546 case CCOmode:
18547 case CCSmode:
18548 case CCZmode:
18549 return CCmode;
18552 case CCFPmode:
18553 case CCFPUmode:
18554 /* These are only compatible with themselves, which we already
18555 checked above. */
18556 return VOIDmode;
18561 /* Return a comparison we can do that is equivalent to
18562 swap_condition (code), apart possibly from orderedness.
18563 But never change orderedness if TARGET_IEEE_FP, returning
18564 UNKNOWN in that case if necessary. */
18566 static enum rtx_code
18567 ix86_fp_swap_condition (enum rtx_code code)
18569 switch (code)
18571 case GT: /* GTU - CF=0 & ZF=0 */
18572 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
18573 case GE: /* GEU - CF=0 */
18574 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
18575 case UNLT: /* LTU - CF=1 */
18576 return TARGET_IEEE_FP ? UNKNOWN : GT;
18577 case UNLE: /* LEU - CF=1 | ZF=1 */
18578 return TARGET_IEEE_FP ? UNKNOWN : GE;
18579 default:
18580 return swap_condition (code);
18584 /* Return the cost of comparison CODE using the best strategy for performance.
18585 All following functions use the number of instructions as the cost metric.
18586 In the future this should be tweaked to compute bytes for optimize_size and
18587 take into account the performance of various instructions on various CPUs. */
18589 static int
18590 ix86_fp_comparison_cost (enum rtx_code code)
18592 int arith_cost;
18594 /* The cost of code using bit-twiddling on %ah. */
18595 switch (code)
18597 case UNLE:
18598 case UNLT:
18599 case LTGT:
18600 case GT:
18601 case GE:
18602 case UNORDERED:
18603 case ORDERED:
18604 case UNEQ:
18605 arith_cost = 4;
18606 break;
18607 case LT:
18608 case NE:
18609 case EQ:
18610 case UNGE:
18611 arith_cost = TARGET_IEEE_FP ? 5 : 4;
18612 break;
18613 case LE:
18614 case UNGT:
18615 arith_cost = TARGET_IEEE_FP ? 6 : 4;
18616 break;
18617 default:
18618 gcc_unreachable ();
18621 switch (ix86_fp_comparison_strategy (code))
18623 case IX86_FPCMP_COMI:
18624 return arith_cost > 4 ? 3 : 2;
18625 case IX86_FPCMP_SAHF:
18626 return arith_cost > 4 ? 4 : 3;
18627 default:
18628 return arith_cost;
18632 /* Return the strategy to use for floating-point comparisons. We assume that
18633 fcomi is always preferable where available, since that is also true when
18634 looking at size (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
18636 enum ix86_fpcmp_strategy
18637 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
18639 /* Do fcomi/sahf based test when profitable. */
18641 if (TARGET_CMOVE)
18642 return IX86_FPCMP_COMI;
18644 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
18645 return IX86_FPCMP_SAHF;
18647 return IX86_FPCMP_ARITH;
18650 /* Swap, force into registers, or otherwise massage the two operands
18651 to a fp comparison. The operands are updated in place; the new
18652 comparison code is returned. */
18654 static enum rtx_code
18655 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
18657 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
18658 rtx op0 = *pop0, op1 = *pop1;
18659 enum machine_mode op_mode = GET_MODE (op0);
18660 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
18662 /* All of the unordered compare instructions only work on registers.
18663 The same is true of the fcomi compare instructions. The XFmode
18664 compare instructions require registers except when comparing
18665 against zero or when converting operand 1 from fixed point to
18666 floating point. */
18668 if (!is_sse
18669 && (fpcmp_mode == CCFPUmode
18670 || (op_mode == XFmode
18671 && ! (standard_80387_constant_p (op0) == 1
18672 || standard_80387_constant_p (op1) == 1)
18673 && GET_CODE (op1) != FLOAT)
18674 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
18676 op0 = force_reg (op_mode, op0);
18677 op1 = force_reg (op_mode, op1);
18679 else
18681 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
18682 things around if they appear profitable, otherwise force op0
18683 into a register. */
18685 if (standard_80387_constant_p (op0) == 0
18686 || (MEM_P (op0)
18687 && ! (standard_80387_constant_p (op1) == 0
18688 || MEM_P (op1))))
18690 enum rtx_code new_code = ix86_fp_swap_condition (code);
18691 if (new_code != UNKNOWN)
18693 rtx tmp;
18694 tmp = op0, op0 = op1, op1 = tmp;
18695 code = new_code;
18699 if (!REG_P (op0))
18700 op0 = force_reg (op_mode, op0);
18702 if (CONSTANT_P (op1))
18704 int tmp = standard_80387_constant_p (op1);
18705 if (tmp == 0)
18706 op1 = validize_mem (force_const_mem (op_mode, op1));
18707 else if (tmp == 1)
18709 if (TARGET_CMOVE)
18710 op1 = force_reg (op_mode, op1);
18712 else
18713 op1 = force_reg (op_mode, op1);
18717 /* Try to rearrange the comparison to make it cheaper. */
18718 if (ix86_fp_comparison_cost (code)
18719 > ix86_fp_comparison_cost (swap_condition (code))
18720 && (REG_P (op1) || can_create_pseudo_p ()))
18722 rtx tmp;
18723 tmp = op0, op0 = op1, op1 = tmp;
18724 code = swap_condition (code);
18725 if (!REG_P (op0))
18726 op0 = force_reg (op_mode, op0);
18729 *pop0 = op0;
18730 *pop1 = op1;
18731 return code;
18734 /* Convert the comparison codes we use to represent FP comparisons to the
18735 integer code that will result in a proper branch. Return UNKNOWN if no such
18736 code is available. */
18738 enum rtx_code
18739 ix86_fp_compare_code_to_integer (enum rtx_code code)
18741 switch (code)
18743 case GT:
18744 return GTU;
18745 case GE:
18746 return GEU;
18747 case ORDERED:
18748 case UNORDERED:
18749 return code;
18750 break;
18751 case UNEQ:
18752 return EQ;
18753 break;
18754 case UNLT:
18755 return LTU;
18756 break;
18757 case UNLE:
18758 return LEU;
18759 break;
18760 case LTGT:
18761 return NE;
18762 break;
18763 default:
18764 return UNKNOWN;
18768 /* Generate insn patterns to do a floating point compare of OPERANDS. */
18770 static rtx
18771 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
18773 enum machine_mode fpcmp_mode, intcmp_mode;
18774 rtx tmp, tmp2;
18776 fpcmp_mode = ix86_fp_compare_mode (code);
18777 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
18779 /* Do fcomi/sahf based test when profitable. */
18780 switch (ix86_fp_comparison_strategy (code))
18782 case IX86_FPCMP_COMI:
18783 intcmp_mode = fpcmp_mode;
18784 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18785 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18786 tmp);
18787 emit_insn (tmp);
18788 break;
18790 case IX86_FPCMP_SAHF:
18791 intcmp_mode = fpcmp_mode;
18792 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18793 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
18794 tmp);
18796 if (!scratch)
18797 scratch = gen_reg_rtx (HImode);
18798 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
18799 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
18800 break;
18802 case IX86_FPCMP_ARITH:
18803 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
18804 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
18805 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
18806 if (!scratch)
18807 scratch = gen_reg_rtx (HImode);
18808 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
18810 /* In the unordered case, we have to check C2 for NaN's, which
18811 doesn't happen to work out to anything nice combination-wise.
18812 So do some bit twiddling on the value we've got in AH to come
18813 up with an appropriate set of condition codes. */
18815 intcmp_mode = CCNOmode;
18816 switch (code)
18818 case GT:
18819 case UNGT:
18820 if (code == GT || !TARGET_IEEE_FP)
18822 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18823 code = EQ;
18825 else
18827 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18828 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18829 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
18830 intcmp_mode = CCmode;
18831 code = GEU;
18833 break;
18834 case LT:
18835 case UNLT:
18836 if (code == LT && TARGET_IEEE_FP)
18838 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18839 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
18840 intcmp_mode = CCmode;
18841 code = EQ;
18843 else
18845 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
18846 code = NE;
18848 break;
18849 case GE:
18850 case UNGE:
18851 if (code == GE || !TARGET_IEEE_FP)
18853 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
18854 code = EQ;
18856 else
18858 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18859 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
18860 code = NE;
18862 break;
18863 case LE:
18864 case UNLE:
18865 if (code == LE && TARGET_IEEE_FP)
18867 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18868 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
18869 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18870 intcmp_mode = CCmode;
18871 code = LTU;
18873 else
18875 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
18876 code = NE;
18878 break;
18879 case EQ:
18880 case UNEQ:
18881 if (code == EQ && TARGET_IEEE_FP)
18883 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18884 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
18885 intcmp_mode = CCmode;
18886 code = EQ;
18888 else
18890 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18891 code = NE;
18893 break;
18894 case NE:
18895 case LTGT:
18896 if (code == NE && TARGET_IEEE_FP)
18898 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
18899 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
18900 GEN_INT (0x40)));
18901 code = NE;
18903 else
18905 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
18906 code = EQ;
18908 break;
18910 case UNORDERED:
18911 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18912 code = NE;
18913 break;
18914 case ORDERED:
18915 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
18916 code = EQ;
18917 break;
18919 default:
18920 gcc_unreachable ();
18922 break;
18924 default:
18925 gcc_unreachable();
18928 /* Return the test that should be put into the flags user, i.e.
18929 the bcc, scc, or cmov instruction. */
18930 return gen_rtx_fmt_ee (code, VOIDmode,
18931 gen_rtx_REG (intcmp_mode, FLAGS_REG),
18932 const0_rtx);
18935 static rtx
18936 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
18938 rtx ret;
18940 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
18941 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
18943 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
18945 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
18946 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
18948 else
18949 ret = ix86_expand_int_compare (code, op0, op1);
18951 return ret;
18954 void
18955 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
18957 enum machine_mode mode = GET_MODE (op0);
18958 rtx tmp;
18960 switch (mode)
18962 case SFmode:
18963 case DFmode:
18964 case XFmode:
18965 case QImode:
18966 case HImode:
18967 case SImode:
18968 simple:
18969 tmp = ix86_expand_compare (code, op0, op1);
18970 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
18971 gen_rtx_LABEL_REF (VOIDmode, label),
18972 pc_rtx);
18973 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
18974 return;
18976 case DImode:
18977 if (TARGET_64BIT)
18978 goto simple;
18979 case TImode:
18980 /* Expand DImode branch into multiple compare+branch. */
18982 rtx lo[2], hi[2], label2;
18983 enum rtx_code code1, code2, code3;
18984 enum machine_mode submode;
18986 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
18988 tmp = op0, op0 = op1, op1 = tmp;
18989 code = swap_condition (code);
18992 split_double_mode (mode, &op0, 1, lo+0, hi+0);
18993 split_double_mode (mode, &op1, 1, lo+1, hi+1);
18995 submode = mode == DImode ? SImode : DImode;
18997 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
18998 avoid two branches. This costs one extra insn, so disable when
18999 optimizing for size. */
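/* (Added illustration, a rough sketch: for a 64-bit EQ on a 32-bit target,
   a == b exactly when (hi(a) ^ hi(b)) | (lo(a) ^ lo(b)) == 0.  E.g. with
   a = 0x00000001_00000000 and b = 0x00000001_00000001 the high XOR is 0,
   the low XOR is 1, and the OR is non-zero, so a != b; a single branch on
   the OR-ed value against zero then suffices.)  */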
19001 if ((code == EQ || code == NE)
19002 && (!optimize_insn_for_size_p ()
19003 || hi[1] == const0_rtx || lo[1] == const0_rtx))
19005 rtx xor0, xor1;
19007 xor1 = hi[0];
19008 if (hi[1] != const0_rtx)
19009 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
19010 NULL_RTX, 0, OPTAB_WIDEN);
19012 xor0 = lo[0];
19013 if (lo[1] != const0_rtx)
19014 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
19015 NULL_RTX, 0, OPTAB_WIDEN);
19017 tmp = expand_binop (submode, ior_optab, xor1, xor0,
19018 NULL_RTX, 0, OPTAB_WIDEN);
19020 ix86_expand_branch (code, tmp, const0_rtx, label);
19021 return;
19024 /* Otherwise, if we are doing a less-than or greater-or-equal-than
19025 comparison, op1 is a constant, and its low word is zero, then we can
19026 just examine the high word. Similarly for a low word of -1 and
19027 less-or-equal-than or greater-than. */
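/* (Added example, informal: for unsigned a < 0x00000005_00000000 the low
   word of the constant is zero, so a < C exactly when hi(a) < 0x00000005;
   likewise a <= 0x00000004_FFFFFFFF, whose low word is all ones, exactly
   when hi(a) <= 0x00000004.  The same reasoning holds for the signed
   variants.)  */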
19029 if (CONST_INT_P (hi[1]))
19030 switch (code)
19032 case LT: case LTU: case GE: case GEU:
19033 if (lo[1] == const0_rtx)
19035 ix86_expand_branch (code, hi[0], hi[1], label);
19036 return;
19038 break;
19039 case LE: case LEU: case GT: case GTU:
19040 if (lo[1] == constm1_rtx)
19042 ix86_expand_branch (code, hi[0], hi[1], label);
19043 return;
19045 break;
19046 default:
19047 break;
19050 /* Otherwise, we need two or three jumps. */
19052 label2 = gen_label_rtx ();
19054 code1 = code;
19055 code2 = swap_condition (code);
19056 code3 = unsigned_condition (code);
19058 switch (code)
19060 case LT: case GT: case LTU: case GTU:
19061 break;
19063 case LE: code1 = LT; code2 = GT; break;
19064 case GE: code1 = GT; code2 = LT; break;
19065 case LEU: code1 = LTU; code2 = GTU; break;
19066 case GEU: code1 = GTU; code2 = LTU; break;
19068 case EQ: code1 = UNKNOWN; code2 = NE; break;
19069 case NE: code2 = UNKNOWN; break;
19071 default:
19072 gcc_unreachable ();
19076 * a < b =>
19077 * if (hi(a) < hi(b)) goto true;
19078 * if (hi(a) > hi(b)) goto false;
19079 * if (lo(a) < lo(b)) goto true;
19080 * false:
19083 if (code1 != UNKNOWN)
19084 ix86_expand_branch (code1, hi[0], hi[1], label);
19085 if (code2 != UNKNOWN)
19086 ix86_expand_branch (code2, hi[0], hi[1], label2);
19088 ix86_expand_branch (code3, lo[0], lo[1], label);
19090 if (code2 != UNKNOWN)
19091 emit_label (label2);
19092 return;
19095 default:
19096 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
19097 goto simple;
19101 /* Split branch based on floating point condition. */
19102 void
19103 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
19104 rtx target1, rtx target2, rtx tmp, rtx pushed)
19106 rtx condition;
19107 rtx i;
19109 if (target2 != pc_rtx)
19111 rtx tmp = target2;
19112 code = reverse_condition_maybe_unordered (code);
19113 target2 = target1;
19114 target1 = tmp;
19117 condition = ix86_expand_fp_compare (code, op1, op2,
19118 tmp);
19120 /* Remove pushed operand from stack. */
19121 if (pushed)
19122 ix86_free_from_memory (GET_MODE (pushed));
19124 i = emit_jump_insn (gen_rtx_SET
19125 (VOIDmode, pc_rtx,
19126 gen_rtx_IF_THEN_ELSE (VOIDmode,
19127 condition, target1, target2)));
19128 if (split_branch_probability >= 0)
19129 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
19132 void
19133 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
19135 rtx ret;
19137 gcc_assert (GET_MODE (dest) == QImode);
19139 ret = ix86_expand_compare (code, op0, op1);
19140 PUT_MODE (ret, QImode);
19141 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
19144 /* Expand comparison setting or clearing carry flag. Return true when
19145 successful and set pop for the operation. */
19146 static bool
19147 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
19149 enum machine_mode mode =
19150 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
19152 /* Do not handle double-mode compares that go through the special path. */
19153 if (mode == (TARGET_64BIT ? TImode : DImode))
19154 return false;
19156 if (SCALAR_FLOAT_MODE_P (mode))
19158 rtx compare_op, compare_seq;
19160 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
19162 /* Shortcut: the following common codes never translate
19163 into carry-flag compares. */
19164 if (code == EQ || code == NE || code == UNEQ || code == LTGT
19165 || code == ORDERED || code == UNORDERED)
19166 return false;
19168 /* These comparisons require the zero flag; swap the operands so they won't need it. */
19169 if ((code == GT || code == UNLE || code == LE || code == UNGT)
19170 && !TARGET_IEEE_FP)
19172 rtx tmp = op0;
19173 op0 = op1;
19174 op1 = tmp;
19175 code = swap_condition (code);
19178 /* Try to expand the comparison and verify that we end up with
19179 a carry-flag-based comparison. This fails to be true only when
19180 we decide to expand the comparison using arithmetic, which is
19181 not a common scenario. */
19182 start_sequence ();
19183 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
19184 compare_seq = get_insns ();
19185 end_sequence ();
19187 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
19188 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
19189 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
19190 else
19191 code = GET_CODE (compare_op);
19193 if (code != LTU && code != GEU)
19194 return false;
19196 emit_insn (compare_seq);
19197 *pop = compare_op;
19198 return true;
19201 if (!INTEGRAL_MODE_P (mode))
19202 return false;
19204 switch (code)
19206 case LTU:
19207 case GEU:
19208 break;
19210 /* Convert a==0 into (unsigned)a<1. */
19211 case EQ:
19212 case NE:
19213 if (op1 != const0_rtx)
19214 return false;
19215 op1 = const1_rtx;
19216 code = (code == EQ ? LTU : GEU);
19217 break;
19219 /* Convert a>b into b<a or a>=b+1. */
19220 case GTU:
19221 case LEU:
19222 if (CONST_INT_P (op1))
19224 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
19225 /* Bail out on overflow. We could still swap the operands, but that
19226 would force loading the constant into a register. */
19227 if (op1 == const0_rtx
19228 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
19229 return false;
19230 code = (code == GTU ? GEU : LTU);
19232 else
19234 rtx tmp = op1;
19235 op1 = op0;
19236 op0 = tmp;
19237 code = (code == GTU ? LTU : GEU);
19239 break;
19241 /* Convert a>=0 into (unsigned)a<0x80000000. */
19242 case LT:
19243 case GE:
19244 if (mode == DImode || op1 != const0_rtx)
19245 return false;
19246 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19247 code = (code == LT ? GEU : LTU);
19248 break;
19249 case LE:
19250 case GT:
19251 if (mode == DImode || op1 != constm1_rtx)
19252 return false;
19253 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
19254 code = (code == LE ? GEU : LTU);
19255 break;
19257 default:
19258 return false;
19260 /* Swapping operands may cause a constant to appear as the first operand. */
19261 if (!nonimmediate_operand (op0, VOIDmode))
19263 if (!can_create_pseudo_p ())
19264 return false;
19265 op0 = force_reg (mode, op0);
19267 *pop = ix86_expand_compare (code, op0, op1);
19268 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
19269 return true;
19272 bool
19273 ix86_expand_int_movcc (rtx operands[])
19275 enum rtx_code code = GET_CODE (operands[1]), compare_code;
19276 rtx compare_seq, compare_op;
19277 enum machine_mode mode = GET_MODE (operands[0]);
19278 bool sign_bit_compare_p = false;
19279 rtx op0 = XEXP (operands[1], 0);
19280 rtx op1 = XEXP (operands[1], 1);
19282 if (GET_MODE (op0) == TImode
19283 || (GET_MODE (op0) == DImode
19284 && !TARGET_64BIT))
19285 return false;
19287 start_sequence ();
19288 compare_op = ix86_expand_compare (code, op0, op1);
19289 compare_seq = get_insns ();
19290 end_sequence ();
19292 compare_code = GET_CODE (compare_op);
19294 if ((op1 == const0_rtx && (code == GE || code == LT))
19295 || (op1 == constm1_rtx && (code == GT || code == LE)))
19296 sign_bit_compare_p = true;
19298 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
19299 HImode insns, we'd be swallowed in word prefix ops. */
19301 if ((mode != HImode || TARGET_FAST_PREFIX)
19302 && (mode != (TARGET_64BIT ? TImode : DImode))
19303 && CONST_INT_P (operands[2])
19304 && CONST_INT_P (operands[3]))
19306 rtx out = operands[0];
19307 HOST_WIDE_INT ct = INTVAL (operands[2]);
19308 HOST_WIDE_INT cf = INTVAL (operands[3]);
19309 HOST_WIDE_INT diff;
19311 diff = ct - cf;
19312 /* Sign-bit compares are better done using shifts than by using
19313 sbb. */
19314 if (sign_bit_compare_p
19315 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
19317 /* Detect overlap between destination and compare sources. */
19318 rtx tmp = out;
19320 if (!sign_bit_compare_p)
19322 rtx flags;
19323 bool fpcmp = false;
19325 compare_code = GET_CODE (compare_op);
19327 flags = XEXP (compare_op, 0);
19329 if (GET_MODE (flags) == CCFPmode
19330 || GET_MODE (flags) == CCFPUmode)
19332 fpcmp = true;
19333 compare_code
19334 = ix86_fp_compare_code_to_integer (compare_code);
19337 /* To simplify the rest of the code, restrict to the GEU case. */
19338 if (compare_code == LTU)
19340 HOST_WIDE_INT tmp = ct;
19341 ct = cf;
19342 cf = tmp;
19343 compare_code = reverse_condition (compare_code);
19344 code = reverse_condition (code);
19346 else
19348 if (fpcmp)
19349 PUT_CODE (compare_op,
19350 reverse_condition_maybe_unordered
19351 (GET_CODE (compare_op)));
19352 else
19353 PUT_CODE (compare_op,
19354 reverse_condition (GET_CODE (compare_op)));
19356 diff = ct - cf;
19358 if (reg_overlap_mentioned_p (out, op0)
19359 || reg_overlap_mentioned_p (out, op1))
19360 tmp = gen_reg_rtx (mode);
19362 if (mode == DImode)
19363 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
19364 else
19365 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
19366 flags, compare_op));
19368 else
19370 if (code == GT || code == GE)
19371 code = reverse_condition (code);
19372 else
19374 HOST_WIDE_INT tmp = ct;
19375 ct = cf;
19376 cf = tmp;
19377 diff = ct - cf;
19379 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
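/* (Added note, an informal sketch: both paths above leave TMP as an
   all-ones / all-zeros mask -- "sbb reg, reg" computes reg - reg - CF,
   i.e. -CF, and the emit_store_flag call just above is asked for a -1/0
   result.  The arithmetic below then maps the mask onto the two
   constants; e.g. in the general case (mask & (cf - ct)) + ct is cf when
   the mask is -1 and ct when the mask is 0.)  */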
19382 if (diff == 1)
19385 * cmpl op0,op1
19386 * sbbl dest,dest
19387 * [addl dest, ct]
19389 * Size 5 - 8.
19391 if (ct)
19392 tmp = expand_simple_binop (mode, PLUS,
19393 tmp, GEN_INT (ct),
19394 copy_rtx (tmp), 1, OPTAB_DIRECT);
19396 else if (cf == -1)
19399 * cmpl op0,op1
19400 * sbbl dest,dest
19401 * orl $ct, dest
19403 * Size 8.
19405 tmp = expand_simple_binop (mode, IOR,
19406 tmp, GEN_INT (ct),
19407 copy_rtx (tmp), 1, OPTAB_DIRECT);
19409 else if (diff == -1 && ct)
19412 * cmpl op0,op1
19413 * sbbl dest,dest
19414 * notl dest
19415 * [addl dest, cf]
19417 * Size 8 - 11.
19419 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19420 if (cf)
19421 tmp = expand_simple_binop (mode, PLUS,
19422 copy_rtx (tmp), GEN_INT (cf),
19423 copy_rtx (tmp), 1, OPTAB_DIRECT);
19425 else
19428 * cmpl op0,op1
19429 * sbbl dest,dest
19430 * [notl dest]
19431 * andl cf - ct, dest
19432 * [addl dest, ct]
19434 * Size 8 - 11.
19437 if (cf == 0)
19439 cf = ct;
19440 ct = 0;
19441 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
19444 tmp = expand_simple_binop (mode, AND,
19445 copy_rtx (tmp),
19446 gen_int_mode (cf - ct, mode),
19447 copy_rtx (tmp), 1, OPTAB_DIRECT);
19448 if (ct)
19449 tmp = expand_simple_binop (mode, PLUS,
19450 copy_rtx (tmp), GEN_INT (ct),
19451 copy_rtx (tmp), 1, OPTAB_DIRECT);
19454 if (!rtx_equal_p (tmp, out))
19455 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
19457 return true;
19460 if (diff < 0)
19462 enum machine_mode cmp_mode = GET_MODE (op0);
19464 HOST_WIDE_INT tmp;
19465 tmp = ct, ct = cf, cf = tmp;
19466 diff = -diff;
19468 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19470 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19472 /* We may be reversing an unordered compare to a normal compare, which
19473 is not valid in general (we may convert a non-trapping condition
19474 to a trapping one); however, on i386 we currently emit all
19475 comparisons unordered. */
19476 compare_code = reverse_condition_maybe_unordered (compare_code);
19477 code = reverse_condition_maybe_unordered (code);
19479 else
19481 compare_code = reverse_condition (compare_code);
19482 code = reverse_condition (code);
19486 compare_code = UNKNOWN;
19487 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
19488 && CONST_INT_P (op1))
19490 if (op1 == const0_rtx
19491 && (code == LT || code == GE))
19492 compare_code = code;
19493 else if (op1 == constm1_rtx)
19495 if (code == LE)
19496 compare_code = LT;
19497 else if (code == GT)
19498 compare_code = GE;
19502 /* Optimize dest = (op0 < 0) ? -1 : cf. */
19503 if (compare_code != UNKNOWN
19504 && GET_MODE (op0) == GET_MODE (out)
19505 && (cf == -1 || ct == -1))
19507 /* If the lea code below could be used, only optimize
19508 if it results in a 2-insn sequence. */
19510 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
19511 || diff == 3 || diff == 5 || diff == 9)
19512 || (compare_code == LT && ct == -1)
19513 || (compare_code == GE && cf == -1))
19516 * notl op1 (if necessary)
19517 * sarl $31, op1
19518 * orl cf, op1
19520 if (ct != -1)
19522 cf = ct;
19523 ct = -1;
19524 code = reverse_condition (code);
19527 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19529 out = expand_simple_binop (mode, IOR,
19530 out, GEN_INT (cf),
19531 out, 1, OPTAB_DIRECT);
19532 if (out != operands[0])
19533 emit_move_insn (operands[0], out);
19535 return true;
19540 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
19541 || diff == 3 || diff == 5 || diff == 9)
19542 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
19543 && (mode != DImode
19544 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
19547 * xorl dest,dest
19548 * cmpl op1,op2
19549 * setcc dest
19550 * lea cf(dest*(ct-cf)),dest
19552 * Size 14.
19554 * This also catches the degenerate setcc-only case.
19557 rtx tmp;
19558 int nops;
19560 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19562 nops = 0;
19563 /* On x86_64 the lea instruction operates on Pmode, so we need
19564 to get the arithmetic done in the proper mode to match. */
19565 if (diff == 1)
19566 tmp = copy_rtx (out);
19567 else
19569 rtx out1;
19570 out1 = copy_rtx (out);
19571 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
19572 nops++;
19573 if (diff & 1)
19575 tmp = gen_rtx_PLUS (mode, tmp, out1);
19576 nops++;
19579 if (cf != 0)
19581 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
19582 nops++;
19584 if (!rtx_equal_p (tmp, out))
19586 if (nops == 1)
19587 out = force_operand (tmp, copy_rtx (out));
19588 else
19589 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
19591 if (!rtx_equal_p (out, operands[0]))
19592 emit_move_insn (operands[0], copy_rtx (out));
19594 return true;
19598 * General case: Jumpful:
19599 * xorl dest,dest cmpl op1, op2
19600 * cmpl op1, op2 movl ct, dest
19601 * setcc dest jcc 1f
19602 * decl dest movl cf, dest
19603 * andl (cf-ct),dest 1:
19604 * addl ct,dest
19606 * Size 20. Size 14.
19608 * This is reasonably steep, but branch mispredict costs are
19609 * high on modern cpus, so consider failing only if optimizing
19610 * for space.
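/* (Added walk-through, informal, of the branchless sequence above with
   ct=5, cf=2: setcc gives 1 or 0; decl gives 0 or -1; andl with
   (cf - ct) = -3 gives 0 or -3; addl ct = 5 gives 5 or 2 -- i.e. ct when
   the condition held and cf when it did not.)  */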
19613 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19614 && BRANCH_COST (optimize_insn_for_speed_p (),
19615 false) >= 2)
19617 if (cf == 0)
19619 enum machine_mode cmp_mode = GET_MODE (op0);
19621 cf = ct;
19622 ct = 0;
19624 if (SCALAR_FLOAT_MODE_P (cmp_mode))
19626 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
19628 /* We may be reversing an unordered compare to a normal compare,
19629 which is not valid in general (we may convert a non-trapping
19630 condition to a trapping one); however, on i386 we currently
19631 emit all comparisons unordered. */
19632 code = reverse_condition_maybe_unordered (code);
19634 else
19636 code = reverse_condition (code);
19637 if (compare_code != UNKNOWN)
19638 compare_code = reverse_condition (compare_code);
19642 if (compare_code != UNKNOWN)
19644 /* notl op1 (if needed)
19645 sarl $31, op1
19646 andl (cf-ct), op1
19647 addl ct, op1
19649 For x < 0 (resp. x <= -1) there will be no notl,
19650 so if possible swap the constants to get rid of the
19651 complement.
19652 True/false will be -1/0 while code below (store flag
19653 followed by decrement) is 0/-1, so the constants need
19654 to be exchanged once more. */
19656 if (compare_code == GE || !cf)
19658 code = reverse_condition (code);
19659 compare_code = LT;
19661 else
19663 HOST_WIDE_INT tmp = cf;
19664 cf = ct;
19665 ct = tmp;
19668 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
19670 else
19672 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
19674 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
19675 constm1_rtx,
19676 copy_rtx (out), 1, OPTAB_DIRECT);
19679 out = expand_simple_binop (mode, AND, copy_rtx (out),
19680 gen_int_mode (cf - ct, mode),
19681 copy_rtx (out), 1, OPTAB_DIRECT);
19682 if (ct)
19683 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
19684 copy_rtx (out), 1, OPTAB_DIRECT);
19685 if (!rtx_equal_p (out, operands[0]))
19686 emit_move_insn (operands[0], copy_rtx (out));
19688 return true;
19692 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
19694 /* Try a few things more with specific constants and a variable. */
19696 optab op;
19697 rtx var, orig_out, out, tmp;
19699 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
19700 return false;
19702 /* If one of the two operands is an interesting constant, load a
19703 constant with the above and mask it in with a logical operation. */
19705 if (CONST_INT_P (operands[2]))
19707 var = operands[3];
19708 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
19709 operands[3] = constm1_rtx, op = and_optab;
19710 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
19711 operands[3] = const0_rtx, op = ior_optab;
19712 else
19713 return false;
19715 else if (CONST_INT_P (operands[3]))
19717 var = operands[2];
19718 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
19719 operands[2] = constm1_rtx, op = and_optab;
19720 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
19721 operands[2] = const0_rtx, op = ior_optab;
19722 else
19723 return false;
19725 else
19726 return false;
19728 orig_out = operands[0];
19729 tmp = gen_reg_rtx (mode);
19730 operands[0] = tmp;
19732 /* Recurse to get the constant loaded. */
19733 if (ix86_expand_int_movcc (operands) == 0)
19734 return false;
19736 /* Mask in the interesting variable. */
19737 out = expand_binop (mode, op, var, tmp, orig_out, 0,
19738 OPTAB_WIDEN);
19739 if (!rtx_equal_p (out, orig_out))
19740 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
19742 return true;
19746 * For comparison with above,
19748 * movl cf,dest
19749 * movl ct,tmp
19750 * cmpl op1,op2
19751 * cmovcc tmp,dest
19753 * Size 15.
19756 if (! nonimmediate_operand (operands[2], mode))
19757 operands[2] = force_reg (mode, operands[2]);
19758 if (! nonimmediate_operand (operands[3], mode))
19759 operands[3] = force_reg (mode, operands[3]);
19761 if (! register_operand (operands[2], VOIDmode)
19762 && (mode == QImode
19763 || ! register_operand (operands[3], VOIDmode)))
19764 operands[2] = force_reg (mode, operands[2]);
19766 if (mode == QImode
19767 && ! register_operand (operands[3], VOIDmode))
19768 operands[3] = force_reg (mode, operands[3]);
19770 emit_insn (compare_seq);
19771 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
19772 gen_rtx_IF_THEN_ELSE (mode,
19773 compare_op, operands[2],
19774 operands[3])));
19775 return true;
19778 /* Swap, force into registers, or otherwise massage the two operands
19779 to an sse comparison with a mask result. Thus we differ a bit from
19780 ix86_prepare_fp_compare_args which expects to produce a flags result.
19782 The DEST operand exists to help determine whether to commute commutative
19783 operators. The POP0/POP1 operands are updated in place. The new
19784 comparison code is returned, or UNKNOWN if not implementable. */
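/* (Added example, informal: the pre-AVX CMPPS/CMPSS predicates cover
   eq/lt/le/unord/neq/nlt/nle/ord but not gt/ge, so e.g. a GT comparison
   a > b is handled below by swapping *POP0 and *POP1 and using
   swap_condition, turning it into b < a.)  */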
19786 static enum rtx_code
19787 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
19788 rtx *pop0, rtx *pop1)
19790 rtx tmp;
19792 switch (code)
19794 case LTGT:
19795 case UNEQ:
19796 /* AVX supports all the needed comparisons. */
19797 if (TARGET_AVX)
19798 break;
19799 /* We have no LTGT as an operator. We could implement it with
19800 NE & ORDERED, but this requires an extra temporary. It's
19801 not clear that it's worth it. */
19802 return UNKNOWN;
19804 case LT:
19805 case LE:
19806 case UNGT:
19807 case UNGE:
19808 /* These are supported directly. */
19809 break;
19811 case EQ:
19812 case NE:
19813 case UNORDERED:
19814 case ORDERED:
19815 /* AVX has 3 operand comparisons, no need to swap anything. */
19816 if (TARGET_AVX)
19817 break;
19818 /* For commutative operators, try to canonicalize the destination
19819 operand to be first in the comparison - this helps reload to
19820 avoid extra moves. */
19821 if (!dest || !rtx_equal_p (dest, *pop1))
19822 break;
19823 /* FALLTHRU */
19825 case GE:
19826 case GT:
19827 case UNLE:
19828 case UNLT:
19829 /* These are not supported directly before AVX, and furthermore
19830 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
19831 comparison operands to transform into something that is
19832 supported. */
19833 tmp = *pop0;
19834 *pop0 = *pop1;
19835 *pop1 = tmp;
19836 code = swap_condition (code);
19837 break;
19839 default:
19840 gcc_unreachable ();
19843 return code;
19846 /* Detect conditional moves that exactly match min/max operational
19847 semantics. Note that this is IEEE safe, as long as we don't
19848 interchange the operands.
19850 Returns FALSE if this conditional move doesn't match a MIN/MAX,
19851 and TRUE if the operation is successful and instructions are emitted. */
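/* (Added note, informal: e.g. "a < b ? a : b" matches the LT shape with
   cmp_op0 == if_true and cmp_op1 == if_false and so becomes a single
   min-style operation.  The SSE min/max instructions are asymmetric for
   NaNs and signed zeros, which is why the operands must be kept in this
   order rather than being interchanged.)  */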
19853 static bool
19854 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
19855 rtx cmp_op1, rtx if_true, rtx if_false)
19857 enum machine_mode mode;
19858 bool is_min;
19859 rtx tmp;
19861 if (code == LT)
19863 else if (code == UNGE)
19865 tmp = if_true;
19866 if_true = if_false;
19867 if_false = tmp;
19869 else
19870 return false;
19872 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
19873 is_min = true;
19874 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
19875 is_min = false;
19876 else
19877 return false;
19879 mode = GET_MODE (dest);
19881 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
19882 but MODE may be a vector mode and thus not appropriate. */
19883 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
19885 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
19886 rtvec v;
19888 if_true = force_reg (mode, if_true);
19889 v = gen_rtvec (2, if_true, if_false);
19890 tmp = gen_rtx_UNSPEC (mode, v, u);
19892 else
19894 code = is_min ? SMIN : SMAX;
19895 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
19898 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
19899 return true;
19902 /* Expand an sse vector comparison. Return the register with the result. */
19904 static rtx
19905 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
19906 rtx op_true, rtx op_false)
19908 enum machine_mode mode = GET_MODE (dest);
19909 enum machine_mode cmp_mode = GET_MODE (cmp_op0);
19910 rtx x;
19912 cmp_op0 = force_reg (cmp_mode, cmp_op0);
19913 if (!nonimmediate_operand (cmp_op1, cmp_mode))
19914 cmp_op1 = force_reg (cmp_mode, cmp_op1);
19916 if (optimize
19917 || reg_overlap_mentioned_p (dest, op_true)
19918 || reg_overlap_mentioned_p (dest, op_false))
19919 dest = gen_reg_rtx (mode);
19921 x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
19922 if (cmp_mode != mode)
19924 x = force_reg (cmp_mode, x);
19925 convert_move (dest, x, false);
19927 else
19928 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19930 return dest;
19933 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
19934 operations. This is used for both scalar and vector conditional moves. */
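/* (Added sketch of the fallback expansion at the end of this function,
   assuming CMP is already an all-ones / all-zeros per-element mask:
     t2   = op_true  &  cmp
     t3   = op_false & ~cmp
     dest = t3 | t2
   The earlier branches are shortcuts for the cases where one arm is
   constant 0 or constant -1, or where a blend instruction is available.)  */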
19936 static void
19937 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
19939 enum machine_mode mode = GET_MODE (dest);
19940 rtx t2, t3, x;
19942 if (vector_all_ones_operand (op_true, mode)
19943 && rtx_equal_p (op_false, CONST0_RTX (mode)))
19945 emit_insn (gen_rtx_SET (VOIDmode, dest, cmp));
19947 else if (op_false == CONST0_RTX (mode))
19949 op_true = force_reg (mode, op_true);
19950 x = gen_rtx_AND (mode, cmp, op_true);
19951 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19953 else if (op_true == CONST0_RTX (mode))
19955 op_false = force_reg (mode, op_false);
19956 x = gen_rtx_NOT (mode, cmp);
19957 x = gen_rtx_AND (mode, x, op_false);
19958 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19960 else if (INTEGRAL_MODE_P (mode) && op_true == CONSTM1_RTX (mode))
19962 op_false = force_reg (mode, op_false);
19963 x = gen_rtx_IOR (mode, cmp, op_false);
19964 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
19966 else if (TARGET_XOP)
19968 op_true = force_reg (mode, op_true);
19970 if (!nonimmediate_operand (op_false, mode))
19971 op_false = force_reg (mode, op_false);
19973 emit_insn (gen_rtx_SET (mode, dest,
19974 gen_rtx_IF_THEN_ELSE (mode, cmp,
19975 op_true,
19976 op_false)));
19978 else
19980 rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
19982 if (!nonimmediate_operand (op_true, mode))
19983 op_true = force_reg (mode, op_true);
19985 op_false = force_reg (mode, op_false);
19987 switch (mode)
19989 case V4SFmode:
19990 if (TARGET_SSE4_1)
19991 gen = gen_sse4_1_blendvps;
19992 break;
19993 case V2DFmode:
19994 if (TARGET_SSE4_1)
19995 gen = gen_sse4_1_blendvpd;
19996 break;
19997 case V16QImode:
19998 case V8HImode:
19999 case V4SImode:
20000 case V2DImode:
20001 if (TARGET_SSE4_1)
20003 gen = gen_sse4_1_pblendvb;
20004 dest = gen_lowpart (V16QImode, dest);
20005 op_false = gen_lowpart (V16QImode, op_false);
20006 op_true = gen_lowpart (V16QImode, op_true);
20007 cmp = gen_lowpart (V16QImode, cmp);
20009 break;
20010 case V8SFmode:
20011 if (TARGET_AVX)
20012 gen = gen_avx_blendvps256;
20013 break;
20014 case V4DFmode:
20015 if (TARGET_AVX)
20016 gen = gen_avx_blendvpd256;
20017 break;
20018 case V32QImode:
20019 case V16HImode:
20020 case V8SImode:
20021 case V4DImode:
20022 if (TARGET_AVX2)
20024 gen = gen_avx2_pblendvb;
20025 dest = gen_lowpart (V32QImode, dest);
20026 op_false = gen_lowpart (V32QImode, op_false);
20027 op_true = gen_lowpart (V32QImode, op_true);
20028 cmp = gen_lowpart (V32QImode, cmp);
20030 break;
20031 default:
20032 break;
20035 if (gen != NULL)
20036 emit_insn (gen (dest, op_false, op_true, cmp));
20037 else
20039 op_true = force_reg (mode, op_true);
20041 t2 = gen_reg_rtx (mode);
20042 if (optimize)
20043 t3 = gen_reg_rtx (mode);
20044 else
20045 t3 = dest;
20047 x = gen_rtx_AND (mode, op_true, cmp);
20048 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
20050 x = gen_rtx_NOT (mode, cmp);
20051 x = gen_rtx_AND (mode, x, op_false);
20052 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
20054 x = gen_rtx_IOR (mode, t3, t2);
20055 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
20060 /* Expand a floating-point conditional move. Return true if successful. */
20062 bool
20063 ix86_expand_fp_movcc (rtx operands[])
20065 enum machine_mode mode = GET_MODE (operands[0]);
20066 enum rtx_code code = GET_CODE (operands[1]);
20067 rtx tmp, compare_op;
20068 rtx op0 = XEXP (operands[1], 0);
20069 rtx op1 = XEXP (operands[1], 1);
20071 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
20073 enum machine_mode cmode;
20075 /* Since we've no cmove for sse registers, don't force bad register
20076 allocation just to gain access to it. Deny movcc when the
20077 comparison mode doesn't match the move mode. */
20078 cmode = GET_MODE (op0);
20079 if (cmode == VOIDmode)
20080 cmode = GET_MODE (op1);
20081 if (cmode != mode)
20082 return false;
20084 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
20085 if (code == UNKNOWN)
20086 return false;
20088 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
20089 operands[2], operands[3]))
20090 return true;
20092 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
20093 operands[2], operands[3]);
20094 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
20095 return true;
20098 if (GET_MODE (op0) == TImode
20099 || (GET_MODE (op0) == DImode
20100 && !TARGET_64BIT))
20101 return false;
20103 /* The floating point conditional move instructions don't directly
20104 support conditions resulting from a signed integer comparison. */
20106 compare_op = ix86_expand_compare (code, op0, op1);
20107 if (!fcmov_comparison_operator (compare_op, VOIDmode))
20109 tmp = gen_reg_rtx (QImode);
20110 ix86_expand_setcc (tmp, code, op0, op1);
20112 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
20115 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
20116 gen_rtx_IF_THEN_ELSE (mode, compare_op,
20117 operands[2], operands[3])));
20119 return true;
20122 /* Expand a floating-point vector conditional move; a vcond operation
20123 rather than a movcc operation. */
20125 bool
20126 ix86_expand_fp_vcond (rtx operands[])
20128 enum rtx_code code = GET_CODE (operands[3]);
20129 rtx cmp;
20131 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
20132 &operands[4], &operands[5]);
20133 if (code == UNKNOWN)
20135 rtx temp;
20136 switch (GET_CODE (operands[3]))
20138 case LTGT:
20139 temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
20140 operands[5], operands[0], operands[0]);
20141 cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
20142 operands[5], operands[1], operands[2]);
20143 code = AND;
20144 break;
20145 case UNEQ:
20146 temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
20147 operands[5], operands[0], operands[0]);
20148 cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
20149 operands[5], operands[1], operands[2]);
20150 code = IOR;
20151 break;
20152 default:
20153 gcc_unreachable ();
20155 cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
20156 OPTAB_DIRECT);
20157 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20158 return true;
20161 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
20162 operands[5], operands[1], operands[2]))
20163 return true;
20165 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
20166 operands[1], operands[2]);
20167 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
20168 return true;
20171 /* Expand a signed/unsigned integral vector conditional move. */
20173 bool
20174 ix86_expand_int_vcond (rtx operands[])
20176 enum machine_mode data_mode = GET_MODE (operands[0]);
20177 enum machine_mode mode = GET_MODE (operands[4]);
20178 enum rtx_code code = GET_CODE (operands[3]);
20179 bool negate = false;
20180 rtx x, cop0, cop1;
20182 cop0 = operands[4];
20183 cop1 = operands[5];
20185 /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
20186 and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
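/* (Added example, informal, for 32-bit elements: x = -5 is 0xfffffffb, so
   an arithmetic shift right by 31 yields 0xffffffff = -1, while a logical
   shift right by 31 yields 1; for x >= 0 both shifts yield 0.)  */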
20187 if ((code == LT || code == GE)
20188 && data_mode == mode
20189 && cop1 == CONST0_RTX (mode)
20190 && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
20191 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
20192 && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
20193 && (GET_MODE_SIZE (data_mode) == 16
20194 || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
20196 rtx negop = operands[2 - (code == LT)];
20197 int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
20198 if (negop == CONST1_RTX (data_mode))
20200 rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
20201 operands[0], 1, OPTAB_DIRECT);
20202 if (res != operands[0])
20203 emit_move_insn (operands[0], res);
20204 return true;
20206 else if (GET_MODE_INNER (data_mode) != DImode
20207 && vector_all_ones_operand (negop, data_mode))
20209 rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
20210 operands[0], 0, OPTAB_DIRECT);
20211 if (res != operands[0])
20212 emit_move_insn (operands[0], res);
20213 return true;
20217 if (!nonimmediate_operand (cop1, mode))
20218 cop1 = force_reg (mode, cop1);
20219 if (!general_operand (operands[1], data_mode))
20220 operands[1] = force_reg (data_mode, operands[1]);
20221 if (!general_operand (operands[2], data_mode))
20222 operands[2] = force_reg (data_mode, operands[2]);
20224 /* XOP supports all of the comparisons on all 128-bit vector int types. */
20225 if (TARGET_XOP
20226 && (mode == V16QImode || mode == V8HImode
20227 || mode == V4SImode || mode == V2DImode))
20229 else
20231 /* Canonicalize the comparison to EQ, GT, GTU. */
20232 switch (code)
20234 case EQ:
20235 case GT:
20236 case GTU:
20237 break;
20239 case NE:
20240 case LE:
20241 case LEU:
20242 code = reverse_condition (code);
20243 negate = true;
20244 break;
20246 case GE:
20247 case GEU:
20248 code = reverse_condition (code);
20249 negate = true;
20250 /* FALLTHRU */
20252 case LT:
20253 case LTU:
20254 code = swap_condition (code);
20255 x = cop0, cop0 = cop1, cop1 = x;
20256 break;
20258 default:
20259 gcc_unreachable ();
20262 /* Only SSE4.1/SSE4.2 supports V2DImode. */
20263 if (mode == V2DImode)
20265 switch (code)
20267 case EQ:
20268 /* SSE4.1 supports EQ. */
20269 if (!TARGET_SSE4_1)
20270 return false;
20271 break;
20273 case GT:
20274 case GTU:
20275 /* SSE4.2 supports GT/GTU. */
20276 if (!TARGET_SSE4_2)
20277 return false;
20278 break;
20280 default:
20281 gcc_unreachable ();
20285 /* Unsigned parallel compare is not supported by the hardware.
20286 Play some tricks to turn this into a signed comparison
20287 against 0. */
20288 if (code == GTU)
20290 cop0 = force_reg (mode, cop0);
20292 switch (mode)
20294 case V8SImode:
20295 case V4DImode:
20296 case V4SImode:
20297 case V2DImode:
20299 rtx t1, t2, mask;
20300 rtx (*gen_sub3) (rtx, rtx, rtx);
20302 switch (mode)
20304 case V8SImode: gen_sub3 = gen_subv8si3; break;
20305 case V4DImode: gen_sub3 = gen_subv4di3; break;
20306 case V4SImode: gen_sub3 = gen_subv4si3; break;
20307 case V2DImode: gen_sub3 = gen_subv2di3; break;
20308 default:
20309 gcc_unreachable ();
20311 /* Subtract (-(INT MAX) - 1) from both operands to make
20312 them signed. */
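/* (Added example, informal: biasing by the sign bit turns an unsigned
   order into a signed one.  For 32-bit elements, 0xffffffff >u 0x00000001;
   after subtracting 0x80000000 from both we compare 0x7fffffff against
   0x80000001 (= -2147483647), which signed GT handles directly.)  */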
20313 mask = ix86_build_signbit_mask (mode, true, false);
20314 t1 = gen_reg_rtx (mode);
20315 emit_insn (gen_sub3 (t1, cop0, mask));
20317 t2 = gen_reg_rtx (mode);
20318 emit_insn (gen_sub3 (t2, cop1, mask));
20320 cop0 = t1;
20321 cop1 = t2;
20322 code = GT;
20324 break;
20326 case V32QImode:
20327 case V16HImode:
20328 case V16QImode:
20329 case V8HImode:
20330 /* Perform a parallel unsigned saturating subtraction. */
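/* (Added note, informal: the unsigned saturating subtract computes
   max(a - b, 0), which is non-zero exactly when a >u b; comparing the
   result against zero with EQ and then flipping NEGATE recovers the
   unsigned greater-than.)  */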
20331 x = gen_reg_rtx (mode);
20332 emit_insn (gen_rtx_SET (VOIDmode, x,
20333 gen_rtx_US_MINUS (mode, cop0, cop1)));
20335 cop0 = x;
20336 cop1 = CONST0_RTX (mode);
20337 code = EQ;
20338 negate = !negate;
20339 break;
20341 default:
20342 gcc_unreachable ();
20347 /* Allow the comparison to be done in one mode, but the movcc to
20348 happen in another mode. */
20349 if (data_mode == mode)
20351 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
20352 operands[1+negate], operands[2-negate]);
20354 else
20356 gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
20357 x = ix86_expand_sse_cmp (gen_lowpart (mode, operands[0]),
20358 code, cop0, cop1,
20359 operands[1+negate], operands[2-negate]);
20360 x = gen_lowpart (data_mode, x);
20363 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
20364 operands[2-negate]);
20365 return true;
20368 /* Expand a variable vector permutation. */
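/* (Added overview, a rough sketch of the strategy used below: the control
   vector is first masked down to valid element indices, then converted
   into a byte-level shuffle control; a pshufb per input (or a single XOP
   vpperm) applies it, and for two-operand shuffles the two results are
   merged by selecting, per element, whichever input the original index
   pointed at.  The AVX2 paths at the top handle the wider modes
   specially.)  */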
20370 void
20371 ix86_expand_vec_perm (rtx operands[])
20373 rtx target = operands[0];
20374 rtx op0 = operands[1];
20375 rtx op1 = operands[2];
20376 rtx mask = operands[3];
20377 rtx t1, t2, t3, t4, vt, vt2, vec[32];
20378 enum machine_mode mode = GET_MODE (op0);
20379 enum machine_mode maskmode = GET_MODE (mask);
20380 int w, e, i;
20381 bool one_operand_shuffle = rtx_equal_p (op0, op1);
20383 /* Number of elements in the vector. */
20384 w = GET_MODE_NUNITS (mode);
20385 e = GET_MODE_UNIT_SIZE (mode);
20386 gcc_assert (w <= 32);
20388 if (TARGET_AVX2)
20390 if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
20392 /* Unfortunately, the VPERMQ and VPERMPD instructions only support
20393 a constant shuffle operand. With a tiny bit of effort we can
20394 use VPERMD instead. A re-interpretation stall for V4DFmode is
20395 unfortunate but there's no avoiding it.
20396 Similarly, for V16HImode we don't have instructions for variable
20397 shuffling, while for V32QImode we can, after preparing suitable
20398 masks, use vpshufb; vpshufb; vpermq; vpor. */
20400 if (mode == V16HImode)
20402 maskmode = mode = V32QImode;
20403 w = 32;
20404 e = 1;
20406 else
20408 maskmode = mode = V8SImode;
20409 w = 8;
20410 e = 4;
20412 t1 = gen_reg_rtx (maskmode);
20414 /* Replicate the low bits of the V4DImode mask into V8SImode:
20415 mask = { A B C D }
20416 t1 = { A A B B C C D D }. */
20417 for (i = 0; i < w / 2; ++i)
20418 vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
20419 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20420 vt = force_reg (maskmode, vt);
20421 mask = gen_lowpart (maskmode, mask);
20422 if (maskmode == V8SImode)
20423 emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
20424 else
20425 emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
20427 /* Multiply the shuffle indices by two. */
20428 t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
20429 OPTAB_DIRECT);
20431 /* Add one to the odd shuffle indices:
20432 t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
20433 for (i = 0; i < w / 2; ++i)
20435 vec[i * 2] = const0_rtx;
20436 vec[i * 2 + 1] = const1_rtx;
20438 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20439 vt = validize_mem (force_const_mem (maskmode, vt));
20440 t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
20441 OPTAB_DIRECT);
20443 /* Continue as if V8SImode (resp. V32QImode) was used initially. */
20444 operands[3] = mask = t1;
20445 target = gen_lowpart (mode, target);
20446 op0 = gen_lowpart (mode, op0);
20447 op1 = gen_lowpart (mode, op1);
20450 switch (mode)
20452 case V8SImode:
20453 /* The VPERMD and VPERMPS instructions already properly ignore
20454 the high bits of the shuffle elements. No need for us to
20455 perform an AND ourselves. */
20456 if (one_operand_shuffle)
20457 emit_insn (gen_avx2_permvarv8si (target, op0, mask));
20458 else
20460 t1 = gen_reg_rtx (V8SImode);
20461 t2 = gen_reg_rtx (V8SImode);
20462 emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
20463 emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
20464 goto merge_two;
20466 return;
20468 case V8SFmode:
20469 mask = gen_lowpart (V8SImode, mask);
20470 if (one_operand_shuffle)
20471 emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
20472 else
20474 t1 = gen_reg_rtx (V8SFmode);
20475 t2 = gen_reg_rtx (V8SFmode);
20476 emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
20477 emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
20478 goto merge_two;
20480 return;
20482 case V4SImode:
20483 /* By combining the two 128-bit input vectors into one 256-bit
20484 input vector, we can use VPERMD and VPERMPS for the full
20485 two-operand shuffle. */
20486 t1 = gen_reg_rtx (V8SImode);
20487 t2 = gen_reg_rtx (V8SImode);
20488 emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
20489 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20490 emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
20491 emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
20492 return;
20494 case V4SFmode:
20495 t1 = gen_reg_rtx (V8SFmode);
20496 t2 = gen_reg_rtx (V8SImode);
20497 mask = gen_lowpart (V4SImode, mask);
20498 emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
20499 emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
20500 emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
20501 emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
20502 return;
20504 case V32QImode:
20505 t1 = gen_reg_rtx (V32QImode);
20506 t2 = gen_reg_rtx (V32QImode);
20507 t3 = gen_reg_rtx (V32QImode);
20508 vt2 = GEN_INT (-128);
20509 for (i = 0; i < 32; i++)
20510 vec[i] = vt2;
20511 vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20512 vt = force_reg (V32QImode, vt);
20513 for (i = 0; i < 32; i++)
20514 vec[i] = i < 16 ? vt2 : const0_rtx;
20515 vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
20516 vt2 = force_reg (V32QImode, vt2);
20517 /* From mask create two adjusted masks, which contain the same
20518 bits as mask in the low 7 bits of each vector element.
20519 The first mask will have the most significant bit clear
20520 if it requests element from the same 128-bit lane
20521 and MSB set if it requests element from the other 128-bit lane.
20522 The second mask will have the opposite values of the MSB,
20523 and additionally will have its 128-bit lanes swapped.
20524 E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
20525 t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
20526 t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
20527 stands for other 12 bytes. */
20528 /* The bit that says whether an element is from the same lane or the
20529 other lane is bit 4, so shift it up by 3 to the MSB position. */
20530 emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
20531 gen_lowpart (V4DImode, mask),
20532 GEN_INT (3)));
20533 /* Clear MSB bits from the mask just in case it had them set. */
20534 emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
20535 /* After this t1 will have MSB set for elements from other lane. */
20536 emit_insn (gen_xorv32qi3 (t1, t1, vt2));
20537 /* Clear bits other than MSB. */
20538 emit_insn (gen_andv32qi3 (t1, t1, vt));
20539 /* Or in the lower bits from mask into t3. */
20540 emit_insn (gen_iorv32qi3 (t3, t1, t2));
20541 /* And invert MSB bits in t1, so MSB is set for elements from the same
20542 lane. */
20543 emit_insn (gen_xorv32qi3 (t1, t1, vt));
20544 /* Swap 128-bit lanes in t3. */
20545 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20546 gen_lowpart (V4DImode, t3),
20547 const2_rtx, GEN_INT (3),
20548 const0_rtx, const1_rtx));
20549 /* And or in the lower bits from mask into t1. */
20550 emit_insn (gen_iorv32qi3 (t1, t1, t2));
20551 if (one_operand_shuffle)
20553 /* Each of these shuffles will put 0s in places where an
20554 element from the other 128-bit lane is needed; otherwise it
20555 will shuffle in the requested value. */
20556 emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
20557 emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
20558 /* For t3 the 128-bit lanes are swapped again. */
20559 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20560 gen_lowpart (V4DImode, t3),
20561 const2_rtx, GEN_INT (3),
20562 const0_rtx, const1_rtx));
20563 /* And oring both together leads to the result. */
20564 emit_insn (gen_iorv32qi3 (target, t1, t3));
20565 return;
20568 t4 = gen_reg_rtx (V32QImode);
20569 /* Similar to the one_operand_shuffle code above,
20570 just repeated twice, once for each operand. The merge_two:
20571 code will merge the two results together. */
20572 emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
20573 emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
20574 emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
20575 emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
20576 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
20577 gen_lowpart (V4DImode, t4),
20578 const2_rtx, GEN_INT (3),
20579 const0_rtx, const1_rtx));
20580 emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
20581 gen_lowpart (V4DImode, t3),
20582 const2_rtx, GEN_INT (3),
20583 const0_rtx, const1_rtx));
20584 emit_insn (gen_iorv32qi3 (t4, t2, t4));
20585 emit_insn (gen_iorv32qi3 (t3, t1, t3));
20586 t1 = t4;
20587 t2 = t3;
20588 goto merge_two;
20590 default:
20591 gcc_assert (GET_MODE_SIZE (mode) <= 16);
20592 break;
20596 if (TARGET_XOP)
20598 /* The XOP VPPERM insn supports three inputs. By ignoring the
20599 one_operand_shuffle special case, we avoid creating another
20600 set of constant vectors in memory. */
20601 one_operand_shuffle = false;
20603 /* mask = mask & {2*w-1, ...} */
20604 vt = GEN_INT (2*w - 1);
20606 else
20608 /* mask = mask & {w-1, ...} */
20609 vt = GEN_INT (w - 1);
20612 for (i = 0; i < w; i++)
20613 vec[i] = vt;
20614 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20615 mask = expand_simple_binop (maskmode, AND, mask, vt,
20616 NULL_RTX, 0, OPTAB_DIRECT);
20618 /* For non-QImode operations, convert the word permutation control
20619 into a byte permutation control. */
20620 if (mode != V16QImode)
20622 mask = expand_simple_binop (maskmode, ASHIFT, mask,
20623 GEN_INT (exact_log2 (e)),
20624 NULL_RTX, 0, OPTAB_DIRECT);
20626 /* Convert mask to vector of chars. */
20627 mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
20629 /* Replicate each of the input bytes into byte positions:
20630 (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
20631 (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
20632 (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
20633 for (i = 0; i < 16; ++i)
20634 vec[i] = GEN_INT (i/e * e);
20635 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20636 vt = validize_mem (force_const_mem (V16QImode, vt));
20637 if (TARGET_XOP)
20638 emit_insn (gen_xop_pperm (mask, mask, mask, vt));
20639 else
20640 emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
20642 /* Convert it into the byte positions by doing
20643 mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
20644 for (i = 0; i < 16; ++i)
20645 vec[i] = GEN_INT (i % e);
20646 vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
20647 vt = validize_mem (force_const_mem (V16QImode, vt));
20648 emit_insn (gen_addv16qi3 (mask, mask, vt));
20651 /* The actual shuffle operations all operate on V16QImode. */
20652 op0 = gen_lowpart (V16QImode, op0);
20653 op1 = gen_lowpart (V16QImode, op1);
20654 target = gen_lowpart (V16QImode, target);
20656 if (TARGET_XOP)
20658 emit_insn (gen_xop_pperm (target, op0, op1, mask));
20660 else if (one_operand_shuffle)
20662 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
20664 else
20666 rtx xops[6];
20667 bool ok;
20669 /* Shuffle the two input vectors independently. */
20670 t1 = gen_reg_rtx (V16QImode);
20671 t2 = gen_reg_rtx (V16QImode);
20672 emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
20673 emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
20675 merge_two:
20676 /* Then merge them together. The key is whether any given control
20677 element contained a bit set that indicates the second word. */
20678 mask = operands[3];
20679 vt = GEN_INT (w);
20680 if (maskmode == V2DImode && !TARGET_SSE4_1)
20682 /* Without SSE4.1, we don't have V2DImode EQ. Perform one
20683 more shuffle to convert the V2DI input mask into a V4SI
20684 input mask, at which point the masking that expand_int_vcond
20685 performs will work as desired. */
20686 rtx t3 = gen_reg_rtx (V4SImode);
20687 emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
20688 const0_rtx, const0_rtx,
20689 const2_rtx, const2_rtx));
20690 mask = t3;
20691 maskmode = V4SImode;
20692 e = w = 4;
20695 for (i = 0; i < w; i++)
20696 vec[i] = vt;
20697 vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
20698 vt = force_reg (maskmode, vt);
20699 mask = expand_simple_binop (maskmode, AND, mask, vt,
20700 NULL_RTX, 0, OPTAB_DIRECT);
20702 xops[0] = gen_lowpart (mode, operands[0]);
20703 xops[1] = gen_lowpart (mode, t2);
20704 xops[2] = gen_lowpart (mode, t1);
20705 xops[3] = gen_rtx_EQ (maskmode, mask, vt);
20706 xops[4] = mask;
20707 xops[5] = vt;
20708 ok = ix86_expand_int_vcond (xops);
20709 gcc_assert (ok);
20713 /* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
20714 true if we should do zero extension, else sign extension. HIGH_P is
20715 true if we want the N/2 high elements, else the low elements. */
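/* (Added note, informal, on the pre-SSE4.1 path below: interleaving SRC
   with a zero vector zero-extends each element, while interleaving SRC
   with the mask (0 > SRC), i.e. all-ones for negative elements, sign
   extends it; HIGH_P just selects the high or low interleave.)  */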
20717 void
20718 ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
20720 enum machine_mode imode = GET_MODE (src);
20721 rtx tmp;
20723 if (TARGET_SSE4_1)
20725 rtx (*unpack)(rtx, rtx);
20726 rtx (*extract)(rtx, rtx) = NULL;
20727 enum machine_mode halfmode = BLKmode;
20729 switch (imode)
20731 case V32QImode:
20732 if (unsigned_p)
20733 unpack = gen_avx2_zero_extendv16qiv16hi2;
20734 else
20735 unpack = gen_avx2_sign_extendv16qiv16hi2;
20736 halfmode = V16QImode;
20737 extract
20738 = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
20739 break;
20740 case V16HImode:
20741 if (unsigned_p)
20742 unpack = gen_avx2_zero_extendv8hiv8si2;
20743 else
20744 unpack = gen_avx2_sign_extendv8hiv8si2;
20745 halfmode = V8HImode;
20746 extract
20747 = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
20748 break;
20749 case V8SImode:
20750 if (unsigned_p)
20751 unpack = gen_avx2_zero_extendv4siv4di2;
20752 else
20753 unpack = gen_avx2_sign_extendv4siv4di2;
20754 halfmode = V4SImode;
20755 extract
20756 = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
20757 break;
20758 case V16QImode:
20759 if (unsigned_p)
20760 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
20761 else
20762 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
20763 break;
20764 case V8HImode:
20765 if (unsigned_p)
20766 unpack = gen_sse4_1_zero_extendv4hiv4si2;
20767 else
20768 unpack = gen_sse4_1_sign_extendv4hiv4si2;
20769 break;
20770 case V4SImode:
20771 if (unsigned_p)
20772 unpack = gen_sse4_1_zero_extendv2siv2di2;
20773 else
20774 unpack = gen_sse4_1_sign_extendv2siv2di2;
20775 break;
20776 default:
20777 gcc_unreachable ();
20780 if (GET_MODE_SIZE (imode) == 32)
20782 tmp = gen_reg_rtx (halfmode);
20783 emit_insn (extract (tmp, src));
20785 else if (high_p)
20787 /* Shift higher 8 bytes to lower 8 bytes. */
20788 tmp = gen_reg_rtx (imode);
20789 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
20790 gen_lowpart (V1TImode, src),
20791 GEN_INT (64)));
20793 else
20794 tmp = src;
20796 emit_insn (unpack (dest, tmp));
20798 else
20800 rtx (*unpack)(rtx, rtx, rtx);
20802 switch (imode)
20804 case V16QImode:
20805 if (high_p)
20806 unpack = gen_vec_interleave_highv16qi;
20807 else
20808 unpack = gen_vec_interleave_lowv16qi;
20809 break;
20810 case V8HImode:
20811 if (high_p)
20812 unpack = gen_vec_interleave_highv8hi;
20813 else
20814 unpack = gen_vec_interleave_lowv8hi;
20815 break;
20816 case V4SImode:
20817 if (high_p)
20818 unpack = gen_vec_interleave_highv4si;
20819 else
20820 unpack = gen_vec_interleave_lowv4si;
20821 break;
20822 default:
20823 gcc_unreachable ();
20826 if (unsigned_p)
20827 tmp = force_reg (imode, CONST0_RTX (imode));
20828 else
20829 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
20830 src, pc_rtx, pc_rtx);
20832 emit_insn (unpack (gen_lowpart (imode, dest), src, tmp));
20836 /* Expand conditional increment or decrement using adc/sbb instructions.
20837 The default case using setcc followed by the conditional move can be
20838 done by generic code. */
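/* (Added note, informal: once the compare leaves the condition, or its
   reverse, in the carry flag, "adc $0, r" computes r + CF and
   "sbb $0, r" computes r - CF, so a conditional increment or decrement
   needs no setcc and no branch.)  */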
20839 bool
20840 ix86_expand_int_addcc (rtx operands[])
20842 enum rtx_code code = GET_CODE (operands[1]);
20843 rtx flags;
20844 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
20845 rtx compare_op;
20846 rtx val = const0_rtx;
20847 bool fpcmp = false;
20848 enum machine_mode mode;
20849 rtx op0 = XEXP (operands[1], 0);
20850 rtx op1 = XEXP (operands[1], 1);
20852 if (operands[3] != const1_rtx
20853 && operands[3] != constm1_rtx)
20854 return false;
20855 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
20856 return false;
20857 code = GET_CODE (compare_op);
20859 flags = XEXP (compare_op, 0);
20861 if (GET_MODE (flags) == CCFPmode
20862 || GET_MODE (flags) == CCFPUmode)
20864 fpcmp = true;
20865 code = ix86_fp_compare_code_to_integer (code);
20868 if (code != LTU)
20870 val = constm1_rtx;
20871 if (fpcmp)
20872 PUT_CODE (compare_op,
20873 reverse_condition_maybe_unordered
20874 (GET_CODE (compare_op)));
20875 else
20876 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
20879 mode = GET_MODE (operands[0]);
20881 /* Construct either adc or sbb insn. */
20882 if ((code == LTU) == (operands[3] == constm1_rtx))
20884 switch (mode)
20886 case QImode:
20887 insn = gen_subqi3_carry;
20888 break;
20889 case HImode:
20890 insn = gen_subhi3_carry;
20891 break;
20892 case SImode:
20893 insn = gen_subsi3_carry;
20894 break;
20895 case DImode:
20896 insn = gen_subdi3_carry;
20897 break;
20898 default:
20899 gcc_unreachable ();
20902 else
20904 switch (mode)
20906 case QImode:
20907 insn = gen_addqi3_carry;
20908 break;
20909 case HImode:
20910 insn = gen_addhi3_carry;
20911 break;
20912 case SImode:
20913 insn = gen_addsi3_carry;
20914 break;
20915 case DImode:
20916 insn = gen_adddi3_carry;
20917 break;
20918 default:
20919 gcc_unreachable ();
20922 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
20924 return true;
20928 /* Split OPERAND into half-mode parts. Similar to split_double_mode,
20929 but works for floating-point parameters and non-offsettable memories.
20930 For pushes, it returns just stack offsets; the values will be saved
20931 in the right order. At most four parts are generated. */
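/* (Added example, informal: on a 32-bit target a DFmode value splits into
   two SImode parts, XFmode into three and TFmode into four; on a 64-bit
   target XFmode splits into a DImode part plus an SImode part and TFmode
   into two DImode parts, matching the size computation below.)  */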
20933 static int
20934 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
20936 int size;
20938 if (!TARGET_64BIT)
20939 size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
20940 else
20941 size = (GET_MODE_SIZE (mode) + 4) / 8;
20943 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
20944 gcc_assert (size >= 2 && size <= 4);
20946 /* Optimize constant pool reference to immediates. This is used by fp
20947 moves, which force all constants to memory to allow combining. */
20948 if (MEM_P (operand) && MEM_READONLY_P (operand))
20950 rtx tmp = maybe_get_pool_constant (operand);
20951 if (tmp)
20952 operand = tmp;
20955 if (MEM_P (operand) && !offsettable_memref_p (operand))
20957 /* The only non-offsettable memories we handle are pushes. */
20958 int ok = push_operand (operand, VOIDmode);
20960 gcc_assert (ok);
20962 operand = copy_rtx (operand);
20963 PUT_MODE (operand, word_mode);
20964 parts[0] = parts[1] = parts[2] = parts[3] = operand;
20965 return size;
20968 if (GET_CODE (operand) == CONST_VECTOR)
20970 enum machine_mode imode = int_mode_for_mode (mode);
20971 /* Caution: if we looked through a constant pool memory above,
20972 the operand may actually have a different mode now. That's
20973 ok, since we want to pun this all the way back to an integer. */
20974 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
20975 gcc_assert (operand != NULL);
20976 mode = imode;
20979 if (!TARGET_64BIT)
20981 if (mode == DImode)
20982 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
20983 else
20985 int i;
20987 if (REG_P (operand))
20989 gcc_assert (reload_completed);
20990 for (i = 0; i < size; i++)
20991 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
20993 else if (offsettable_memref_p (operand))
20995 operand = adjust_address (operand, SImode, 0);
20996 parts[0] = operand;
20997 for (i = 1; i < size; i++)
20998 parts[i] = adjust_address (operand, SImode, 4 * i);
21000 else if (GET_CODE (operand) == CONST_DOUBLE)
21002 REAL_VALUE_TYPE r;
21003 long l[4];
21005 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21006 switch (mode)
21008 case TFmode:
21009 real_to_target (l, &r, mode);
21010 parts[3] = gen_int_mode (l[3], SImode);
21011 parts[2] = gen_int_mode (l[2], SImode);
21012 break;
21013 case XFmode:
21014 /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
21015 long double may not be 80-bit. */
21016 real_to_target (l, &r, mode);
21017 parts[2] = gen_int_mode (l[2], SImode);
21018 break;
21019 case DFmode:
21020 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
21021 break;
21022 default:
21023 gcc_unreachable ();
21025 parts[1] = gen_int_mode (l[1], SImode);
21026 parts[0] = gen_int_mode (l[0], SImode);
21028 else
21029 gcc_unreachable ();
21032 else
21034 if (mode == TImode)
21035 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
21036 if (mode == XFmode || mode == TFmode)
21038 enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
21039 if (REG_P (operand))
21041 gcc_assert (reload_completed);
21042 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
21043 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
21045 else if (offsettable_memref_p (operand))
21047 operand = adjust_address (operand, DImode, 0);
21048 parts[0] = operand;
21049 parts[1] = adjust_address (operand, upper_mode, 8);
21051 else if (GET_CODE (operand) == CONST_DOUBLE)
21053 REAL_VALUE_TYPE r;
21054 long l[4];
21056 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
21057 real_to_target (l, &r, mode);
21059 /* Do not use shift by 32 to avoid warning on 32bit systems. */
21060 if (HOST_BITS_PER_WIDE_INT >= 64)
21061 parts[0]
21062 = gen_int_mode
21063 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
21064 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
21065 DImode);
21066 else
21067 parts[0] = immed_double_const (l[0], l[1], DImode);
21069 if (upper_mode == SImode)
21070 parts[1] = gen_int_mode (l[2], SImode);
21071 else if (HOST_BITS_PER_WIDE_INT >= 64)
21072 parts[1]
21073 = gen_int_mode
21074 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
21075 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
21076 DImode);
21077 else
21078 parts[1] = immed_double_const (l[2], l[3], DImode);
21080 else
21081 gcc_unreachable ();
21085 return size;
21088 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
21089 The function itself fills operands 2-5 with the destination parts and
21090 operands 6-9 with the corresponding source parts, in the order in
21091 which the part-by-part moves are finally emitted.
21093 void
21094 ix86_split_long_move (rtx operands[])
21096 rtx part[2][4];
21097 int nparts, i, j;
21098 int push = 0;
21099 int collisions = 0;
21100 enum machine_mode mode = GET_MODE (operands[0]);
21101 bool collisionparts[4];
21103 /* The DFmode expanders may ask us to move a double.
21104 For a 64-bit target this is a single move. By hiding that fact
21105 here we simplify the i386.md splitters. */
21106 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
21108 /* Optimize constant pool reference to immediates. This is used by
21109 fp moves that force all constants to memory to allow combining. */
21111 if (MEM_P (operands[1])
21112 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
21113 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
21114 operands[1] = get_pool_constant (XEXP (operands[1], 0));
21115 if (push_operand (operands[0], VOIDmode))
21117 operands[0] = copy_rtx (operands[0]);
21118 PUT_MODE (operands[0], word_mode);
21120 else
21121 operands[0] = gen_lowpart (DImode, operands[0]);
21122 operands[1] = gen_lowpart (DImode, operands[1]);
21123 emit_move_insn (operands[0], operands[1]);
21124 return;
21127 /* The only non-offsettable memory we handle is push. */
21128 if (push_operand (operands[0], VOIDmode))
21129 push = 1;
21130 else
21131 gcc_assert (!MEM_P (operands[0])
21132 || offsettable_memref_p (operands[0]));
21134 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
21135 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
21137 /* When emitting a push, take care of source operands on the stack. */
21138 if (push && MEM_P (operands[1])
21139 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
21141 rtx src_base = XEXP (part[1][nparts - 1], 0);
21143 /* Compensate for the stack decrement by 4. */
21144 if (!TARGET_64BIT && nparts == 3
21145 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
21146 src_base = plus_constant (Pmode, src_base, 4);
21148 /* src_base refers to the stack pointer and is
21149 automatically decreased by emitted push. */
21150 for (i = 0; i < nparts; i++)
21151 part[1][i] = change_address (part[1][i],
21152 GET_MODE (part[1][i]), src_base);
21155 /* We need to do the copy in the right order in case an address register
21156 of the source overlaps the destination. */
21157 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
21159 rtx tmp;
21161 for (i = 0; i < nparts; i++)
21163 collisionparts[i]
21164 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
21165 if (collisionparts[i])
21166 collisions++;
21169 /* Collision in the middle part can be handled by reordering. */
21170 if (collisions == 1 && nparts == 3 && collisionparts [1])
21172 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21173 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21175 else if (collisions == 1
21176 && nparts == 4
21177 && (collisionparts [1] || collisionparts [2]))
21179 if (collisionparts [1])
21181 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
21182 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
21184 else
21186 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
21187 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
21191 /* If there are more collisions, we can't handle it by reordering.
21192 Do an lea to the last part and use only one colliding move. */
21193 else if (collisions > 1)
21195 rtx base;
21197 collisions = 1;
21199 base = part[0][nparts - 1];
21201 /* Handle the case when the last part isn't valid for lea.
21202 Happens in 64-bit mode storing the 12-byte XFmode. */
21203 if (GET_MODE (base) != Pmode)
21204 base = gen_rtx_REG (Pmode, REGNO (base));
21206 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
21207 part[1][0] = replace_equiv_address (part[1][0], base);
21208 for (i = 1; i < nparts; i++)
21210 tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
21211 part[1][i] = replace_equiv_address (part[1][i], tmp);
21216 if (push)
21218 if (!TARGET_64BIT)
21220 if (nparts == 3)
21222 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
21223 emit_insn (ix86_gen_add3 (stack_pointer_rtx,
21224 stack_pointer_rtx, GEN_INT (-4)));
21225 emit_move_insn (part[0][2], part[1][2]);
21227 else if (nparts == 4)
21229 emit_move_insn (part[0][3], part[1][3]);
21230 emit_move_insn (part[0][2], part[1][2]);
21233 else
21235 /* In 64-bit mode we don't have a 32-bit push available. If this is a
21236 register, that is OK - we just use the larger counterpart. We also
21237 retype the memory - this comes from an attempt to avoid a REX prefix
21238 on moving the second half of a TFmode value. */
21239 if (GET_MODE (part[1][1]) == SImode)
21241 switch (GET_CODE (part[1][1]))
21243 case MEM:
21244 part[1][1] = adjust_address (part[1][1], DImode, 0);
21245 break;
21247 case REG:
21248 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
21249 break;
21251 default:
21252 gcc_unreachable ();
21255 if (GET_MODE (part[1][0]) == SImode)
21256 part[1][0] = part[1][1];
21259 emit_move_insn (part[0][1], part[1][1]);
21260 emit_move_insn (part[0][0], part[1][0]);
21261 return;
21264 /* Choose correct order to not overwrite the source before it is copied. */
21265 if ((REG_P (part[0][0])
21266 && REG_P (part[1][1])
21267 && (REGNO (part[0][0]) == REGNO (part[1][1])
21268 || (nparts == 3
21269 && REGNO (part[0][0]) == REGNO (part[1][2]))
21270 || (nparts == 4
21271 && REGNO (part[0][0]) == REGNO (part[1][3]))))
21272 || (collisions > 0
21273 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
21275 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
21277 operands[2 + i] = part[0][j];
21278 operands[6 + i] = part[1][j];
21281 else
21283 for (i = 0; i < nparts; i++)
21285 operands[2 + i] = part[0][i];
21286 operands[6 + i] = part[1][i];
21290 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
21291 if (optimize_insn_for_size_p ())
21293 for (j = 0; j < nparts - 1; j++)
21294 if (CONST_INT_P (operands[6 + j])
21295 && operands[6 + j] != const0_rtx
21296 && REG_P (operands[2 + j]))
21297 for (i = j; i < nparts - 1; i++)
21298 if (CONST_INT_P (operands[7 + i])
21299 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
21300 operands[7 + i] = operands[2 + j];
21303 for (i = 0; i < nparts; i++)
21304 emit_move_insn (operands[2 + i], operands[6 + i]);
21306 return;
21309 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
21310 left shift by a constant, either using a single shift or
21311 a sequence of add instructions. */
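/* For example (illustrative only): a shift left by 1 is always emitted
   as a single add of the operand to itself, and a small constant count
   such as 2 may become two such adds when, according to the cost
   tables, two adds cost no more than one constant shift and we are not
   optimizing for size.  */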
21313 static void
21314 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
21316 rtx (*insn)(rtx, rtx, rtx);
21318 if (count == 1
21319 || (count * ix86_cost->add <= ix86_cost->shift_const
21320 && !optimize_insn_for_size_p ()))
21322 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
21323 while (count-- > 0)
21324 emit_insn (insn (operand, operand, operand));
21326 else
21328 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21329 emit_insn (insn (operand, operand, GEN_INT (count)));
21333 void
21334 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
21336 rtx (*gen_ashl3)(rtx, rtx, rtx);
21337 rtx (*gen_shld)(rtx, rtx, rtx);
21338 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21340 rtx low[2], high[2];
21341 int count;
21343 if (CONST_INT_P (operands[2]))
21345 split_double_mode (mode, operands, 2, low, high);
21346 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21348 if (count >= half_width)
21350 emit_move_insn (high[0], low[1]);
21351 emit_move_insn (low[0], const0_rtx);
21353 if (count > half_width)
21354 ix86_expand_ashl_const (high[0], count - half_width, mode);
21356 else
21358 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21360 if (!rtx_equal_p (operands[0], operands[1]))
21361 emit_move_insn (operands[0], operands[1]);
21363 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
21364 ix86_expand_ashl_const (low[0], count, mode);
21366 return;
21369 split_double_mode (mode, operands, 1, low, high);
21371 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
21373 if (operands[1] == const1_rtx)
21375 /* Assuming we've chosen QImode-capable registers, 1 << N can be
21376 done with two 32/64-bit shifts, no branches, no cmoves. */
21377 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
21379 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
21381 ix86_expand_clear (low[0]);
21382 ix86_expand_clear (high[0]);
21383 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
21385 d = gen_lowpart (QImode, low[0]);
21386 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21387 s = gen_rtx_EQ (QImode, flags, const0_rtx);
21388 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21390 d = gen_lowpart (QImode, high[0]);
21391 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
21392 s = gen_rtx_NE (QImode, flags, const0_rtx);
21393 emit_insn (gen_rtx_SET (VOIDmode, d, s));
21396 /* Otherwise, we can get the same results by manually performing
21397 a bit extract operation on bit 5/6, and then performing the two
21398 shifts. The two methods of getting 0/1 into low/high are exactly
21399 the same size. Avoiding the shift in the bit extract case helps
21400 pentium4 a bit; no one else seems to care much either way. */
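/* Illustrative example of the sequence emitted below: for a DImode
   shift on a 32-bit target (bits == 5) with a variable count of, say,
   40, bit 5 of the count is set, so high becomes 1 and low becomes 0;
   the two final shifts by the count (which the hardware masks to
   5 bits) then leave low at 0 and move the 1 in high up by
   40 & 31 == 8 positions, i.e. exactly 1 << 40 across the pair.  */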
21401 else
21403 enum machine_mode half_mode;
21404 rtx (*gen_lshr3)(rtx, rtx, rtx);
21405 rtx (*gen_and3)(rtx, rtx, rtx);
21406 rtx (*gen_xor3)(rtx, rtx, rtx);
21407 HOST_WIDE_INT bits;
21408 rtx x;
21410 if (mode == DImode)
21412 half_mode = SImode;
21413 gen_lshr3 = gen_lshrsi3;
21414 gen_and3 = gen_andsi3;
21415 gen_xor3 = gen_xorsi3;
21416 bits = 5;
21418 else
21420 half_mode = DImode;
21421 gen_lshr3 = gen_lshrdi3;
21422 gen_and3 = gen_anddi3;
21423 gen_xor3 = gen_xordi3;
21424 bits = 6;
21427 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
21428 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
21429 else
21430 x = gen_lowpart (half_mode, operands[2]);
21431 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
21433 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
21434 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
21435 emit_move_insn (low[0], high[0]);
21436 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
21439 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21440 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
21441 return;
21444 if (operands[1] == constm1_rtx)
21446 /* For -1 << N, we can avoid the shld instruction, because we
21447 know that we're shifting 0...31/63 ones into a -1. */
21448 emit_move_insn (low[0], constm1_rtx);
21449 if (optimize_insn_for_size_p ())
21450 emit_move_insn (high[0], low[0]);
21451 else
21452 emit_move_insn (high[0], constm1_rtx);
21454 else
21456 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
21458 if (!rtx_equal_p (operands[0], operands[1]))
21459 emit_move_insn (operands[0], operands[1]);
21461 split_double_mode (mode, operands, 1, low, high);
21462 emit_insn (gen_shld (high[0], low[0], operands[2]));
21465 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
21467 if (TARGET_CMOVE && scratch)
21469 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21470 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21472 ix86_expand_clear (scratch);
21473 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
21475 else
21477 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21478 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21480 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
21484 void
21485 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
21487 rtx (*gen_ashr3)(rtx, rtx, rtx)
21488 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
21489 rtx (*gen_shrd)(rtx, rtx, rtx);
21490 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21492 rtx low[2], high[2];
21493 int count;
21495 if (CONST_INT_P (operands[2]))
21497 split_double_mode (mode, operands, 2, low, high);
21498 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21500 if (count == GET_MODE_BITSIZE (mode) - 1)
21502 emit_move_insn (high[0], high[1]);
21503 emit_insn (gen_ashr3 (high[0], high[0],
21504 GEN_INT (half_width - 1)));
21505 emit_move_insn (low[0], high[0]);
21508 else if (count >= half_width)
21510 emit_move_insn (low[0], high[1]);
21511 emit_move_insn (high[0], low[0]);
21512 emit_insn (gen_ashr3 (high[0], high[0],
21513 GEN_INT (half_width - 1)));
21515 if (count > half_width)
21516 emit_insn (gen_ashr3 (low[0], low[0],
21517 GEN_INT (count - half_width)));
21519 else
21521 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21523 if (!rtx_equal_p (operands[0], operands[1]))
21524 emit_move_insn (operands[0], operands[1]);
21526 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21527 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
21530 else
21532 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21534 if (!rtx_equal_p (operands[0], operands[1]))
21535 emit_move_insn (operands[0], operands[1]);
21537 split_double_mode (mode, operands, 1, low, high);
21539 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21540 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
21542 if (TARGET_CMOVE && scratch)
21544 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21545 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21547 emit_move_insn (scratch, high[0]);
21548 emit_insn (gen_ashr3 (scratch, scratch,
21549 GEN_INT (half_width - 1)));
21550 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21551 scratch));
21553 else
21555 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
21556 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
21558 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
21563 void
21564 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
21566 rtx (*gen_lshr3)(rtx, rtx, rtx)
21567 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
21568 rtx (*gen_shrd)(rtx, rtx, rtx);
21569 int half_width = GET_MODE_BITSIZE (mode) >> 1;
21571 rtx low[2], high[2];
21572 int count;
21574 if (CONST_INT_P (operands[2]))
21576 split_double_mode (mode, operands, 2, low, high);
21577 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
21579 if (count >= half_width)
21581 emit_move_insn (low[0], high[1]);
21582 ix86_expand_clear (high[0]);
21584 if (count > half_width)
21585 emit_insn (gen_lshr3 (low[0], low[0],
21586 GEN_INT (count - half_width)));
21588 else
21590 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21592 if (!rtx_equal_p (operands[0], operands[1]))
21593 emit_move_insn (operands[0], operands[1]);
21595 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
21596 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
21599 else
21601 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
21603 if (!rtx_equal_p (operands[0], operands[1]))
21604 emit_move_insn (operands[0], operands[1]);
21606 split_double_mode (mode, operands, 1, low, high);
21608 emit_insn (gen_shrd (low[0], high[0], operands[2]));
21609 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
21611 if (TARGET_CMOVE && scratch)
21613 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
21614 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
21616 ix86_expand_clear (scratch);
21617 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
21618 scratch));
21620 else
21622 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
21623 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
21625 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
21630 /* Predict just emitted jump instruction to be taken with probability PROB. */
21631 static void
21632 predict_jump (int prob)
21634 rtx insn = get_last_insn ();
21635 gcc_assert (JUMP_P (insn));
21636 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
21639 /* Helper function for the string operations below. Test VARIABLE
21640 whether it is aligned to VALUE bytes. If it is, jump to the label.
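/* Roughly, the emitted pattern is
       tmp = variable & value;
       if (tmp == 0) goto label;
   so any code the caller emits between this call and emit_label (label)
   runs only when the tested bit of VARIABLE is set.  */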
21641 static rtx
21642 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
21644 rtx label = gen_label_rtx ();
21645 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
21646 if (GET_MODE (variable) == DImode)
21647 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
21648 else
21649 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
21650 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
21651 1, label);
21652 if (epilogue)
21653 predict_jump (REG_BR_PROB_BASE * 50 / 100);
21654 else
21655 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21656 return label;
21659 /* Decrease COUNTREG by VALUE. */
21660 static void
21661 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
21663 rtx (*gen_add)(rtx, rtx, rtx)
21664 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
21666 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
21669 /* Zero extend possibly SImode EXP to Pmode register. */
21671 ix86_zero_extend_to_Pmode (rtx exp)
21673 return force_reg (Pmode, convert_to_mode (Pmode, exp, 1));
21676 /* Divide COUNTREG by SCALE. */
21677 static rtx
21678 scale_counter (rtx countreg, int scale)
21680 rtx sc;
21682 if (scale == 1)
21683 return countreg;
21684 if (CONST_INT_P (countreg))
21685 return GEN_INT (INTVAL (countreg) / scale);
21686 gcc_assert (REG_P (countreg));
21688 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
21689 GEN_INT (exact_log2 (scale)),
21690 NULL, 1, OPTAB_DIRECT);
21691 return sc;
21694 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
21695 DImode for constant loop counts. */
21697 static enum machine_mode
21698 counter_mode (rtx count_exp)
21700 if (GET_MODE (count_exp) != VOIDmode)
21701 return GET_MODE (count_exp);
21702 if (!CONST_INT_P (count_exp))
21703 return Pmode;
21704 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
21705 return DImode;
21706 return SImode;
21709 /* Copy the address to a Pmode register. This is used for x32 to
21710 truncate DImode TLS address to a SImode register. */
21712 static rtx
21713 ix86_copy_addr_to_reg (rtx addr)
21715 if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
21716 return copy_addr_to_reg (addr);
21717 else
21719 gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
21720 return gen_rtx_SUBREG (SImode, copy_to_mode_reg (DImode, addr), 0);
21724 /* When SRCPTR is non-NULL, output a simple loop to move memory pointed
21725 to by SRCPTR to DESTPTR via chunks of MODE, unrolled UNROLL times; the
21726 overall size is COUNT, specified in bytes. When SRCPTR is NULL, output
21727 the equivalent loop to set memory to VALUE (supposed to be in MODE).
21729 The size is rounded down to a whole number of chunks moved at once.
21730 SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info. */
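/* A rough sketch of the emitted control flow (illustrative only),
   where piece_size is GET_MODE_SIZE (mode) * unroll:

       size = count & ~(piece_size - 1);
       if (size == 0) goto out_label;      <- emitted only for piece_size 1
       iter = 0;
     top_label:
       move or set one piece_size chunk at destptr + iter;
       iter += piece_size;
       if (iter < size) goto top_label;
       destptr += iter;  srcptr += iter;   <- srcptr only when copying
     out_label:  */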
21733 static void
21734 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
21735 rtx destptr, rtx srcptr, rtx value,
21736 rtx count, enum machine_mode mode, int unroll,
21737 int expected_size)
21739 rtx out_label, top_label, iter, tmp;
21740 enum machine_mode iter_mode = counter_mode (count);
21741 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
21742 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
21743 rtx size;
21744 rtx x_addr;
21745 rtx y_addr;
21746 int i;
21748 top_label = gen_label_rtx ();
21749 out_label = gen_label_rtx ();
21750 iter = gen_reg_rtx (iter_mode);
21752 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
21753 NULL, 1, OPTAB_DIRECT);
21754 /* Those two should combine. */
21755 if (piece_size == const1_rtx)
21757 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
21758 true, out_label);
21759 predict_jump (REG_BR_PROB_BASE * 10 / 100);
21761 emit_move_insn (iter, const0_rtx);
21763 emit_label (top_label);
21765 tmp = convert_modes (Pmode, iter_mode, iter, true);
21766 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
21767 destmem = change_address (destmem, mode, x_addr);
21769 if (srcmem)
21771 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
21772 srcmem = change_address (srcmem, mode, y_addr);
21774 /* When unrolling for chips that reorder memory reads and writes,
21775 we can save registers by using a single temporary.
21776 Also, using 4 temporaries is overkill in 32-bit mode. */
21777 if (!TARGET_64BIT && 0)
21779 for (i = 0; i < unroll; i++)
21781 if (i)
21783 destmem =
21784 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21785 srcmem =
21786 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21788 emit_move_insn (destmem, srcmem);
21791 else
21793 rtx tmpreg[4];
21794 gcc_assert (unroll <= 4);
21795 for (i = 0; i < unroll; i++)
21797 tmpreg[i] = gen_reg_rtx (mode);
21798 if (i)
21800 srcmem =
21801 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
21803 emit_move_insn (tmpreg[i], srcmem);
21805 for (i = 0; i < unroll; i++)
21807 if (i)
21809 destmem =
21810 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21812 emit_move_insn (destmem, tmpreg[i]);
21816 else
21817 for (i = 0; i < unroll; i++)
21819 if (i)
21820 destmem =
21821 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
21822 emit_move_insn (destmem, value);
21825 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
21826 true, OPTAB_LIB_WIDEN);
21827 if (tmp != iter)
21828 emit_move_insn (iter, tmp);
21830 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
21831 true, top_label);
21832 if (expected_size != -1)
21834 expected_size /= GET_MODE_SIZE (mode) * unroll;
21835 if (expected_size == 0)
21836 predict_jump (0);
21837 else if (expected_size > REG_BR_PROB_BASE)
21838 predict_jump (REG_BR_PROB_BASE - 1);
21839 else
21840 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
21842 else
21843 predict_jump (REG_BR_PROB_BASE * 80 / 100);
21844 iter = ix86_zero_extend_to_Pmode (iter);
21845 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
21846 true, OPTAB_LIB_WIDEN);
21847 if (tmp != destptr)
21848 emit_move_insn (destptr, tmp);
21849 if (srcptr)
21851 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
21852 true, OPTAB_LIB_WIDEN);
21853 if (tmp != srcptr)
21854 emit_move_insn (srcptr, tmp);
21856 emit_label (out_label);
21859 /* Output a "rep; mov" instruction.
21860 Arguments have the same meaning as for the previous function. */
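/* Illustratively, for SImode this boils down to
       ecx = count / 4;  rep movsd;
   with DESTEXP and SRCEXP describing the final pointer values,
   destptr + (countreg << 2) and srcptr + (countreg << 2), which the
   rep_mov pattern records as the pointer updates.  */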
21861 static void
21862 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
21863 rtx destptr, rtx srcptr,
21864 rtx count,
21865 enum machine_mode mode)
21867 rtx destexp;
21868 rtx srcexp;
21869 rtx countreg;
21870 HOST_WIDE_INT rounded_count;
21872 /* If the size is known, it is shorter to use rep movs. */
21873 if (mode == QImode && CONST_INT_P (count)
21874 && !(INTVAL (count) & 3))
21875 mode = SImode;
21877 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21878 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21879 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
21880 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
21881 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21882 if (mode != QImode)
21884 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21885 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21886 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21887 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
21888 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21889 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
21891 else
21893 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21894 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
21896 if (CONST_INT_P (count))
21898 rounded_count = (INTVAL (count)
21899 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21900 destmem = shallow_copy_rtx (destmem);
21901 srcmem = shallow_copy_rtx (srcmem);
21902 set_mem_size (destmem, rounded_count);
21903 set_mem_size (srcmem, rounded_count);
21905 else
21907 if (MEM_SIZE_KNOWN_P (destmem))
21908 clear_mem_size (destmem);
21909 if (MEM_SIZE_KNOWN_P (srcmem))
21910 clear_mem_size (srcmem);
21912 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
21913 destexp, srcexp));
21916 /* Output a "rep; stos" instruction.
21917 Arguments have the same meaning as for the previous function. */
21918 static void
21919 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
21920 rtx count, enum machine_mode mode,
21921 rtx orig_value)
21923 rtx destexp;
21924 rtx countreg;
21925 HOST_WIDE_INT rounded_count;
21927 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
21928 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
21929 value = force_reg (mode, gen_lowpart (mode, value));
21930 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
21931 if (mode != QImode)
21933 destexp = gen_rtx_ASHIFT (Pmode, countreg,
21934 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
21935 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
21937 else
21938 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
21939 if (orig_value == const0_rtx && CONST_INT_P (count))
21941 rounded_count = (INTVAL (count)
21942 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
21943 destmem = shallow_copy_rtx (destmem);
21944 set_mem_size (destmem, rounded_count);
21946 else if (MEM_SIZE_KNOWN_P (destmem))
21947 clear_mem_size (destmem);
21948 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
21951 static void
21952 emit_strmov (rtx destmem, rtx srcmem,
21953 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
21955 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
21956 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
21957 emit_insn (gen_strmov (destptr, dest, srcptr, src));
21960 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
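/* For a constant count the tail is expanded bit by bit; e.g. (purely
   illustrative) a constant count of 13 with max_size 16 on a 64-bit
   target emits one 8-byte, one 4-byte and one 1-byte move, following
   the set bits 8 + 4 + 1.  */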
21961 static void
21962 expand_movmem_epilogue (rtx destmem, rtx srcmem,
21963 rtx destptr, rtx srcptr, rtx count, int max_size)
21965 rtx src, dest;
21966 if (CONST_INT_P (count))
21968 HOST_WIDE_INT countval = INTVAL (count);
21969 int offset = 0;
21971 if ((countval & 0x10) && max_size > 16)
21973 if (TARGET_64BIT)
21975 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21976 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
21978 else
21979 gcc_unreachable ();
21980 offset += 16;
21982 if ((countval & 0x08) && max_size > 8)
21984 if (TARGET_64BIT)
21985 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
21986 else
21988 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21989 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
21991 offset += 8;
21993 if ((countval & 0x04) && max_size > 4)
21995 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
21996 offset += 4;
21998 if ((countval & 0x02) && max_size > 2)
22000 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
22001 offset += 2;
22003 if ((countval & 0x01) && max_size > 1)
22005 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
22006 offset += 1;
22008 return;
22010 if (max_size > 8)
22012 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
22013 count, 1, OPTAB_DIRECT);
22014 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
22015 count, QImode, 1, 4);
22016 return;
22019 /* When there are stringops, we can cheaply increase dest and src pointers.
22020 Otherwise we save code size by maintaining an offset (zero is readily
22021 available from the preceding rep operation) and using x86 addressing modes.
22023 if (TARGET_SINGLE_STRINGOP)
22025 if (max_size > 4)
22027 rtx label = ix86_expand_aligntest (count, 4, true);
22028 src = change_address (srcmem, SImode, srcptr);
22029 dest = change_address (destmem, SImode, destptr);
22030 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22031 emit_label (label);
22032 LABEL_NUSES (label) = 1;
22034 if (max_size > 2)
22036 rtx label = ix86_expand_aligntest (count, 2, true);
22037 src = change_address (srcmem, HImode, srcptr);
22038 dest = change_address (destmem, HImode, destptr);
22039 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22040 emit_label (label);
22041 LABEL_NUSES (label) = 1;
22043 if (max_size > 1)
22045 rtx label = ix86_expand_aligntest (count, 1, true);
22046 src = change_address (srcmem, QImode, srcptr);
22047 dest = change_address (destmem, QImode, destptr);
22048 emit_insn (gen_strmov (destptr, dest, srcptr, src));
22049 emit_label (label);
22050 LABEL_NUSES (label) = 1;
22053 else
22055 rtx offset = force_reg (Pmode, const0_rtx);
22056 rtx tmp;
22058 if (max_size > 4)
22060 rtx label = ix86_expand_aligntest (count, 4, true);
22061 src = change_address (srcmem, SImode, srcptr);
22062 dest = change_address (destmem, SImode, destptr);
22063 emit_move_insn (dest, src);
22064 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
22065 true, OPTAB_LIB_WIDEN);
22066 if (tmp != offset)
22067 emit_move_insn (offset, tmp);
22068 emit_label (label);
22069 LABEL_NUSES (label) = 1;
22071 if (max_size > 2)
22073 rtx label = ix86_expand_aligntest (count, 2, true);
22074 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22075 src = change_address (srcmem, HImode, tmp);
22076 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22077 dest = change_address (destmem, HImode, tmp);
22078 emit_move_insn (dest, src);
22079 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
22080 true, OPTAB_LIB_WIDEN);
22081 if (tmp != offset)
22082 emit_move_insn (offset, tmp);
22083 emit_label (label);
22084 LABEL_NUSES (label) = 1;
22086 if (max_size > 1)
22088 rtx label = ix86_expand_aligntest (count, 1, true);
22089 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
22090 src = change_address (srcmem, QImode, tmp);
22091 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
22092 dest = change_address (destmem, QImode, tmp);
22093 emit_move_insn (dest, src);
22094 emit_label (label);
22095 LABEL_NUSES (label) = 1;
22100 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22101 static void
22102 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
22103 rtx count, int max_size)
22105 count =
22106 expand_simple_binop (counter_mode (count), AND, count,
22107 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
22108 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
22109 gen_lowpart (QImode, value), count, QImode,
22110 1, max_size / 2);
22113 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
22114 static void
22115 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
22117 rtx dest;
22119 if (CONST_INT_P (count))
22121 HOST_WIDE_INT countval = INTVAL (count);
22122 int offset = 0;
22124 if ((countval & 0x10) && max_size > 16)
22126 if (TARGET_64BIT)
22128 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22129 emit_insn (gen_strset (destptr, dest, value));
22130 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
22131 emit_insn (gen_strset (destptr, dest, value));
22133 else
22134 gcc_unreachable ();
22135 offset += 16;
22137 if ((countval & 0x08) && max_size > 8)
22139 if (TARGET_64BIT)
22141 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
22142 emit_insn (gen_strset (destptr, dest, value));
22144 else
22146 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22147 emit_insn (gen_strset (destptr, dest, value));
22148 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
22149 emit_insn (gen_strset (destptr, dest, value));
22151 offset += 8;
22153 if ((countval & 0x04) && max_size > 4)
22155 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
22156 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22157 offset += 4;
22159 if ((countval & 0x02) && max_size > 2)
22161 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
22162 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22163 offset += 2;
22165 if ((countval & 0x01) && max_size > 1)
22167 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
22168 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22169 offset += 1;
22171 return;
22173 if (max_size > 32)
22175 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
22176 return;
22178 if (max_size > 16)
22180 rtx label = ix86_expand_aligntest (count, 16, true);
22181 if (TARGET_64BIT)
22183 dest = change_address (destmem, DImode, destptr);
22184 emit_insn (gen_strset (destptr, dest, value));
22185 emit_insn (gen_strset (destptr, dest, value));
22187 else
22189 dest = change_address (destmem, SImode, destptr);
22190 emit_insn (gen_strset (destptr, dest, value));
22191 emit_insn (gen_strset (destptr, dest, value));
22192 emit_insn (gen_strset (destptr, dest, value));
22193 emit_insn (gen_strset (destptr, dest, value));
22195 emit_label (label);
22196 LABEL_NUSES (label) = 1;
22198 if (max_size > 8)
22200 rtx label = ix86_expand_aligntest (count, 8, true);
22201 if (TARGET_64BIT)
22203 dest = change_address (destmem, DImode, destptr);
22204 emit_insn (gen_strset (destptr, dest, value));
22206 else
22208 dest = change_address (destmem, SImode, destptr);
22209 emit_insn (gen_strset (destptr, dest, value));
22210 emit_insn (gen_strset (destptr, dest, value));
22212 emit_label (label);
22213 LABEL_NUSES (label) = 1;
22215 if (max_size > 4)
22217 rtx label = ix86_expand_aligntest (count, 4, true);
22218 dest = change_address (destmem, SImode, destptr);
22219 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
22220 emit_label (label);
22221 LABEL_NUSES (label) = 1;
22223 if (max_size > 2)
22225 rtx label = ix86_expand_aligntest (count, 2, true);
22226 dest = change_address (destmem, HImode, destptr);
22227 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
22228 emit_label (label);
22229 LABEL_NUSES (label) = 1;
22231 if (max_size > 1)
22233 rtx label = ix86_expand_aligntest (count, 1, true);
22234 dest = change_address (destmem, QImode, destptr);
22235 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
22236 emit_label (label);
22237 LABEL_NUSES (label) = 1;
22241 /* Copy enough from SRC to DEST to align DEST, known to be aligned by ALIGN,
22242 to DESIRED_ALIGNMENT. */
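/* The mechanism, roughly: for each alignment step below
   DESIRED_ALIGNMENT, use ix86_expand_aligntest on DESTPTR and, when
   the corresponding bit is set, copy a 1-, 2- or 4-byte chunk and
   decrease COUNT by the same amount, so that DEST ends up aligned to
   DESIRED_ALIGNMENT.  */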
22243 static void
22244 expand_movmem_prologue (rtx destmem, rtx srcmem,
22245 rtx destptr, rtx srcptr, rtx count,
22246 int align, int desired_alignment)
22248 if (align <= 1 && desired_alignment > 1)
22250 rtx label = ix86_expand_aligntest (destptr, 1, false);
22251 srcmem = change_address (srcmem, QImode, srcptr);
22252 destmem = change_address (destmem, QImode, destptr);
22253 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22254 ix86_adjust_counter (count, 1);
22255 emit_label (label);
22256 LABEL_NUSES (label) = 1;
22258 if (align <= 2 && desired_alignment > 2)
22260 rtx label = ix86_expand_aligntest (destptr, 2, false);
22261 srcmem = change_address (srcmem, HImode, srcptr);
22262 destmem = change_address (destmem, HImode, destptr);
22263 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22264 ix86_adjust_counter (count, 2);
22265 emit_label (label);
22266 LABEL_NUSES (label) = 1;
22268 if (align <= 4 && desired_alignment > 4)
22270 rtx label = ix86_expand_aligntest (destptr, 4, false);
22271 srcmem = change_address (srcmem, SImode, srcptr);
22272 destmem = change_address (destmem, SImode, destptr);
22273 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
22274 ix86_adjust_counter (count, 4);
22275 emit_label (label);
22276 LABEL_NUSES (label) = 1;
22278 gcc_assert (desired_alignment <= 8);
22281 /* Copy enough from SRC to DST to align DST to DESIRED_ALIGN.
22282 ALIGN_BYTES is how many bytes need to be copied. */
22283 static rtx
22284 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
22285 int desired_align, int align_bytes)
22287 rtx src = *srcp;
22288 rtx orig_dst = dst;
22289 rtx orig_src = src;
22290 int off = 0;
22291 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
22292 if (src_align_bytes >= 0)
22293 src_align_bytes = desired_align - src_align_bytes;
22294 if (align_bytes & 1)
22296 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22297 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
22298 off = 1;
22299 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22301 if (align_bytes & 2)
22303 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22304 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
22305 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22306 set_mem_align (dst, 2 * BITS_PER_UNIT);
22307 if (src_align_bytes >= 0
22308 && (src_align_bytes & 1) == (align_bytes & 1)
22309 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
22310 set_mem_align (src, 2 * BITS_PER_UNIT);
22311 off = 2;
22312 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22314 if (align_bytes & 4)
22316 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22317 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
22318 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22319 set_mem_align (dst, 4 * BITS_PER_UNIT);
22320 if (src_align_bytes >= 0)
22322 unsigned int src_align = 0;
22323 if ((src_align_bytes & 3) == (align_bytes & 3))
22324 src_align = 4;
22325 else if ((src_align_bytes & 1) == (align_bytes & 1))
22326 src_align = 2;
22327 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22328 set_mem_align (src, src_align * BITS_PER_UNIT);
22330 off = 4;
22331 emit_insn (gen_strmov (destreg, dst, srcreg, src));
22333 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22334 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
22335 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22336 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22337 if (src_align_bytes >= 0)
22339 unsigned int src_align = 0;
22340 if ((src_align_bytes & 7) == (align_bytes & 7))
22341 src_align = 8;
22342 else if ((src_align_bytes & 3) == (align_bytes & 3))
22343 src_align = 4;
22344 else if ((src_align_bytes & 1) == (align_bytes & 1))
22345 src_align = 2;
22346 if (src_align > (unsigned int) desired_align)
22347 src_align = desired_align;
22348 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
22349 set_mem_align (src, src_align * BITS_PER_UNIT);
22351 if (MEM_SIZE_KNOWN_P (orig_dst))
22352 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22353 if (MEM_SIZE_KNOWN_P (orig_src))
22354 set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
22355 *srcp = src;
22356 return dst;
22359 /* Store enough into DEST to align DEST, known to be aligned by ALIGN,
22360 to DESIRED_ALIGNMENT. */
22361 static void
22362 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
22363 int align, int desired_alignment)
22365 if (align <= 1 && desired_alignment > 1)
22367 rtx label = ix86_expand_aligntest (destptr, 1, false);
22368 destmem = change_address (destmem, QImode, destptr);
22369 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
22370 ix86_adjust_counter (count, 1);
22371 emit_label (label);
22372 LABEL_NUSES (label) = 1;
22374 if (align <= 2 && desired_alignment > 2)
22376 rtx label = ix86_expand_aligntest (destptr, 2, false);
22377 destmem = change_address (destmem, HImode, destptr);
22378 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
22379 ix86_adjust_counter (count, 2);
22380 emit_label (label);
22381 LABEL_NUSES (label) = 1;
22383 if (align <= 4 && desired_alignment > 4)
22385 rtx label = ix86_expand_aligntest (destptr, 4, false);
22386 destmem = change_address (destmem, SImode, destptr);
22387 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
22388 ix86_adjust_counter (count, 4);
22389 emit_label (label);
22390 LABEL_NUSES (label) = 1;
22392 gcc_assert (desired_alignment <= 8);
22395 /* Store enough into DST to align DST, known to be aligned by ALIGN, to
22396 DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
22397 static rtx
22398 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
22399 int desired_align, int align_bytes)
22401 int off = 0;
22402 rtx orig_dst = dst;
22403 if (align_bytes & 1)
22405 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
22406 off = 1;
22407 emit_insn (gen_strset (destreg, dst,
22408 gen_lowpart (QImode, value)));
22410 if (align_bytes & 2)
22412 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
22413 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
22414 set_mem_align (dst, 2 * BITS_PER_UNIT);
22415 off = 2;
22416 emit_insn (gen_strset (destreg, dst,
22417 gen_lowpart (HImode, value)));
22419 if (align_bytes & 4)
22421 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
22422 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
22423 set_mem_align (dst, 4 * BITS_PER_UNIT);
22424 off = 4;
22425 emit_insn (gen_strset (destreg, dst,
22426 gen_lowpart (SImode, value)));
22428 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
22429 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
22430 set_mem_align (dst, desired_align * BITS_PER_UNIT);
22431 if (MEM_SIZE_KNOWN_P (orig_dst))
22432 set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
22433 return dst;
22436 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
22437 static enum stringop_alg
22438 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
22439 int *dynamic_check, bool *noalign)
22441 const struct stringop_algs * algs;
22442 bool optimize_for_speed;
22443 /* Algorithms using the rep prefix want at least edi and ecx;
22444 additionally, memset wants eax and memcpy wants esi. Don't
22445 consider such algorithms if the user has appropriated those
22446 registers for their own purposes. */
22447 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
22448 || (memset
22449 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
22450 *noalign = false;
22452 #define ALG_USABLE_P(alg) (rep_prefix_usable \
22453 || (alg != rep_prefix_1_byte \
22454 && alg != rep_prefix_4_byte \
22455 && alg != rep_prefix_8_byte))
22456 const struct processor_costs *cost;
22458 /* Even if the string operation call is cold, we still might spend a lot
22459 of time processing large blocks. */
22460 if (optimize_function_for_size_p (cfun)
22461 || (optimize_insn_for_size_p ()
22462 && expected_size != -1 && expected_size < 256))
22463 optimize_for_speed = false;
22464 else
22465 optimize_for_speed = true;
22467 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
22469 *dynamic_check = -1;
22470 if (memset)
22471 algs = &cost->memset[TARGET_64BIT != 0];
22472 else
22473 algs = &cost->memcpy[TARGET_64BIT != 0];
22474 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
22475 return ix86_stringop_alg;
22476 /* rep; movq or rep; movl is the smallest variant. */
22477 else if (!optimize_for_speed)
22479 if (!count || (count & 3))
22480 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
22481 else
22482 return rep_prefix_usable ? rep_prefix_4_byte : loop;
22484 /* Very tiny blocks are best handled via the loop; REP is expensive to set up.
22486 else if (expected_size != -1 && expected_size < 4)
22487 return loop_1_byte;
22488 else if (expected_size != -1)
22490 unsigned int i;
22491 enum stringop_alg alg = libcall;
22492 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22494 /* We get here if the algorithms that were not libcall-based
22495 were rep-prefix based and we are unable to use rep prefixes
22496 based on global register usage. Break out of the loop and
22497 use the heuristic below. */
22498 if (algs->size[i].max == 0)
22499 break;
22500 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
22502 enum stringop_alg candidate = algs->size[i].alg;
22504 if (candidate != libcall && ALG_USABLE_P (candidate))
22505 alg = candidate;
22506 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
22507 last non-libcall inline algorithm. */
22508 if (TARGET_INLINE_ALL_STRINGOPS)
22510 /* When the current size is best copied by a libcall,
22511 but we are still forced to inline, run the heuristic below
22512 that will pick code for medium-sized blocks. */
22513 if (alg != libcall)
22514 return alg;
22515 break;
22517 else if (ALG_USABLE_P (candidate))
22519 *noalign = algs->size[i].noalign;
22520 return candidate;
22524 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
22526 /* When asked to inline the call anyway, try to pick a meaningful choice.
22527 We look for the maximal size of block that is faster to copy by hand
22528 and take blocks of at most that size, guessing that the average size
22529 will be roughly half of the block.
22531 If this turns out to be bad, we might simply specify the preferred
22532 choice in ix86_costs. */
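/* As an illustrative example: if the largest block size for which some
   non-libcall algorithm is considered faster is 4096, we recurse with
   an expected size of 2048 to pick the algorithm, and with
   -minline-stringops-dynamically also request a runtime size check at
   4096 bytes, above which the library call is used instead.  */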
22533 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22534 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
22536 int max = -1;
22537 enum stringop_alg alg;
22538 int i;
22539 bool any_alg_usable_p = true;
22541 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
22543 enum stringop_alg candidate = algs->size[i].alg;
22544 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
22546 if (candidate != libcall && candidate
22547 && ALG_USABLE_P (candidate))
22548 max = algs->size[i].max;
22550 /* If there aren't any usable algorithms, then recursing on
22551 smaller sizes isn't going to find anything. Just return the
22552 simple byte-at-a-time copy loop. */
22553 if (!any_alg_usable_p)
22555 /* Pick something reasonable. */
22556 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22557 *dynamic_check = 128;
22558 return loop_1_byte;
22560 if (max == -1)
22561 max = 4096;
22562 alg = decide_alg (count, max / 2, memset, dynamic_check, noalign);
22563 gcc_assert (*dynamic_check == -1);
22564 gcc_assert (alg != libcall);
22565 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
22566 *dynamic_check = max;
22567 return alg;
22569 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
22570 #undef ALG_USABLE_P
22573 /* Decide on alignment. We know that the operand is already aligned to ALIGN
22574 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
22575 static int
22576 decide_alignment (int align,
22577 enum stringop_alg alg,
22578 int expected_size)
22580 int desired_align = 0;
22581 switch (alg)
22583 case no_stringop:
22584 gcc_unreachable ();
22585 case loop:
22586 case unrolled_loop:
22587 desired_align = GET_MODE_SIZE (Pmode);
22588 break;
22589 case rep_prefix_8_byte:
22590 desired_align = 8;
22591 break;
22592 case rep_prefix_4_byte:
22593 /* PentiumPro has special logic that triggers for 8-byte-aligned blocks,
22594 copying a whole cacheline at once. */
22595 if (TARGET_PENTIUMPRO)
22596 desired_align = 8;
22597 else
22598 desired_align = 4;
22599 break;
22600 case rep_prefix_1_byte:
22601 /* PentiumPro has special logic that triggers for 8-byte-aligned blocks,
22602 copying a whole cacheline at once. */
22603 if (TARGET_PENTIUMPRO)
22604 desired_align = 8;
22605 else
22606 desired_align = 1;
22607 break;
22608 case loop_1_byte:
22609 desired_align = 1;
22610 break;
22611 case libcall:
22612 return 0;
22615 if (optimize_size)
22616 desired_align = 1;
22617 if (desired_align < align)
22618 desired_align = align;
22619 if (expected_size != -1 && expected_size < 4)
22620 desired_align = align;
22621 return desired_align;
22624 /* Return the smallest power of 2 greater than VAL. */
22625 static int
22626 smallest_pow2_greater_than (int val)
22628 int ret = 1;
22629 while (ret <= val)
22630 ret <<= 1;
22631 return ret;
22634 /* Expand string move (memcpy) operation. Use i386 string operations
22635 when profitable. expand_setmem contains similar code. The code
22636 depends upon architecture, block size and alignment, but always has
22637 the same overall structure:
22639 1) Prologue guard: a conditional that jumps ahead to the epilogue for small
22640 blocks that can be handled by the epilogue alone. This is faster
22641 but also needed for correctness, since the prologue assumes the block
22642 is larger than the desired alignment.
22644 Optional dynamic check for size and libcall for large
22645 blocks is emitted here too, with -minline-stringops-dynamically.
22647 2) Prologue: copy first few bytes in order to get destination
22648 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
22649 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
22650 copied. We emit either a jump tree on power of two sized
22651 blocks, or a byte loop.
22653 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
22654 with specified algorithm.
22656 4) Epilogue: code copying tail of the block that is too small to be
22657 handled by main body (or up to size guarded by prologue guard). */
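/* As a rough illustration, for rep_prefix_4_byte with a non-constant
   count the emitted code looks like:

       if (count < epilogue_size_needed) goto epilogue;
       copy a few bytes until dst reaches the desired alignment;
       rep movsd on the aligned middle part;
     epilogue:
       copy the remaining tail, fewer than epilogue_size_needed bytes.

   The other algorithms change only the main body and the sizes
   involved.  */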
22659 bool
22660 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
22661 rtx expected_align_exp, rtx expected_size_exp)
22663 rtx destreg;
22664 rtx srcreg;
22665 rtx label = NULL;
22666 rtx tmp;
22667 rtx jump_around_label = NULL;
22668 HOST_WIDE_INT align = 1;
22669 unsigned HOST_WIDE_INT count = 0;
22670 HOST_WIDE_INT expected_size = -1;
22671 int size_needed = 0, epilogue_size_needed;
22672 int desired_align = 0, align_bytes = 0;
22673 enum stringop_alg alg;
22674 int dynamic_check;
22675 bool need_zero_guard = false;
22676 bool noalign;
22678 if (CONST_INT_P (align_exp))
22679 align = INTVAL (align_exp);
22680 /* i386 can do misaligned accesses at reasonably increased cost. */
22681 if (CONST_INT_P (expected_align_exp)
22682 && INTVAL (expected_align_exp) > align)
22683 align = INTVAL (expected_align_exp);
22684 /* ALIGN is the minimum of destination and source alignment, but we care here
22685 just about destination alignment. */
22686 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
22687 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
22689 if (CONST_INT_P (count_exp))
22690 count = expected_size = INTVAL (count_exp);
22691 if (CONST_INT_P (expected_size_exp) && count == 0)
22692 expected_size = INTVAL (expected_size_exp);
22694 /* Make sure we don't need to care about overflow later on. */
22695 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
22696 return false;
22698 /* Step 0: Decide on preferred algorithm, desired alignment and
22699 size of chunks to be copied by main loop. */
22701 alg = decide_alg (count, expected_size, false, &dynamic_check, &noalign);
22702 desired_align = decide_alignment (align, alg, expected_size);
22704 if (!TARGET_ALIGN_STRINGOPS || noalign)
22705 align = desired_align;
22707 if (alg == libcall)
22708 return false;
22709 gcc_assert (alg != no_stringop);
22710 if (!count)
22711 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
22712 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
22713 srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
22714 switch (alg)
22716 case libcall:
22717 case no_stringop:
22718 gcc_unreachable ();
22719 case loop:
22720 need_zero_guard = true;
22721 size_needed = GET_MODE_SIZE (word_mode);
22722 break;
22723 case unrolled_loop:
22724 need_zero_guard = true;
22725 size_needed = GET_MODE_SIZE (word_mode) * (TARGET_64BIT ? 4 : 2);
22726 break;
22727 case rep_prefix_8_byte:
22728 size_needed = 8;
22729 break;
22730 case rep_prefix_4_byte:
22731 size_needed = 4;
22732 break;
22733 case rep_prefix_1_byte:
22734 size_needed = 1;
22735 break;
22736 case loop_1_byte:
22737 need_zero_guard = true;
22738 size_needed = 1;
22739 break;
22742 epilogue_size_needed = size_needed;
22744 /* Step 1: Prologue guard. */
22746 /* Alignment code needs count to be in a register. */
22747 if (CONST_INT_P (count_exp) && desired_align > align)
22749 if (INTVAL (count_exp) > desired_align
22750 && INTVAL (count_exp) > size_needed)
22752 align_bytes
22753 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
22754 if (align_bytes <= 0)
22755 align_bytes = 0;
22756 else
22757 align_bytes = desired_align - align_bytes;
22759 if (align_bytes == 0)
22760 count_exp = force_reg (counter_mode (count_exp), count_exp);
22762 gcc_assert (desired_align >= 1 && align >= 1);
22764 /* Ensure that alignment prologue won't copy past end of block. */
22765 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
22767 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
22768 /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
22769 Make sure it is a power of 2. */
22770 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
22772 if (count)
22774 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
22776 /* If main algorithm works on QImode, no epilogue is needed.
22777 For small sizes just don't align anything. */
22778 if (size_needed == 1)
22779 desired_align = align;
22780 else
22781 goto epilogue;
22784 else
22786 label = gen_label_rtx ();
22787 emit_cmp_and_jump_insns (count_exp,
22788 GEN_INT (epilogue_size_needed),
22789 LTU, 0, counter_mode (count_exp), 1, label);
22790 if (expected_size == -1 || expected_size < epilogue_size_needed)
22791 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22792 else
22793 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22797 /* Emit code to decide on runtime whether library call or inline should be
22798 used. */
22799 if (dynamic_check != -1)
22801 if (CONST_INT_P (count_exp))
22803 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
22805 emit_block_move_via_libcall (dst, src, count_exp, false);
22806 count_exp = const0_rtx;
22807 goto epilogue;
22810 else
22812 rtx hot_label = gen_label_rtx ();
22813 jump_around_label = gen_label_rtx ();
22814 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
22815 LEU, 0, GET_MODE (count_exp), 1, hot_label);
22816 predict_jump (REG_BR_PROB_BASE * 90 / 100);
22817 emit_block_move_via_libcall (dst, src, count_exp, false);
22818 emit_jump (jump_around_label);
22819 emit_label (hot_label);
22823 /* Step 2: Alignment prologue. */
22825 if (desired_align > align)
22827 if (align_bytes == 0)
22829 /* Except for the first move in the epilogue, we no longer know
22830 the constant offset in aliasing info. It doesn't seem worth
22831 the pain to maintain it for the first move, so throw away
22832 the info early. */
22833 src = change_address (src, BLKmode, srcreg);
22834 dst = change_address (dst, BLKmode, destreg);
22835 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
22836 desired_align);
22838 else
22840 /* If we know how many bytes need to be stored before dst is
22841 sufficiently aligned, maintain aliasing info accurately. */
22842 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
22843 desired_align, align_bytes);
22844 count_exp = plus_constant (counter_mode (count_exp),
22845 count_exp, -align_bytes);
22846 count -= align_bytes;
22848 if (need_zero_guard
22849 && (count < (unsigned HOST_WIDE_INT) size_needed
22850 || (align_bytes == 0
22851 && count < ((unsigned HOST_WIDE_INT) size_needed
22852 + desired_align - align))))
22854 /* It is possible that we copied enough so the main loop will not
22855 execute. */
22856 gcc_assert (size_needed > 1);
22857 if (label == NULL_RTX)
22858 label = gen_label_rtx ();
22859 emit_cmp_and_jump_insns (count_exp,
22860 GEN_INT (size_needed),
22861 LTU, 0, counter_mode (count_exp), 1, label);
22862 if (expected_size == -1
22863 || expected_size < (desired_align - align) / 2 + size_needed)
22864 predict_jump (REG_BR_PROB_BASE * 20 / 100);
22865 else
22866 predict_jump (REG_BR_PROB_BASE * 60 / 100);
22869 if (label && size_needed == 1)
22871 emit_label (label);
22872 LABEL_NUSES (label) = 1;
22873 label = NULL;
22874 epilogue_size_needed = 1;
22876 else if (label == NULL_RTX)
22877 epilogue_size_needed = size_needed;
22879 /* Step 3: Main loop. */
22881 switch (alg)
22883 case libcall:
22884 case no_stringop:
22885 gcc_unreachable ();
22886 case loop_1_byte:
22887 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22888 count_exp, QImode, 1, expected_size);
22889 break;
22890 case loop:
22891 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22892 count_exp, word_mode, 1, expected_size);
22893 break;
22894 case unrolled_loop:
22895 /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
22896 registers for 4 temporaries anyway. */
22897 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
22898 count_exp, word_mode, TARGET_64BIT ? 4 : 2,
22899 expected_size);
22900 break;
22901 case rep_prefix_8_byte:
22902 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22903 DImode);
22904 break;
22905 case rep_prefix_4_byte:
22906 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22907 SImode);
22908 break;
22909 case rep_prefix_1_byte:
22910 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
22911 QImode);
22912 break;
22914 /* Properly adjust the offsets of src and dest memory for aliasing. */
22915 if (CONST_INT_P (count_exp))
22917 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
22918 (count / size_needed) * size_needed);
22919 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
22920 (count / size_needed) * size_needed);
22922 else
22924 src = change_address (src, BLKmode, srcreg);
22925 dst = change_address (dst, BLKmode, destreg);
22928 /* Step 4: Epilogue to copy the remaining bytes. */
22929 epilogue:
22930 if (label)
22932 /* When the main loop is done, COUNT_EXP might hold original count,
22933 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
22934 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
22935 bytes. Compensate if needed. */
22937 if (size_needed < epilogue_size_needed)
22939 tmp =
22940 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
22941 GEN_INT (size_needed - 1), count_exp, 1,
22942 OPTAB_DIRECT);
22943 if (tmp != count_exp)
22944 emit_move_insn (count_exp, tmp);
22946 emit_label (label);
22947 LABEL_NUSES (label) = 1;
22950 if (count_exp != const0_rtx && epilogue_size_needed > 1)
22951 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
22952 epilogue_size_needed);
22953 if (jump_around_label)
22954 emit_label (jump_around_label);
22955 return true;
22958 /* Helper function for memset. For a QImode value 0xXY produce
22959 0xXYXYXYXY of the width specified by MODE. This is essentially
22960 a * 0x01010101, but we can do slightly better than
22961 synth_mult by unwinding the sequence by hand on CPUs with
22962 a slow multiply. */
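/* For example, with VAL = 0xAB and MODE = SImode the constant path below
   computes
       v = 0xAB;  v |= v << 8;    -> v = 0xABAB
                  v |= v << 16;   -> v = 0xABABABAB
   and for DImode one further 32-bit shift-or yields 0xABABABABABABABAB.
   For a non-constant VAL the same value is built either by multiplying by
   the promoted constant 0x01010101 (0x0101010101010101 for DImode) or by
   the shift/or sequence, whichever the cost check below finds cheaper.  */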
22963 static rtx
22964 promote_duplicated_reg (enum machine_mode mode, rtx val)
22966 enum machine_mode valmode = GET_MODE (val);
22967 rtx tmp;
22968 int nops = mode == DImode ? 3 : 2;
22970 gcc_assert (mode == SImode || mode == DImode);
22971 if (val == const0_rtx)
22972 return copy_to_mode_reg (mode, const0_rtx);
22973 if (CONST_INT_P (val))
22975 HOST_WIDE_INT v = INTVAL (val) & 255;
22977 v |= v << 8;
22978 v |= v << 16;
22979 if (mode == DImode)
22980 v |= (v << 16) << 16;
22981 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
22984 if (valmode == VOIDmode)
22985 valmode = QImode;
22986 if (valmode != QImode)
22987 val = gen_lowpart (QImode, val);
22988 if (mode == QImode)
22989 return val;
22990 if (!TARGET_PARTIAL_REG_STALL)
22991 nops--;
22992 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
22993 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
22994 <= (ix86_cost->shift_const + ix86_cost->add) * nops
22995 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
22997 rtx reg = convert_modes (mode, QImode, val, true);
22998 tmp = promote_duplicated_reg (mode, const1_rtx);
22999 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
23000 OPTAB_DIRECT);
23002 else
23004 rtx reg = convert_modes (mode, QImode, val, true);
23006 if (!TARGET_PARTIAL_REG_STALL)
23007 if (mode == SImode)
23008 emit_insn (gen_movsi_insv_1 (reg, reg));
23009 else
23010 emit_insn (gen_movdi_insv_1 (reg, reg));
23011 else
23013 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
23014 NULL, 1, OPTAB_DIRECT);
23015 reg =
23016 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23018 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
23019 NULL, 1, OPTAB_DIRECT);
23020 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23021 if (mode == SImode)
23022 return reg;
23023 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
23024 NULL, 1, OPTAB_DIRECT);
23025 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
23026 return reg;
23030 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
23031 be needed by main loop copying SIZE_NEEDED chunks and prologue getting
23032 alignment from ALIGN to DESIRED_ALIGN. */
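/* For example, a main loop storing DImode chunks (SIZE_NEEDED == 8) on a
   64-bit target gets VAL promoted to a full DImode pattern such as
   0x5a5a5a5a5a5a5a5a, so each iteration stores eight copies of the byte at
   once, while a QImode loop (SIZE_NEEDED == 1) with no alignment prologue
   leaves VAL unpromoted.  */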
23033 static rtx
23034 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
23036 rtx promoted_val;
23038 if (TARGET_64BIT
23039 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
23040 promoted_val = promote_duplicated_reg (DImode, val);
23041 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
23042 promoted_val = promote_duplicated_reg (SImode, val);
23043 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
23044 promoted_val = promote_duplicated_reg (HImode, val);
23045 else
23046 promoted_val = val;
23048 return promoted_val;
23051 /* Expand string set operation (memset/bzero). Use i386 string operations
23052 when profitable. See the expand_movmem comment for an explanation of the
23053 individual steps performed. */
23054 bool
23055 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
23056 rtx expected_align_exp, rtx expected_size_exp)
23058 rtx destreg;
23059 rtx label = NULL;
23060 rtx tmp;
23061 rtx jump_around_label = NULL;
23062 HOST_WIDE_INT align = 1;
23063 unsigned HOST_WIDE_INT count = 0;
23064 HOST_WIDE_INT expected_size = -1;
23065 int size_needed = 0, epilogue_size_needed;
23066 int desired_align = 0, align_bytes = 0;
23067 enum stringop_alg alg;
23068 rtx promoted_val = NULL;
23069 bool force_loopy_epilogue = false;
23070 int dynamic_check;
23071 bool need_zero_guard = false;
23072 bool noalign;
23074 if (CONST_INT_P (align_exp))
23075 align = INTVAL (align_exp);
23076 /* i386 can do misaligned accesses at a reasonably increased cost. */
23077 if (CONST_INT_P (expected_align_exp)
23078 && INTVAL (expected_align_exp) > align)
23079 align = INTVAL (expected_align_exp);
23080 if (CONST_INT_P (count_exp))
23081 count = expected_size = INTVAL (count_exp);
23082 if (CONST_INT_P (expected_size_exp) && count == 0)
23083 expected_size = INTVAL (expected_size_exp);
23085 /* Make sure we don't need to care about overflow later on. */
23086 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
23087 return false;
23089 /* Step 0: Decide on preferred algorithm, desired alignment and
23090 size of chunks to be copied by main loop. */
23092 alg = decide_alg (count, expected_size, true, &dynamic_check, &noalign);
23093 desired_align = decide_alignment (align, alg, expected_size);
23095 if (!TARGET_ALIGN_STRINGOPS || noalign)
23096 align = desired_align;
23098 if (alg == libcall)
23099 return false;
23100 gcc_assert (alg != no_stringop);
23101 if (!count)
23102 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
23103 destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
23104 switch (alg)
23106 case libcall:
23107 case no_stringop:
23108 gcc_unreachable ();
23109 case loop:
23110 need_zero_guard = true;
23111 size_needed = GET_MODE_SIZE (word_mode);
23112 break;
23113 case unrolled_loop:
23114 need_zero_guard = true;
23115 size_needed = GET_MODE_SIZE (word_mode) * 4;
23116 break;
23117 case rep_prefix_8_byte:
23118 size_needed = 8;
23119 break;
23120 case rep_prefix_4_byte:
23121 size_needed = 4;
23122 break;
23123 case rep_prefix_1_byte:
23124 size_needed = 1;
23125 break;
23126 case loop_1_byte:
23127 need_zero_guard = true;
23128 size_needed = 1;
23129 break;
23131 epilogue_size_needed = size_needed;
23133 /* Step 1: Prologue guard. */
23135 /* Alignment code needs count to be in register. */
23136 if (CONST_INT_P (count_exp) && desired_align > align)
23138 if (INTVAL (count_exp) > desired_align
23139 && INTVAL (count_exp) > size_needed)
23141 align_bytes
23142 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
23143 if (align_bytes <= 0)
23144 align_bytes = 0;
23145 else
23146 align_bytes = desired_align - align_bytes;
23148 if (align_bytes == 0)
23150 enum machine_mode mode = SImode;
23151 if (TARGET_64BIT && (count & ~0xffffffff))
23152 mode = DImode;
23153 count_exp = force_reg (mode, count_exp);
23156 /* Do the cheap promotion to allow better CSE across the
23157 main loop and epilogue (i.e. one load of the big constant in
23158 front of all the code). */
23159 if (CONST_INT_P (val_exp))
23160 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23161 desired_align, align);
23162 /* Ensure that alignment prologue won't copy past end of block. */
23163 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
23165 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
23166 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
23167 Make sure it is power of 2. */
23168 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
23170 /* To improve performance for small blocks, we jump around the VAL
23171 promoting code. This means that if the promoted VAL is not constant,
23172 we might not use it in the epilogue and have to use the byte
23173 loop variant. */
23174 if (epilogue_size_needed > 2 && !promoted_val)
23175 force_loopy_epilogue = true;
23176 if (count)
23178 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
23180 /* If main algorithm works on QImode, no epilogue is needed.
23181 For small sizes just don't align anything. */
23182 if (size_needed == 1)
23183 desired_align = align;
23184 else
23185 goto epilogue;
23188 else
23190 label = gen_label_rtx ();
23191 emit_cmp_and_jump_insns (count_exp,
23192 GEN_INT (epilogue_size_needed),
23193 LTU, 0, counter_mode (count_exp), 1, label);
23194 if (expected_size == -1 || expected_size <= epilogue_size_needed)
23195 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23196 else
23197 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23200 if (dynamic_check != -1)
23202 rtx hot_label = gen_label_rtx ();
23203 jump_around_label = gen_label_rtx ();
23204 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
23205 LEU, 0, counter_mode (count_exp), 1, hot_label);
23206 predict_jump (REG_BR_PROB_BASE * 90 / 100);
23207 set_storage_via_libcall (dst, count_exp, val_exp, false);
23208 emit_jump (jump_around_label);
23209 emit_label (hot_label);
23212 /* Step 2: Alignment prologue. */
23214 /* Do the expensive promotion once we branched off the small blocks. */
23215 if (!promoted_val)
23216 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
23217 desired_align, align);
23218 gcc_assert (desired_align >= 1 && align >= 1);
23220 if (desired_align > align)
23222 if (align_bytes == 0)
23224 /* Except for the first move in the epilogue, we no longer know
23225 the constant offset in aliasing info. It doesn't seem worth
23226 the pain to maintain it for the first move, so throw away
23227 the info early. */
23228 dst = change_address (dst, BLKmode, destreg);
23229 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
23230 desired_align);
23232 else
23234 /* If we know how many bytes need to be stored before dst is
23235 sufficiently aligned, maintain aliasing info accurately. */
23236 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
23237 desired_align, align_bytes);
23238 count_exp = plus_constant (counter_mode (count_exp),
23239 count_exp, -align_bytes);
23240 count -= align_bytes;
23242 if (need_zero_guard
23243 && (count < (unsigned HOST_WIDE_INT) size_needed
23244 || (align_bytes == 0
23245 && count < ((unsigned HOST_WIDE_INT) size_needed
23246 + desired_align - align))))
23248 /* It is possible that we copied enough so the main loop will not
23249 execute. */
23250 gcc_assert (size_needed > 1);
23251 if (label == NULL_RTX)
23252 label = gen_label_rtx ();
23253 emit_cmp_and_jump_insns (count_exp,
23254 GEN_INT (size_needed),
23255 LTU, 0, counter_mode (count_exp), 1, label);
23256 if (expected_size == -1
23257 || expected_size < (desired_align - align) / 2 + size_needed)
23258 predict_jump (REG_BR_PROB_BASE * 20 / 100);
23259 else
23260 predict_jump (REG_BR_PROB_BASE * 60 / 100);
23263 if (label && size_needed == 1)
23265 emit_label (label);
23266 LABEL_NUSES (label) = 1;
23267 label = NULL;
23268 promoted_val = val_exp;
23269 epilogue_size_needed = 1;
23271 else if (label == NULL_RTX)
23272 epilogue_size_needed = size_needed;
23274 /* Step 3: Main loop. */
23276 switch (alg)
23278 case libcall:
23279 case no_stringop:
23280 gcc_unreachable ();
23281 case loop_1_byte:
23282 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23283 count_exp, QImode, 1, expected_size);
23284 break;
23285 case loop:
23286 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23287 count_exp, word_mode, 1, expected_size);
23288 break;
23289 case unrolled_loop:
23290 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
23291 count_exp, word_mode, 4, expected_size);
23292 break;
23293 case rep_prefix_8_byte:
23294 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23295 DImode, val_exp);
23296 break;
23297 case rep_prefix_4_byte:
23298 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23299 SImode, val_exp);
23300 break;
23301 case rep_prefix_1_byte:
23302 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
23303 QImode, val_exp);
23304 break;
23306 /* Properly adjust the offset of dst memory for aliasing. */
23307 if (CONST_INT_P (count_exp))
23308 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
23309 (count / size_needed) * size_needed);
23310 else
23311 dst = change_address (dst, BLKmode, destreg);
23313 /* Step 4: Epilogue to copy the remaining bytes. */
23315 if (label)
23317 /* When the main loop is done, COUNT_EXP might hold original count,
23318 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
23319 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
23320 bytes. Compensate if needed. */
23322 if (size_needed < epilogue_size_needed)
23324 tmp =
23325 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
23326 GEN_INT (size_needed - 1), count_exp, 1,
23327 OPTAB_DIRECT);
23328 if (tmp != count_exp)
23329 emit_move_insn (count_exp, tmp);
23331 emit_label (label);
23332 LABEL_NUSES (label) = 1;
23334 epilogue:
23335 if (count_exp != const0_rtx && epilogue_size_needed > 1)
23337 if (force_loopy_epilogue)
23338 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
23339 epilogue_size_needed);
23340 else
23341 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
23342 epilogue_size_needed);
23344 if (jump_around_label)
23345 emit_label (jump_around_label);
23346 return true;
23349 /* Expand the appropriate insns for doing strlen if not just doing
23350 repnz; scasb
23352 out = result, initialized with the start address
23353 align_rtx = alignment of the address.
23354 scratch = scratch register, initialized with the start address when
23355 not aligned, otherwise undefined
23357 This is just the body. It needs the initializations mentioned above and
23358 some address computation at the end. These things are done in i386.md. */
23360 static void
23361 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
23363 int align;
23364 rtx tmp;
23365 rtx align_2_label = NULL_RTX;
23366 rtx align_3_label = NULL_RTX;
23367 rtx align_4_label = gen_label_rtx ();
23368 rtx end_0_label = gen_label_rtx ();
23369 rtx mem;
23370 rtx tmpreg = gen_reg_rtx (SImode);
23371 rtx scratch = gen_reg_rtx (SImode);
23372 rtx cmp;
23374 align = 0;
23375 if (CONST_INT_P (align_rtx))
23376 align = INTVAL (align_rtx);
23378 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
23380 /* Is there a known alignment and is it less than 4? */
23381 if (align < 4)
23383 rtx scratch1 = gen_reg_rtx (Pmode);
23384 emit_move_insn (scratch1, out);
23385 /* Is there a known alignment and is it not 2? */
23386 if (align != 2)
23388 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
23389 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
23391 /* Leave just the 3 lower bits. */
23392 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
23393 NULL_RTX, 0, OPTAB_WIDEN);
23395 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23396 Pmode, 1, align_4_label);
23397 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
23398 Pmode, 1, align_2_label);
23399 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
23400 Pmode, 1, align_3_label);
23402 else
23404 /* Since the alignment is 2, we have to check 2 or 0 bytes;
23405 check whether it is aligned to a 4-byte boundary. */
23407 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
23408 NULL_RTX, 0, OPTAB_WIDEN);
23410 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
23411 Pmode, 1, align_4_label);
23414 mem = change_address (src, QImode, out);
23416 /* Now compare the bytes. */
23418 /* Compare the first n unaligned byte on a byte per byte basis. */
23419 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
23420 QImode, 1, end_0_label);
23422 /* Increment the address. */
23423 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23425 /* Not needed with an alignment of 2 */
23426 if (align != 2)
23428 emit_label (align_2_label);
23430 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23431 end_0_label);
23433 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23435 emit_label (align_3_label);
23438 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
23439 end_0_label);
23441 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
23444 /* Generate a loop to check 4 bytes at a time. It is not a good idea to
23445 align this loop; it only bloats the code and does not help to
23446 speed it up. */
23447 emit_label (align_4_label);
23449 mem = change_address (src, SImode, out);
23450 emit_move_insn (scratch, mem);
23451 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
23453 /* This formula yields a nonzero result iff one of the bytes is zero.
23454 This saves three branches inside the loop and many cycles. */
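/* Concretely, the insns emitted below compute
       tmpreg = ((scratch - 0x01010101) & ~scratch) & 0x80808080.
   For example, scratch = 0x11002233 (a zero in byte 2) gives
       scratch - 0x01010101 = 0x0fff2132
       & ~scratch (0xeeffddcc) = 0x0eff0100
       & 0x80808080           = 0x00800000   (nonzero; marks the zero byte)
   whereas scratch = 0x11223344, which has no zero byte, gives 0.  */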
23456 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
23457 emit_insn (gen_one_cmplsi2 (scratch, scratch));
23458 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
23459 emit_insn (gen_andsi3 (tmpreg, tmpreg,
23460 gen_int_mode (0x80808080, SImode)));
23461 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
23462 align_4_label);
23464 if (TARGET_CMOVE)
23466 rtx reg = gen_reg_rtx (SImode);
23467 rtx reg2 = gen_reg_rtx (Pmode);
23468 emit_move_insn (reg, tmpreg);
23469 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
23471 /* If zero is not in the first two bytes, move two bytes forward. */
23472 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23473 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23474 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23475 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
23476 gen_rtx_IF_THEN_ELSE (SImode, tmp,
23477 reg,
23478 tmpreg)));
23479 /* Emit lea manually to avoid clobbering of flags. */
23480 emit_insn (gen_rtx_SET (SImode, reg2,
23481 gen_rtx_PLUS (Pmode, out, const2_rtx)));
23483 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23484 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
23485 emit_insn (gen_rtx_SET (VOIDmode, out,
23486 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
23487 reg2,
23488 out)));
23490 else
23492 rtx end_2_label = gen_label_rtx ();
23493 /* Is zero in the first two bytes? */
23495 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
23496 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
23497 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
23498 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
23499 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
23500 pc_rtx);
23501 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
23502 JUMP_LABEL (tmp) = end_2_label;
23504 /* Not in the first two. Move two bytes forward. */
23505 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
23506 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
23508 emit_label (end_2_label);
23512 /* Avoid branch in fixing the byte. */
23513 tmpreg = gen_lowpart (QImode, tmpreg);
23514 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
23515 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
23516 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
23517 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
23519 emit_label (end_0_label);
23522 /* Expand strlen. */
23524 bool
23525 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
23527 rtx addr, scratch1, scratch2, scratch3, scratch4;
23529 /* The generic case of the strlen expander is long. Avoid expanding
23530 it unless TARGET_INLINE_ALL_STRINGOPS. */
23532 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23533 && !TARGET_INLINE_ALL_STRINGOPS
23534 && !optimize_insn_for_size_p ()
23535 && (!CONST_INT_P (align) || INTVAL (align) < 4))
23536 return false;
23538 addr = force_reg (Pmode, XEXP (src, 0));
23539 scratch1 = gen_reg_rtx (Pmode);
23541 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
23542 && !optimize_insn_for_size_p ())
23544 /* Well it seems that some optimizer does not combine a call like
23545 foo(strlen(bar), strlen(bar));
23546 when the move and the subtraction are done here. It does calculate
23547 the length just once when these instructions are done inside of
23548 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
23549 often used and I use one fewer register for the lifetime of
23550 output_strlen_unroll() this is better. */
23552 emit_move_insn (out, addr);
23554 ix86_expand_strlensi_unroll_1 (out, src, align);
23556 /* strlensi_unroll_1 returns the address of the zero at the end of
23557 the string, like memchr(), so compute the length by subtracting
23558 the start address. */
23559 emit_insn (ix86_gen_sub3 (out, out, addr));
23561 else
23563 rtx unspec;
23565 /* Can't use this if the user has appropriated eax, ecx, or edi. */
23566 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
23567 return false;
23569 scratch2 = gen_reg_rtx (Pmode);
23570 scratch3 = gen_reg_rtx (Pmode);
23571 scratch4 = force_reg (Pmode, constm1_rtx);
23573 emit_move_insn (scratch3, addr);
23574 eoschar = force_reg (QImode, eoschar);
23576 src = replace_equiv_address_nv (src, scratch3);
23578 /* If .md starts supporting :P, this can be done in .md. */
23579 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
23580 scratch4), UNSPEC_SCAS);
23581 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
23582 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
23583 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
23585 return true;
23588 /* For a given symbol (function), construct code to compute the address of its PLT
23589 entry in the large x86-64 PIC model. */
23590 static rtx
23591 construct_plt_address (rtx symbol)
23593 rtx tmp, unspec;
23595 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
23596 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
23597 gcc_assert (Pmode == DImode);
23599 tmp = gen_reg_rtx (Pmode);
23600 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
23602 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
23603 emit_insn (ix86_gen_add3 (tmp, tmp, pic_offset_table_rtx));
23604 return tmp;
23608 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
23609 rtx callarg2,
23610 rtx pop, bool sibcall)
23612 /* We need to represent that SI and DI registers are clobbered
23613 by SYSV calls. */
23614 static int clobbered_registers[] = {
23615 XMM6_REG, XMM7_REG, XMM8_REG,
23616 XMM9_REG, XMM10_REG, XMM11_REG,
23617 XMM12_REG, XMM13_REG, XMM14_REG,
23618 XMM15_REG, SI_REG, DI_REG
23620 rtx vec[ARRAY_SIZE (clobbered_registers) + 3];
23621 rtx use = NULL, call;
23622 unsigned int vec_len;
23624 if (pop == const0_rtx)
23625 pop = NULL;
23626 gcc_assert (!TARGET_64BIT || !pop);
23628 if (TARGET_MACHO && !TARGET_64BIT)
23630 #if TARGET_MACHO
23631 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
23632 fnaddr = machopic_indirect_call_target (fnaddr);
23633 #endif
23635 else
23637 /* Static functions and indirect calls don't need the pic register. */
23638 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
23639 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23640 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
23641 use_reg (&use, pic_offset_table_rtx);
23644 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
23646 rtx al = gen_rtx_REG (QImode, AX_REG);
23647 emit_move_insn (al, callarg2);
23648 use_reg (&use, al);
23651 if (ix86_cmodel == CM_LARGE_PIC
23652 && MEM_P (fnaddr)
23653 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
23654 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
23655 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
23656 else if (sibcall
23657 ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
23658 : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
23660 fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
23661 fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
23664 vec_len = 0;
23665 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
23666 if (retval)
23667 call = gen_rtx_SET (VOIDmode, retval, call);
23668 vec[vec_len++] = call;
23670 if (pop)
23672 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
23673 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
23674 vec[vec_len++] = pop;
23677 if (TARGET_64BIT_MS_ABI
23678 && (!callarg2 || INTVAL (callarg2) != -2))
23680 unsigned i;
23682 vec[vec_len++] = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
23683 UNSPEC_MS_TO_SYSV_CALL);
23685 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
23686 vec[vec_len++]
23687 = gen_rtx_CLOBBER (VOIDmode,
23688 gen_rtx_REG (SSE_REGNO_P (clobbered_registers[i])
23689 ? TImode : DImode,
23690 clobbered_registers[i]));
23693 if (vec_len > 1)
23694 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
23695 call = emit_call_insn (call);
23696 if (use)
23697 CALL_INSN_FUNCTION_USAGE (call) = use;
23699 return call;
23702 /* Output the assembly for a call instruction. */
23704 const char *
23705 ix86_output_call_insn (rtx insn, rtx call_op)
23707 bool direct_p = constant_call_address_operand (call_op, VOIDmode);
23708 bool seh_nop_p = false;
23709 const char *xasm;
23711 if (SIBLING_CALL_P (insn))
23713 if (direct_p)
23714 xasm = "jmp\t%P0";
23715 /* SEH epilogue detection requires the indirect branch case
23716 to include REX.W. */
23717 else if (TARGET_SEH)
23718 xasm = "rex.W jmp %A0";
23719 else
23720 xasm = "jmp\t%A0";
23722 output_asm_insn (xasm, &call_op);
23723 return "";
23726 /* SEH unwinding can require an extra nop to be emitted in several
23727 circumstances. Determine if we have one of those. */
23728 if (TARGET_SEH)
23730 rtx i;
23732 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
23734 /* If we get to another real insn, we don't need the nop. */
23735 if (INSN_P (i))
23736 break;
23738 /* If we get to the epilogue note, prevent a catch region from
23739 being adjacent to the standard epilogue sequence. With non-call
23740 exceptions, we'll have done this during epilogue emission. */
23741 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
23742 && !flag_non_call_exceptions
23743 && !can_throw_internal (insn))
23745 seh_nop_p = true;
23746 break;
23750 /* If we didn't find a real insn following the call, prevent the
23751 unwinder from looking into the next function. */
23752 if (i == NULL)
23753 seh_nop_p = true;
23756 if (direct_p)
23757 xasm = "call\t%P0";
23758 else
23759 xasm = "call\t%A0";
23761 output_asm_insn (xasm, &call_op);
23763 if (seh_nop_p)
23764 return "nop";
23766 return "";
23769 /* Clear stack slot assignments remembered from previous functions.
23770 This is called from INIT_EXPANDERS once before RTL is emitted for each
23771 function. */
23773 static struct machine_function *
23774 ix86_init_machine_status (void)
23776 struct machine_function *f;
23778 f = ggc_alloc_cleared_machine_function ();
23779 f->use_fast_prologue_epilogue_nregs = -1;
23780 f->call_abi = ix86_abi;
23782 return f;
23785 /* Return a MEM corresponding to a stack slot with mode MODE.
23786 Allocate a new slot if necessary.
23788 The RTL for a function can have several slots available: N is
23789 which slot to use. */
23792 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
23794 struct stack_local_entry *s;
23796 gcc_assert (n < MAX_386_STACK_LOCALS);
23798 for (s = ix86_stack_locals; s; s = s->next)
23799 if (s->mode == mode && s->n == n)
23800 return validize_mem (copy_rtx (s->rtl));
23802 s = ggc_alloc_stack_local_entry ();
23803 s->n = n;
23804 s->mode = mode;
23805 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
23807 s->next = ix86_stack_locals;
23808 ix86_stack_locals = s;
23809 return validize_mem (s->rtl);
23812 static void
23813 ix86_instantiate_decls (void)
23815 struct stack_local_entry *s;
23817 for (s = ix86_stack_locals; s; s = s->next)
23818 if (s->rtl != NULL_RTX)
23819 instantiate_decl_rtl (s->rtl);
23822 /* Calculate the length of the memory address in the instruction encoding.
23823 Includes addr32 prefix, does not include the one-byte modrm, opcode,
23824 or other prefixes. We never generate addr32 prefix for LEA insn. */
23827 memory_address_length (rtx addr, bool lea)
23829 struct ix86_address parts;
23830 rtx base, index, disp;
23831 int len;
23832 int ok;
23834 if (GET_CODE (addr) == PRE_DEC
23835 || GET_CODE (addr) == POST_INC
23836 || GET_CODE (addr) == PRE_MODIFY
23837 || GET_CODE (addr) == POST_MODIFY)
23838 return 0;
23840 ok = ix86_decompose_address (addr, &parts);
23841 gcc_assert (ok);
23843 len = (parts.seg == SEG_DEFAULT) ? 0 : 1;
23845 /* If this is not LEA instruction, add the length of addr32 prefix. */
23846 if (TARGET_64BIT && !lea
23847 && (SImode_address_operand (addr, VOIDmode)
23848 || (parts.base && GET_MODE (parts.base) == SImode)
23849 || (parts.index && GET_MODE (parts.index) == SImode)))
23850 len++;
23852 base = parts.base;
23853 index = parts.index;
23854 disp = parts.disp;
23856 if (base && GET_CODE (base) == SUBREG)
23857 base = SUBREG_REG (base);
23858 if (index && GET_CODE (index) == SUBREG)
23859 index = SUBREG_REG (index);
23861 gcc_assert (base == NULL_RTX || REG_P (base));
23862 gcc_assert (index == NULL_RTX || REG_P (index));
23864 /* Rule of thumb:
23865 - esp as the base always wants an index,
23866 - ebp as the base always wants a displacement,
23867 - r12 as the base always wants an index,
23868 - r13 as the base always wants a displacement. */
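/* For instance, (%eax) needs nothing beyond the modrm byte, while (%esp)
   also needs a SIB byte and plain (%ebp) is only encodable as 0(%ebp)
   with a one-byte displacement; in both of the latter cases the address
   costs one extra byte, which is what the length computation below
   accounts for.  */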
23870 /* Register Indirect. */
23871 if (base && !index && !disp)
23873 /* esp (for its index) and ebp (for its displacement) need
23874 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
23875 code. */
23876 if (base == arg_pointer_rtx
23877 || base == frame_pointer_rtx
23878 || REGNO (base) == SP_REG
23879 || REGNO (base) == BP_REG
23880 || REGNO (base) == R12_REG
23881 || REGNO (base) == R13_REG)
23882 len++;
23885 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
23886 is not disp32, but disp32(%rip), so for disp32
23887 SIB byte is needed, unless print_operand_address
23888 optimizes it into disp32(%rip) or (%rip) is implied
23889 by UNSPEC. */
23890 else if (disp && !base && !index)
23892 len += 4;
23893 if (TARGET_64BIT)
23895 rtx symbol = disp;
23897 if (GET_CODE (disp) == CONST)
23898 symbol = XEXP (disp, 0);
23899 if (GET_CODE (symbol) == PLUS
23900 && CONST_INT_P (XEXP (symbol, 1)))
23901 symbol = XEXP (symbol, 0);
23903 if (GET_CODE (symbol) != LABEL_REF
23904 && (GET_CODE (symbol) != SYMBOL_REF
23905 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
23906 && (GET_CODE (symbol) != UNSPEC
23907 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
23908 && XINT (symbol, 1) != UNSPEC_PCREL
23909 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
23910 len++;
23913 else
23915 /* Find the length of the displacement constant. */
23916 if (disp)
23918 if (base && satisfies_constraint_K (disp))
23919 len += 1;
23920 else
23921 len += 4;
23923 /* ebp always wants a displacement. Similarly r13. */
23924 else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
23925 len++;
23927 /* An index requires the two-byte modrm form.... */
23928 if (index
23929 /* ...like esp (or r12), which always wants an index. */
23930 || base == arg_pointer_rtx
23931 || base == frame_pointer_rtx
23932 || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
23933 len++;
23936 return len;
23939 /* Compute default value for "length_immediate" attribute. When SHORTFORM
23940 is set, expect that the insn has an 8-bit immediate alternative. */
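/* For example, "addl $100, %eax" can use the sign-extended 8-bit immediate
   form (1 byte of immediate), while "addl $1000, %eax" needs the full
   4-byte immediate; the IN_RANGE (-128, 127) test below makes exactly that
   distinction for short-form insns.  */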
23942 ix86_attr_length_immediate_default (rtx insn, bool shortform)
23944 int len = 0;
23945 int i;
23946 extract_insn_cached (insn);
23947 for (i = recog_data.n_operands - 1; i >= 0; --i)
23948 if (CONSTANT_P (recog_data.operand[i]))
23950 enum attr_mode mode = get_attr_mode (insn);
23952 gcc_assert (!len);
23953 if (shortform && CONST_INT_P (recog_data.operand[i]))
23955 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
23956 switch (mode)
23958 case MODE_QI:
23959 len = 1;
23960 continue;
23961 case MODE_HI:
23962 ival = trunc_int_for_mode (ival, HImode);
23963 break;
23964 case MODE_SI:
23965 ival = trunc_int_for_mode (ival, SImode);
23966 break;
23967 default:
23968 break;
23970 if (IN_RANGE (ival, -128, 127))
23972 len = 1;
23973 continue;
23976 switch (mode)
23978 case MODE_QI:
23979 len = 1;
23980 break;
23981 case MODE_HI:
23982 len = 2;
23983 break;
23984 case MODE_SI:
23985 len = 4;
23986 break;
23987 /* Immediates for DImode instructions are encoded
23988 as 32bit sign extended values. */
23989 case MODE_DI:
23990 len = 4;
23991 break;
23992 default:
23993 fatal_insn ("unknown insn mode", insn);
23996 return len;
23999 /* Compute default value for "length_address" attribute. */
24001 ix86_attr_length_address_default (rtx insn)
24003 int i;
24005 if (get_attr_type (insn) == TYPE_LEA)
24007 rtx set = PATTERN (insn), addr;
24009 if (GET_CODE (set) == PARALLEL)
24010 set = XVECEXP (set, 0, 0);
24012 gcc_assert (GET_CODE (set) == SET);
24014 addr = SET_SRC (set);
24016 return memory_address_length (addr, true);
24019 extract_insn_cached (insn);
24020 for (i = recog_data.n_operands - 1; i >= 0; --i)
24021 if (MEM_P (recog_data.operand[i]))
24023 constrain_operands_cached (reload_completed);
24024 if (which_alternative != -1)
24026 const char *constraints = recog_data.constraints[i];
24027 int alt = which_alternative;
24029 while (*constraints == '=' || *constraints == '+')
24030 constraints++;
24031 while (alt-- > 0)
24032 while (*constraints++ != ',')
24034 /* Skip ignored operands. */
24035 if (*constraints == 'X')
24036 continue;
24038 return memory_address_length (XEXP (recog_data.operand[i], 0), false);
24040 return 0;
24043 /* Compute default value for "length_vex" attribute. It includes
24044 2 or 3 byte VEX prefix and 1 opcode byte. */
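/* For example, "vaddps %xmm0, %xmm1, %xmm2" fits the 2-byte (C5) VEX
   prefix, while a non-0f opcode map, VEX.W, a DImode general register
   operand (REX.W) or an extended register in a memory address (REX.X or
   REX.B) forces the 3-byte (C4) form, as the checks below reflect.  */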
24047 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
24049 int i;
24051 /* Only the 0f opcode map can use the 2-byte VEX prefix; the VEX.W bit
24052 requires the 3-byte VEX prefix. */
24053 if (!has_0f_opcode || has_vex_w)
24054 return 3 + 1;
24056 /* We can always use 2 byte VEX prefix in 32bit. */
24057 if (!TARGET_64BIT)
24058 return 2 + 1;
24060 extract_insn_cached (insn);
24062 for (i = recog_data.n_operands - 1; i >= 0; --i)
24063 if (REG_P (recog_data.operand[i]))
24065 /* REX.W bit uses 3 byte VEX prefix. */
24066 if (GET_MODE (recog_data.operand[i]) == DImode
24067 && GENERAL_REG_P (recog_data.operand[i]))
24068 return 3 + 1;
24070 else
24072 /* REX.X or REX.B bits use 3 byte VEX prefix. */
24073 if (MEM_P (recog_data.operand[i])
24074 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
24075 return 3 + 1;
24078 return 2 + 1;
24081 /* Return the maximum number of instructions a cpu can issue. */
24083 static int
24084 ix86_issue_rate (void)
24086 switch (ix86_tune)
24088 case PROCESSOR_PENTIUM:
24089 case PROCESSOR_ATOM:
24090 case PROCESSOR_K6:
24091 case PROCESSOR_BTVER2:
24092 return 2;
24094 case PROCESSOR_PENTIUMPRO:
24095 case PROCESSOR_PENTIUM4:
24096 case PROCESSOR_CORE2:
24097 case PROCESSOR_COREI7:
24098 case PROCESSOR_HASWELL:
24099 case PROCESSOR_ATHLON:
24100 case PROCESSOR_K8:
24101 case PROCESSOR_AMDFAM10:
24102 case PROCESSOR_NOCONA:
24103 case PROCESSOR_GENERIC32:
24104 case PROCESSOR_GENERIC64:
24105 case PROCESSOR_BDVER1:
24106 case PROCESSOR_BDVER2:
24107 case PROCESSOR_BDVER3:
24108 case PROCESSOR_BTVER1:
24109 return 3;
24111 default:
24112 return 1;
24116 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
24117 by DEP_INSN and nothing else set by DEP_INSN. */
24119 static bool
24120 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
24122 rtx set, set2;
24124 /* Simplify the test for uninteresting insns. */
24125 if (insn_type != TYPE_SETCC
24126 && insn_type != TYPE_ICMOV
24127 && insn_type != TYPE_FCMOV
24128 && insn_type != TYPE_IBR)
24129 return false;
24131 if ((set = single_set (dep_insn)) != 0)
24133 set = SET_DEST (set);
24134 set2 = NULL_RTX;
24136 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
24137 && XVECLEN (PATTERN (dep_insn), 0) == 2
24138 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
24139 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
24141 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
24142 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
24144 else
24145 return false;
24147 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
24148 return false;
24150 /* This test is true if the dependent insn reads the flags but
24151 not any other potentially set register. */
24152 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
24153 return false;
24155 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
24156 return false;
24158 return true;
24161 /* Return true iff USE_INSN has a memory address with operands set by
24162 SET_INSN. */
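/* A typical example is "addl $4, %ebx" immediately followed by
   "movl (%ebx), %eax": the load's address operand is modified by the
   preceding insn, so on Pentium an address generation interlock adds a
   cycle of latency (see the PROCESSOR_PENTIUM case of ix86_adjust_cost
   below).  */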
24164 bool
24165 ix86_agi_dependent (rtx set_insn, rtx use_insn)
24167 int i;
24168 extract_insn_cached (use_insn);
24169 for (i = recog_data.n_operands - 1; i >= 0; --i)
24170 if (MEM_P (recog_data.operand[i]))
24172 rtx addr = XEXP (recog_data.operand[i], 0);
24173 return modified_in_p (addr, set_insn) != 0;
24175 return false;
24178 static int
24179 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
24181 enum attr_type insn_type, dep_insn_type;
24182 enum attr_memory memory;
24183 rtx set, set2;
24184 int dep_insn_code_number;
24186 /* Anti and output dependencies have zero cost on all CPUs. */
24187 if (REG_NOTE_KIND (link) != 0)
24188 return 0;
24190 dep_insn_code_number = recog_memoized (dep_insn);
24192 /* If we can't recognize the insns, we can't really do anything. */
24193 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
24194 return cost;
24196 insn_type = get_attr_type (insn);
24197 dep_insn_type = get_attr_type (dep_insn);
24199 switch (ix86_tune)
24201 case PROCESSOR_PENTIUM:
24202 /* Address Generation Interlock adds a cycle of latency. */
24203 if (insn_type == TYPE_LEA)
24205 rtx addr = PATTERN (insn);
24207 if (GET_CODE (addr) == PARALLEL)
24208 addr = XVECEXP (addr, 0, 0);
24210 gcc_assert (GET_CODE (addr) == SET);
24212 addr = SET_SRC (addr);
24213 if (modified_in_p (addr, dep_insn))
24214 cost += 1;
24216 else if (ix86_agi_dependent (dep_insn, insn))
24217 cost += 1;
24219 /* ??? Compares pair with jump/setcc. */
24220 if (ix86_flags_dependent (insn, dep_insn, insn_type))
24221 cost = 0;
24223 /* Floating point stores require value to be ready one cycle earlier. */
24224 if (insn_type == TYPE_FMOV
24225 && get_attr_memory (insn) == MEMORY_STORE
24226 && !ix86_agi_dependent (dep_insn, insn))
24227 cost += 1;
24228 break;
24230 case PROCESSOR_PENTIUMPRO:
24231 memory = get_attr_memory (insn);
24233 /* INT->FP conversion is expensive. */
24234 if (get_attr_fp_int_src (dep_insn))
24235 cost += 5;
24237 /* There is one cycle extra latency between an FP op and a store. */
24238 if (insn_type == TYPE_FMOV
24239 && (set = single_set (dep_insn)) != NULL_RTX
24240 && (set2 = single_set (insn)) != NULL_RTX
24241 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
24242 && MEM_P (SET_DEST (set2)))
24243 cost += 1;
24245 /* Show the reorder buffer's ability to hide the latency of a load by
24246 executing it in parallel with the previous instruction, when the
24247 previous instruction is not needed to compute the address. */
24248 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24249 && !ix86_agi_dependent (dep_insn, insn))
24251 /* Claim that moves take one cycle, as the core can issue one load
24252 at a time and the next load can start a cycle later. */
24253 if (dep_insn_type == TYPE_IMOV
24254 || dep_insn_type == TYPE_FMOV)
24255 cost = 1;
24256 else if (cost > 1)
24257 cost--;
24259 break;
24261 case PROCESSOR_K6:
24262 memory = get_attr_memory (insn);
24264 /* The esp dependency is resolved before the instruction is really
24265 finished. */
24266 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
24267 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
24268 return 1;
24270 /* INT->FP conversion is expensive. */
24271 if (get_attr_fp_int_src (dep_insn))
24272 cost += 5;
24274 /* Show the reorder buffer's ability to hide the latency of a load by
24275 executing it in parallel with the previous instruction, when the
24276 previous instruction is not needed to compute the address. */
24277 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24278 && !ix86_agi_dependent (dep_insn, insn))
24280 /* Claim that moves take one cycle, as the core can issue one load
24281 at a time and the next load can start a cycle later. */
24282 if (dep_insn_type == TYPE_IMOV
24283 || dep_insn_type == TYPE_FMOV)
24284 cost = 1;
24285 else if (cost > 2)
24286 cost -= 2;
24287 else
24288 cost = 1;
24290 break;
24292 case PROCESSOR_ATHLON:
24293 case PROCESSOR_K8:
24294 case PROCESSOR_AMDFAM10:
24295 case PROCESSOR_BDVER1:
24296 case PROCESSOR_BDVER2:
24297 case PROCESSOR_BDVER3:
24298 case PROCESSOR_BTVER1:
24299 case PROCESSOR_BTVER2:
24300 case PROCESSOR_ATOM:
24301 case PROCESSOR_GENERIC32:
24302 case PROCESSOR_GENERIC64:
24303 memory = get_attr_memory (insn);
24305 /* Show the reorder buffer's ability to hide the latency of a load by
24306 executing it in parallel with the previous instruction, when the
24307 previous instruction is not needed to compute the address. */
24308 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
24309 && !ix86_agi_dependent (dep_insn, insn))
24311 enum attr_unit unit = get_attr_unit (insn);
24312 int loadcost = 3;
24314 /* Because of the difference between the length of integer and
24315 floating unit pipeline preparation stages, the memory operands
24316 for floating point are cheaper.
24318 ??? For Athlon the difference is most probably 2. */
24319 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
24320 loadcost = 3;
24321 else
24322 loadcost = TARGET_ATHLON ? 2 : 0;
24324 if (cost >= loadcost)
24325 cost -= loadcost;
24326 else
24327 cost = 0;
24330 default:
24331 break;
24334 return cost;
24337 /* How many alternative schedules to try. This should be as wide as the
24338 scheduling freedom in the DFA, but no wider. Making this value too
24339 large results in extra work for the scheduler. */
24341 static int
24342 ia32_multipass_dfa_lookahead (void)
24344 switch (ix86_tune)
24346 case PROCESSOR_PENTIUM:
24347 return 2;
24349 case PROCESSOR_PENTIUMPRO:
24350 case PROCESSOR_K6:
24351 return 1;
24353 case PROCESSOR_CORE2:
24354 case PROCESSOR_COREI7:
24355 case PROCESSOR_HASWELL:
24356 case PROCESSOR_ATOM:
24357 /* Generally, we want haifa-sched:max_issue() to look ahead as far
24358 as the number of instructions that can be executed in a cycle, i.e.,
24359 issue_rate. I wonder why tuning for many CPUs does not do this. */
24360 if (reload_completed)
24361 return ix86_issue_rate ();
24362 /* Don't use lookahead for pre-reload schedule to save compile time. */
24363 return 0;
24365 default:
24366 return 0;
24370 /* Try to reorder the ready list to take advantage of Atom's pipelined IMUL
24371 execution. The reordering is applied if
24372 (1) an IMUL instruction is at the top of the ready list, and
24373 (2) the ready list contains exactly one producer of an independent
24374 IMUL instruction;
24375 (3) in that case the producer found is put on top of the ready list.
24376 Returns the issue rate. */
24378 static int
24379 ix86_sched_reorder(FILE *dump, int sched_verbose, rtx *ready, int *pn_ready,
24380 int clock_var ATTRIBUTE_UNUSED)
24382 static int issue_rate = -1;
24383 int n_ready = *pn_ready;
24384 rtx insn, insn1, insn2;
24385 int i;
24386 sd_iterator_def sd_it;
24387 dep_t dep;
24388 int index = -1;
24390 /* Set up issue rate. */
24391 issue_rate = ix86_issue_rate();
24393 /* Do reordering for Atom only. */
24394 if (ix86_tune != PROCESSOR_ATOM)
24395 return issue_rate;
24396 /* Do not perform ready list reordering for the pre-reload schedule pass. */
24397 if (!reload_completed)
24398 return issue_rate;
24399 /* Nothing to do if ready list contains only 1 instruction. */
24400 if (n_ready <= 1)
24401 return issue_rate;
24403 /* Check that IMUL instruction is on the top of ready list. */
24404 insn = ready[n_ready - 1];
24405 if (!NONDEBUG_INSN_P (insn))
24406 return issue_rate;
24407 insn = PATTERN (insn);
24408 if (GET_CODE (insn) == PARALLEL)
24409 insn = XVECEXP (insn, 0, 0);
24410 if (GET_CODE (insn) != SET)
24411 return issue_rate;
24412 if (!(GET_CODE (SET_SRC (insn)) == MULT
24413 && GET_MODE (SET_SRC (insn)) == SImode))
24414 return issue_rate;
24416 /* Search for producer of independent IMUL instruction. */
24417 for (i = n_ready - 2; i>= 0; i--)
24419 insn = ready[i];
24420 if (!NONDEBUG_INSN_P (insn))
24421 continue;
24422 /* Skip IMUL instruction. */
24423 insn2 = PATTERN (insn);
24424 if (GET_CODE (insn2) == PARALLEL)
24425 insn2 = XVECEXP (insn2, 0, 0);
24426 if (GET_CODE (insn2) == SET
24427 && GET_CODE (SET_SRC (insn2)) == MULT
24428 && GET_MODE (SET_SRC (insn2)) == SImode)
24429 continue;
24431 FOR_EACH_DEP (insn, SD_LIST_FORW, sd_it, dep)
24433 rtx con;
24434 con = DEP_CON (dep);
24435 if (!NONDEBUG_INSN_P (con))
24436 continue;
24437 insn1 = PATTERN (con);
24438 if (GET_CODE (insn1) == PARALLEL)
24439 insn1 = XVECEXP (insn1, 0, 0);
24441 if (GET_CODE (insn1) == SET
24442 && GET_CODE (SET_SRC (insn1)) == MULT
24443 && GET_MODE (SET_SRC (insn1)) == SImode)
24445 sd_iterator_def sd_it1;
24446 dep_t dep1;
24447 /* Check that there is no other producer for this IMUL. */
24448 index = i;
24449 FOR_EACH_DEP (con, SD_LIST_BACK, sd_it1, dep1)
24451 rtx pro;
24452 pro = DEP_PRO (dep1);
24453 if (!NONDEBUG_INSN_P (pro))
24454 continue;
24455 if (pro != insn)
24456 index = -1;
24458 if (index >= 0)
24459 break;
24462 if (index >= 0)
24463 break;
24465 if (index < 0)
24466 return issue_rate; /* Didn't find IMUL producer. */
24468 if (sched_verbose > 1)
24469 fprintf(dump, ";;\tatom sched_reorder: swap %d and %d insns\n",
24470 INSN_UID (ready[index]), INSN_UID (ready[n_ready - 1]));
24472 /* Put IMUL producer (ready[index]) at the top of ready list. */
24473 insn1= ready[index];
24474 for (i = index; i < n_ready - 1; i++)
24475 ready[i] = ready[i + 1];
24476 ready[n_ready - 1] = insn1;
24478 return issue_rate;
24481 static bool
24482 ix86_class_likely_spilled_p (reg_class_t);
24484 /* Return true if the lhs of INSN is a HW function argument register; set
24485 *is_spilled to true if it is a likely spilled HW register. */
24486 static bool
24487 insn_is_function_arg (rtx insn, bool* is_spilled)
24489 rtx dst;
24491 if (!NONDEBUG_INSN_P (insn))
24492 return false;
24493 /* Call instructions are not movable; ignore them. */
24494 if (CALL_P (insn))
24495 return false;
24496 insn = PATTERN (insn);
24497 if (GET_CODE (insn) == PARALLEL)
24498 insn = XVECEXP (insn, 0, 0);
24499 if (GET_CODE (insn) != SET)
24500 return false;
24501 dst = SET_DEST (insn);
24502 if (REG_P (dst) && HARD_REGISTER_P (dst)
24503 && ix86_function_arg_regno_p (REGNO (dst)))
24505 /* Is it likely spilled HW register? */
24506 if (!TEST_HARD_REG_BIT (fixed_reg_set, REGNO (dst))
24507 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst))))
24508 *is_spilled = true;
24509 return true;
24511 return false;
24514 /* Add output dependencies for a chain of adjacent function arguments, but
24515 only if there is a move to a likely spilled HW register. Return the first
24516 argument if at least one dependence was added, or NULL otherwise. */
24517 static rtx
24518 add_parameter_dependencies (rtx call, rtx head)
24520 rtx insn;
24521 rtx last = call;
24522 rtx first_arg = NULL;
24523 bool is_spilled = false;
24525 head = PREV_INSN (head);
24527 /* Find the argument-passing instruction nearest to the call. */
24528 while (true)
24530 last = PREV_INSN (last);
24531 if (last == head)
24532 return NULL;
24533 if (!NONDEBUG_INSN_P (last))
24534 continue;
24535 if (insn_is_function_arg (last, &is_spilled))
24536 break;
24537 return NULL;
24540 first_arg = last;
24541 while (true)
24543 insn = PREV_INSN (last);
24544 if (!INSN_P (insn))
24545 break;
24546 if (insn == head)
24547 break;
24548 if (!NONDEBUG_INSN_P (insn))
24550 last = insn;
24551 continue;
24553 if (insn_is_function_arg (insn, &is_spilled))
24555 /* Add an output dependence between two function arguments if the chain
24556 of output arguments contains likely spilled HW registers. */
24557 if (is_spilled)
24558 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
24559 first_arg = last = insn;
24561 else
24562 break;
24564 if (!is_spilled)
24565 return NULL;
24566 return first_arg;
24569 /* Add output or anti dependency from insn to first_arg to restrict its code
24570 motion. */
24571 static void
24572 avoid_func_arg_motion (rtx first_arg, rtx insn)
24574 rtx set;
24575 rtx tmp;
24577 set = single_set (insn);
24578 if (!set)
24579 return;
24580 tmp = SET_DEST (set);
24581 if (REG_P (tmp))
24583 /* Add output dependency to the first function argument. */
24584 add_dependence (first_arg, insn, REG_DEP_OUTPUT);
24585 return;
24587 /* Add anti dependency. */
24588 add_dependence (first_arg, insn, REG_DEP_ANTI);
24591 /* Avoid cross block motion of function argument through adding dependency
24592 from the first non-jump instruction in bb. */
24593 static void
24594 add_dependee_for_func_arg (rtx arg, basic_block bb)
24596 rtx insn = BB_END (bb);
24598 while (insn)
24600 if (NONDEBUG_INSN_P (insn) && NONJUMP_INSN_P (insn))
24602 rtx set = single_set (insn);
24603 if (set)
24605 avoid_func_arg_motion (arg, insn);
24606 return;
24609 if (insn == BB_HEAD (bb))
24610 return;
24611 insn = PREV_INSN (insn);
24615 /* Hook for pre-reload schedule - avoid motion of function arguments
24616 passed in likely spilled HW registers. */
24617 static void
24618 ix86_dependencies_evaluation_hook (rtx head, rtx tail)
24620 rtx insn;
24621 rtx first_arg = NULL;
24622 if (reload_completed)
24623 return;
24624 while (head != tail && DEBUG_INSN_P (head))
24625 head = NEXT_INSN (head);
24626 for (insn = tail; insn != head; insn = PREV_INSN (insn))
24627 if (INSN_P (insn) && CALL_P (insn))
24629 first_arg = add_parameter_dependencies (insn, head);
24630 if (first_arg)
24632 /* Add a dependee for the first argument to predecessors, but only if the
24633 region contains more than one block. */
24634 basic_block bb = BLOCK_FOR_INSN (insn);
24635 int rgn = CONTAINING_RGN (bb->index);
24636 int nr_blks = RGN_NR_BLOCKS (rgn);
24637 /* Skip trivial regions and region head blocks that can have
24638 predecessors outside of region. */
24639 if (nr_blks > 1 && BLOCK_TO_BB (bb->index) != 0)
24641 edge e;
24642 edge_iterator ei;
24644 /* Regions are SCCs with the exception of selective
24645 scheduling with pipelining of outer blocks enabled.
24646 So also check that immediate predecessors of a non-head
24647 block are in the same region. */
24648 FOR_EACH_EDGE (e, ei, bb->preds)
24650 /* Avoid creating loop-carried dependencies by using
24651 the topological ordering in the region. */
24652 if (rgn == CONTAINING_RGN (e->src->index)
24653 && BLOCK_TO_BB (bb->index) > BLOCK_TO_BB (e->src->index))
24654 add_dependee_for_func_arg (first_arg, e->src);
24657 insn = first_arg;
24658 if (insn == head)
24659 break;
24662 else if (first_arg)
24663 avoid_func_arg_motion (first_arg, insn);
24666 /* Hook for pre-reload schedule - set priority of moves from likely spilled
24667 HW registers to maximum, to schedule them as soon as possible. These are
24668 moves from function argument registers at the top of the function entry
24669 and moves from function return value registers after a call. */
24670 static int
24671 ix86_adjust_priority (rtx insn, int priority)
24673 rtx set;
24675 if (reload_completed)
24676 return priority;
24678 if (!NONDEBUG_INSN_P (insn))
24679 return priority;
24681 set = single_set (insn);
24682 if (set)
24684 rtx tmp = SET_SRC (set);
24685 if (REG_P (tmp)
24686 && HARD_REGISTER_P (tmp)
24687 && !TEST_HARD_REG_BIT (fixed_reg_set, REGNO (tmp))
24688 && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (tmp))))
24689 return current_sched_info->sched_max_insns_priority;
24692 return priority;
24695 /* Model decoder of Core 2/i7.
24696 Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
24697 track the instruction fetch block boundaries and make sure that long
24698 (9+ bytes) instructions are assigned to D0. */
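/* For example, with the Core 2/i7 values below (16-byte ifetch blocks, at
   most 6 insns per block, 8-byte limit for the secondary decoders), an
   11-byte insn is only accepted as the first insn of a decode group, and
   any insn that would push the running block length past 16 bytes is
   filtered out of the ready list until the next cycle.  */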
24700 /* Maximum length of an insn that can be handled by
24701 a secondary decoder unit. '8' for Core 2/i7. */
24702 static int core2i7_secondary_decoder_max_insn_size;
24704 /* Ifetch block size, i.e., number of bytes decoder reads per cycle.
24705 '16' for Core 2/i7. */
24706 static int core2i7_ifetch_block_size;
24708 /* Maximum number of instructions decoder can handle per cycle.
24709 '6' for Core 2/i7. */
24710 static int core2i7_ifetch_block_max_insns;
24712 typedef struct ix86_first_cycle_multipass_data_ *
24713 ix86_first_cycle_multipass_data_t;
24714 typedef const struct ix86_first_cycle_multipass_data_ *
24715 const_ix86_first_cycle_multipass_data_t;
24717 /* A variable to store target state across calls to max_issue within
24718 one cycle. */
24719 static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
24720 *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
24722 /* Initialize DATA. */
24723 static void
24724 core2i7_first_cycle_multipass_init (void *_data)
24726 ix86_first_cycle_multipass_data_t data
24727 = (ix86_first_cycle_multipass_data_t) _data;
24729 data->ifetch_block_len = 0;
24730 data->ifetch_block_n_insns = 0;
24731 data->ready_try_change = NULL;
24732 data->ready_try_change_size = 0;
24735 /* Advancing the cycle; reset ifetch block counts. */
24736 static void
24737 core2i7_dfa_post_advance_cycle (void)
24739 ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
24741 gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24743 data->ifetch_block_len = 0;
24744 data->ifetch_block_n_insns = 0;
24747 static int min_insn_size (rtx);
24749 /* Filter out insns from ready_try that the core will not be able to issue
24750 on the current cycle due to decoder restrictions. */
24751 static void
24752 core2i7_first_cycle_multipass_filter_ready_try
24753 (const_ix86_first_cycle_multipass_data_t data,
24754 char *ready_try, int n_ready, bool first_cycle_insn_p)
24756 while (n_ready--)
24758 rtx insn;
24759 int insn_size;
24761 if (ready_try[n_ready])
24762 continue;
24764 insn = get_ready_element (n_ready);
24765 insn_size = min_insn_size (insn);
24767 if (/* If this is too long an insn for a secondary decoder ... */
24768 (!first_cycle_insn_p
24769 && insn_size > core2i7_secondary_decoder_max_insn_size)
24770 /* ... or it would not fit into the ifetch block ... */
24771 || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
24772 /* ... or the decoder is full already ... */
24773 || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
24774 /* ... mask the insn out. */
24776 ready_try[n_ready] = 1;
24778 if (data->ready_try_change)
24779 bitmap_set_bit (data->ready_try_change, n_ready);
24784 /* Prepare for a new round of multipass lookahead scheduling. */
24785 static void
24786 core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
24787 bool first_cycle_insn_p)
24789 ix86_first_cycle_multipass_data_t data
24790 = (ix86_first_cycle_multipass_data_t) _data;
24791 const_ix86_first_cycle_multipass_data_t prev_data
24792 = ix86_first_cycle_multipass_data;
24794 /* Restore the state from the end of the previous round. */
24795 data->ifetch_block_len = prev_data->ifetch_block_len;
24796 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
24798 /* Filter instructions that cannot be issued on current cycle due to
24799 decoder restrictions. */
24800 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24801 first_cycle_insn_p);
24804 /* INSN is being issued in current solution. Account for its impact on
24805 the decoder model. */
24806 static void
24807 core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
24808 rtx insn, const void *_prev_data)
24810 ix86_first_cycle_multipass_data_t data
24811 = (ix86_first_cycle_multipass_data_t) _data;
24812 const_ix86_first_cycle_multipass_data_t prev_data
24813 = (const_ix86_first_cycle_multipass_data_t) _prev_data;
24815 int insn_size = min_insn_size (insn);
24817 data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
24818 data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
24819 gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
24820 && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
24822 /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
24823 if (!data->ready_try_change)
24825 data->ready_try_change = sbitmap_alloc (n_ready);
24826 data->ready_try_change_size = n_ready;
24828 else if (data->ready_try_change_size < n_ready)
24830 data->ready_try_change = sbitmap_resize (data->ready_try_change,
24831 n_ready, 0);
24832 data->ready_try_change_size = n_ready;
24834 bitmap_clear (data->ready_try_change);
24836 /* Filter out insns from ready_try that the core will not be able to issue
24837 on the current cycle due to decoder restrictions. */
24838 core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
24839 false);
24842 /* Revert the effect on ready_try. */
24843 static void
24844 core2i7_first_cycle_multipass_backtrack (const void *_data,
24845 char *ready_try,
24846 int n_ready ATTRIBUTE_UNUSED)
24848 const_ix86_first_cycle_multipass_data_t data
24849 = (const_ix86_first_cycle_multipass_data_t) _data;
24850 unsigned int i = 0;
24851 sbitmap_iterator sbi;
24853 gcc_assert (bitmap_last_set_bit (data->ready_try_change) < n_ready);
24854 EXECUTE_IF_SET_IN_BITMAP (data->ready_try_change, 0, i, sbi)
24856 ready_try[i] = 0;
24860 /* Save the result of multipass lookahead scheduling for the next round. */
24861 static void
24862 core2i7_first_cycle_multipass_end (const void *_data)
24864 const_ix86_first_cycle_multipass_data_t data
24865 = (const_ix86_first_cycle_multipass_data_t) _data;
24866 ix86_first_cycle_multipass_data_t next_data
24867 = ix86_first_cycle_multipass_data;
24869 if (data != NULL)
24871 next_data->ifetch_block_len = data->ifetch_block_len;
24872 next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
24876 /* Deallocate target data. */
24877 static void
24878 core2i7_first_cycle_multipass_fini (void *_data)
24880 ix86_first_cycle_multipass_data_t data
24881 = (ix86_first_cycle_multipass_data_t) _data;
24883 if (data->ready_try_change)
24885 sbitmap_free (data->ready_try_change);
24886 data->ready_try_change = NULL;
24887 data->ready_try_change_size = 0;
24891 /* Prepare for scheduling pass. */
24892 static void
24893 ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
24894 int verbose ATTRIBUTE_UNUSED,
24895 int max_uid ATTRIBUTE_UNUSED)
24897 /* Install scheduling hooks for current CPU. Some of these hooks are used
24898 in time-critical parts of the scheduler, so we only set them up when
24899 they are actually used. */
24900 switch (ix86_tune)
24902 case PROCESSOR_CORE2:
24903 case PROCESSOR_COREI7:
24904 case PROCESSOR_HASWELL:
24905 /* Do not perform multipass scheduling for the pre-reload schedule,
24906 to save compile time. */
24907 if (reload_completed)
24909 targetm.sched.dfa_post_advance_cycle
24910 = core2i7_dfa_post_advance_cycle;
24911 targetm.sched.first_cycle_multipass_init
24912 = core2i7_first_cycle_multipass_init;
24913 targetm.sched.first_cycle_multipass_begin
24914 = core2i7_first_cycle_multipass_begin;
24915 targetm.sched.first_cycle_multipass_issue
24916 = core2i7_first_cycle_multipass_issue;
24917 targetm.sched.first_cycle_multipass_backtrack
24918 = core2i7_first_cycle_multipass_backtrack;
24919 targetm.sched.first_cycle_multipass_end
24920 = core2i7_first_cycle_multipass_end;
24921 targetm.sched.first_cycle_multipass_fini
24922 = core2i7_first_cycle_multipass_fini;
24924 /* Set decoder parameters. */
24925 core2i7_secondary_decoder_max_insn_size = 8;
24926 core2i7_ifetch_block_size = 16;
24927 core2i7_ifetch_block_max_insns = 6;
24928 break;
24930 /* ... Fall through ... */
24931 default:
24932 targetm.sched.dfa_post_advance_cycle = NULL;
24933 targetm.sched.first_cycle_multipass_init = NULL;
24934 targetm.sched.first_cycle_multipass_begin = NULL;
24935 targetm.sched.first_cycle_multipass_issue = NULL;
24936 targetm.sched.first_cycle_multipass_backtrack = NULL;
24937 targetm.sched.first_cycle_multipass_end = NULL;
24938 targetm.sched.first_cycle_multipass_fini = NULL;
24939 break;
24944 /* Compute the alignment given to a constant that is being placed in memory.
24945 EXP is the constant and ALIGN is the alignment that the object would
24946 ordinarily have.
24947 The value of this function is used instead of that alignment to align
24948 the object. */
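/* Worked example (illustrative, derived from the checks below): a DFmode
   constant such as 1.0 that would ordinarily get 32-bit alignment is
   returned with 64-bit alignment, and a 128-bit vector constant with
   128-bit alignment, so SSE loads from the constant pool stay aligned.
   String constants with TREE_STRING_LENGTH of at least 31 are bumped to
   BITS_PER_WORD unless optimizing for size.  */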
24950 int
24951 ix86_constant_alignment (tree exp, int align)
24953 if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
24954 || TREE_CODE (exp) == INTEGER_CST)
24956 if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
24957 return 64;
24958 else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
24959 return 128;
24961 else if (!optimize_size && TREE_CODE (exp) == STRING_CST
24962 && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
24963 return BITS_PER_WORD;
24965 return align;
24968 /* Compute the alignment for a static variable.
24969 TYPE is the data type, and ALIGN is the alignment that
24970 the object would ordinarily have. The value of this function is used
24971 instead of that alignment to align the object. */
24973 int
24974 ix86_data_alignment (tree type, int align)
24976 int max_align
24977 = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
24979 if (AGGREGATE_TYPE_P (type)
24980 && TYPE_SIZE (type)
24981 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24982 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
24983 || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
24984 && align < max_align)
24985 align = max_align;
24987 /* The x86-64 ABI requires arrays larger than 16 bytes to be aligned
24988 to a 16-byte boundary. */
24989 if (TARGET_64BIT)
24991 if (AGGREGATE_TYPE_P (type)
24992 && TYPE_SIZE (type)
24993 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
24994 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
24995 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
24996 return 128;
24999 if (TREE_CODE (type) == ARRAY_TYPE)
25001 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25002 return 64;
25003 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25004 return 128;
25006 else if (TREE_CODE (type) == COMPLEX_TYPE)
25009 if (TYPE_MODE (type) == DCmode && align < 64)
25010 return 64;
25011 if ((TYPE_MODE (type) == XCmode
25012 || TYPE_MODE (type) == TCmode) && align < 128)
25013 return 128;
25015 else if ((TREE_CODE (type) == RECORD_TYPE
25016 || TREE_CODE (type) == UNION_TYPE
25017 || TREE_CODE (type) == QUAL_UNION_TYPE)
25018 && TYPE_FIELDS (type))
25020 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25021 return 64;
25022 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25023 return 128;
25025 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25026 || TREE_CODE (type) == INTEGER_TYPE)
25028 if (TYPE_MODE (type) == DFmode && align < 64)
25029 return 64;
25030 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25031 return 128;
25034 return align;
25037 /* Compute the alignment for a local variable or a stack slot. EXP is
25038 the data type or decl itself, MODE is the widest mode available and
25039 ALIGN is the alignment that the object would ordinarily have. The
25040 value of this macro is used instead of that alignment to align the
25041 object. */
25043 unsigned int
25044 ix86_local_alignment (tree exp, enum machine_mode mode,
25045 unsigned int align)
25047 tree type, decl;
25049 if (exp && DECL_P (exp))
25051 type = TREE_TYPE (exp);
25052 decl = exp;
25054 else
25056 type = exp;
25057 decl = NULL;
25060 /* Don't do dynamic stack realignment for long long objects with
25061 -mpreferred-stack-boundary=2. */
25062 if (!TARGET_64BIT
25063 && align == 64
25064 && ix86_preferred_stack_boundary < 64
25065 && (mode == DImode || (type && TYPE_MODE (type) == DImode))
25066 && (!type || !TYPE_USER_ALIGN (type))
25067 && (!decl || !DECL_USER_ALIGN (decl)))
25068 align = 32;
25070 /* If TYPE is NULL, we are allocating a stack slot for a caller-save
25071 register in MODE. We will return the largest alignment of XF
25072 and DF. */
25073 if (!type)
25075 if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
25076 align = GET_MODE_ALIGNMENT (DFmode);
25077 return align;
25080 /* The x86-64 ABI requires arrays larger than 16 bytes to be aligned
25081 to a 16-byte boundary. The exact wording is:
25083 An array uses the same alignment as its elements, except that a local or
25084 global array variable of length at least 16 bytes or
25085 a C99 variable-length array variable always has alignment of at least 16 bytes.
25087 This was added to allow use of aligned SSE instructions on arrays. The
25088 rule is meant for static storage (where the compiler cannot do the analysis
25089 by itself). We follow it for automatic variables only when convenient.
25090 We fully control everything in the function being compiled, and functions
25091 from other units cannot rely on the alignment.
25093 Exclude the va_list type. It is the common case of a local array where
25094 we cannot benefit from the alignment. */
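/* Illustrative example (an editor's note; the code below is authoritative):
   in a 64-bit function compiled for speed with SSE enabled, a local
   aggregate such as "char buf[32]" is given 128-bit stack alignment so it
   can be accessed with aligned SSE moves, while a local va_list object is
   excluded even though it is an aggregate, since raising its alignment
   would not help.  */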
25095 if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
25096 && TARGET_SSE)
25098 if (AGGREGATE_TYPE_P (type)
25099 && (va_list_type_node == NULL_TREE
25100 || (TYPE_MAIN_VARIANT (type)
25101 != TYPE_MAIN_VARIANT (va_list_type_node)))
25102 && TYPE_SIZE (type)
25103 && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
25104 && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
25105 || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
25106 return 128;
25108 if (TREE_CODE (type) == ARRAY_TYPE)
25110 if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
25111 return 64;
25112 if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
25113 return 128;
25115 else if (TREE_CODE (type) == COMPLEX_TYPE)
25117 if (TYPE_MODE (type) == DCmode && align < 64)
25118 return 64;
25119 if ((TYPE_MODE (type) == XCmode
25120 || TYPE_MODE (type) == TCmode) && align < 128)
25121 return 128;
25123 else if ((TREE_CODE (type) == RECORD_TYPE
25124 || TREE_CODE (type) == UNION_TYPE
25125 || TREE_CODE (type) == QUAL_UNION_TYPE)
25126 && TYPE_FIELDS (type))
25128 if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
25129 return 64;
25130 if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
25131 return 128;
25133 else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
25134 || TREE_CODE (type) == INTEGER_TYPE)
25137 if (TYPE_MODE (type) == DFmode && align < 64)
25138 return 64;
25139 if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
25140 return 128;
25142 return align;
25145 /* Compute the minimum required alignment for dynamic stack realignment
25146 purposes for a local variable, parameter or a stack slot. EXP is
25147 the data type or decl itself, MODE is its mode and ALIGN is the
25148 alignment that the object would ordinarily have. */
25150 unsigned int
25151 ix86_minimum_alignment (tree exp, enum machine_mode mode,
25152 unsigned int align)
25154 tree type, decl;
25156 if (exp && DECL_P (exp))
25158 type = TREE_TYPE (exp);
25159 decl = exp;
25161 else
25163 type = exp;
25164 decl = NULL;
25167 if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
25168 return align;
25170 /* Don't do dynamic stack realignment for long long objects with
25171 -mpreferred-stack-boundary=2. */
25172 if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
25173 && (!type || !TYPE_USER_ALIGN (type))
25174 && (!decl || !DECL_USER_ALIGN (decl)))
25175 return 32;
25177 return align;
25180 /* Find a location for the static chain incoming to a nested function.
25181 This is a register, unless all free registers are used by arguments. */
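/* Recap of the choices made below (illustrative summary; the code is
   authoritative): 64-bit targets always use R10.  32-bit targets default to
   ECX; fastcall and thiscall functions, whose arguments occupy ECX (and EDX
   for fastcall), use EAX instead; and for regparm(3) functions, where no
   call-clobbered register is free, the chain is passed in ESI to an
   alternate entry point that pushes it, so the nested function reads it
   from the stack.  */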
25183 static rtx
25184 ix86_static_chain (const_tree fndecl, bool incoming_p)
25186 unsigned regno;
25188 if (!DECL_STATIC_CHAIN (fndecl))
25189 return NULL;
25191 if (TARGET_64BIT)
25193 /* We always use R10 in 64-bit mode. */
25194 regno = R10_REG;
25196 else
25198 tree fntype;
25199 unsigned int ccvt;
25201 /* By default in 32-bit mode we use ECX to pass the static chain. */
25202 regno = CX_REG;
25204 fntype = TREE_TYPE (fndecl);
25205 ccvt = ix86_get_callcvt (fntype);
25206 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
25208 /* Fastcall functions use ecx/edx for arguments, which leaves
25209 us with EAX for the static chain.
25210 Thiscall functions use ecx for arguments, which also
25211 leaves us with EAX for the static chain. */
25212 regno = AX_REG;
25214 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
25216 /* Thiscall functions use ecx for arguments, which leaves
25217 us with EAX and EDX for the static chain.
25218 For ABI compatibility we use EAX. */
25219 regno = AX_REG;
25221 else if (ix86_function_regparm (fntype, fndecl) == 3)
25223 /* For regparm 3, we have no free call-clobbered registers in
25224 which to store the static chain. In order to implement this,
25225 we have the trampoline push the static chain to the stack.
25226 However, we can't push a value below the return address when
25227 we call the nested function directly, so we have to use an
25228 alternate entry point. For this we use ESI, and have the
25229 alternate entry point push ESI, so that things appear the
25230 same once we're executing the nested function. */
25231 if (incoming_p)
25233 if (fndecl == current_function_decl)
25234 ix86_static_chain_on_stack = true;
25235 return gen_frame_mem (SImode,
25236 plus_constant (Pmode,
25237 arg_pointer_rtx, -8));
25239 regno = SI_REG;
25243 return gen_rtx_REG (Pmode, regno);
25246 /* Emit RTL insns to initialize the variable parts of a trampoline.
25247 FNDECL is the decl of the target address; M_TRAMP is a MEM for
25248 the trampoline, and CHAIN_VALUE is an RTX for the static chain
25249 to be passed to the target function. */
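/* For reference, a sketch of the bytes emitted below for the common 64-bit
   case with a full 64-bit function address (illustrative, derived from the
   HImode/DImode/SImode stores in the function body):

     49 bb <8-byte fnaddr>    movabs $fnaddr, %r11
     49 ba <8-byte chain>     movabs $chain,  %r10
     49 ff e3                 jmp    *%r11
     90                       nop  (pads the final 32-bit store)

   When the address or chain fits in 32 bits, or when ptr_mode == SImode,
   the shorter "41 bb" / "41 ba" movl forms are used instead.  */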
25251 static void
25252 ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
25254 rtx mem, fnaddr;
25255 int opcode;
25256 int offset = 0;
25258 fnaddr = XEXP (DECL_RTL (fndecl), 0);
25260 if (TARGET_64BIT)
25262 int size;
25264 /* Load the function address into r11. Try to load the address using
25265 the shorter movl instead of movabs. We may want to support
25266 movq for kernel mode, but the kernel does not use trampolines at
25267 the moment. FNADDR is a 32-bit address and may not be in
25268 DImode when ptr_mode == SImode. Always use movl in this
25269 case. */
25270 if (ptr_mode == SImode
25271 || x86_64_zext_immediate_operand (fnaddr, VOIDmode))
25273 fnaddr = copy_addr_to_reg (fnaddr);
25275 mem = adjust_address (m_tramp, HImode, offset);
25276 emit_move_insn (mem, gen_int_mode (0xbb41, HImode));
25278 mem = adjust_address (m_tramp, SImode, offset + 2);
25279 emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
25280 offset += 6;
25282 else
25284 mem = adjust_address (m_tramp, HImode, offset);
25285 emit_move_insn (mem, gen_int_mode (0xbb49, HImode));
25287 mem = adjust_address (m_tramp, DImode, offset + 2);
25288 emit_move_insn (mem, fnaddr);
25289 offset += 10;
25292 /* Load the static chain into r10 using movabs. Use the shorter movl
25293 instead of movabs when ptr_mode == SImode. */
25294 if (ptr_mode == SImode)
25296 opcode = 0xba41;
25297 size = 6;
25299 else
25301 opcode = 0xba49;
25302 size = 10;
25305 mem = adjust_address (m_tramp, HImode, offset);
25306 emit_move_insn (mem, gen_int_mode (opcode, HImode));
25308 mem = adjust_address (m_tramp, ptr_mode, offset + 2);
25309 emit_move_insn (mem, chain_value);
25310 offset += size;
25312 /* Jump to r11; the last (unused) byte is a nop, only there to
25313 pad the write out to a single 32-bit store. */
25314 mem = adjust_address (m_tramp, SImode, offset);
25315 emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
25316 offset += 4;
25318 else
25320 rtx disp, chain;
25322 /* Depending on the static chain location, either load a register
25323 with a constant, or push the constant to the stack. All of the
25324 instructions are the same size. */
25325 chain = ix86_static_chain (fndecl, true);
25326 if (REG_P (chain))
25328 switch (REGNO (chain))
25330 case AX_REG:
25331 opcode = 0xb8; break;
25332 case CX_REG:
25333 opcode = 0xb9; break;
25334 default:
25335 gcc_unreachable ();
25338 else
25339 opcode = 0x68;
25341 mem = adjust_address (m_tramp, QImode, offset);
25342 emit_move_insn (mem, gen_int_mode (opcode, QImode));
25344 mem = adjust_address (m_tramp, SImode, offset + 1);
25345 emit_move_insn (mem, chain_value);
25346 offset += 5;
25348 mem = adjust_address (m_tramp, QImode, offset);
25349 emit_move_insn (mem, gen_int_mode (0xe9, QImode));
25351 mem = adjust_address (m_tramp, SImode, offset + 1);
25353 /* Compute the offset from the end of the jmp to the target function.
25354 In the case in which the trampoline stores the static chain on
25355 the stack, we need to skip the first insn, which pushes the
25356 (call-saved) static chain register; this push is 1 byte. */
25357 offset += 5;
25358 disp = expand_binop (SImode, sub_optab, fnaddr,
25359 plus_constant (Pmode, XEXP (m_tramp, 0),
25360 offset - (MEM_P (chain) ? 1 : 0)),
25361 NULL_RTX, 1, OPTAB_DIRECT);
25362 emit_move_insn (mem, disp);
25365 gcc_assert (offset <= TRAMPOLINE_SIZE);
25367 #ifdef HAVE_ENABLE_EXECUTE_STACK
25368 #ifdef CHECK_EXECUTE_STACK_ENABLED
25369 if (CHECK_EXECUTE_STACK_ENABLED)
25370 #endif
25371 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
25372 LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
25373 #endif
25376 /* The following file contains several enumerations and data structures
25377 built from the definitions in i386-builtin-types.def. */
25379 #include "i386-builtin-types.inc"
25381 /* Table for the ix86 builtin non-function types. */
25382 static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];
25384 /* Retrieve an element from the above table, building some of
25385 the types lazily. */
25387 static tree
25388 ix86_get_builtin_type (enum ix86_builtin_type tcode)
25390 unsigned int index;
25391 tree type, itype;
25393 gcc_assert ((unsigned)tcode < ARRAY_SIZE(ix86_builtin_type_tab));
25395 type = ix86_builtin_type_tab[(int) tcode];
25396 if (type != NULL)
25397 return type;
25399 gcc_assert (tcode > IX86_BT_LAST_PRIM);
25400 if (tcode <= IX86_BT_LAST_VECT)
25402 enum machine_mode mode;
25404 index = tcode - IX86_BT_LAST_PRIM - 1;
25405 itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
25406 mode = ix86_builtin_type_vect_mode[index];
25408 type = build_vector_type_for_mode (itype, mode);
25410 else
25412 int quals;
25414 index = tcode - IX86_BT_LAST_VECT - 1;
25415 if (tcode <= IX86_BT_LAST_PTR)
25416 quals = TYPE_UNQUALIFIED;
25417 else
25418 quals = TYPE_QUAL_CONST;
25420 itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
25421 if (quals != TYPE_UNQUALIFIED)
25422 itype = build_qualified_type (itype, quals);
25424 type = build_pointer_type (itype);
25427 ix86_builtin_type_tab[(int) tcode] = type;
25428 return type;
25431 /* Table for the ix86 builtin function types. */
25432 static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];
25434 /* Retrieve an element from the above table, building some of
25435 the types lazily. */
25437 static tree
25438 ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
25440 tree type;
25442 gcc_assert ((unsigned)tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));
25444 type = ix86_builtin_func_type_tab[(int) tcode];
25445 if (type != NULL)
25446 return type;
25448 if (tcode <= IX86_BT_LAST_FUNC)
25450 unsigned start = ix86_builtin_func_start[(int) tcode];
25451 unsigned after = ix86_builtin_func_start[(int) tcode + 1];
25452 tree rtype, atype, args = void_list_node;
25453 unsigned i;
25455 rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
25456 for (i = after - 1; i > start; --i)
25458 atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
25459 args = tree_cons (NULL, atype, args);
25462 type = build_function_type (rtype, args);
25464 else
25466 unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
25467 enum ix86_builtin_func_type icode;
25469 icode = ix86_builtin_func_alias_base[index];
25470 type = ix86_get_builtin_func_type (icode);
25473 ix86_builtin_func_type_tab[(int) tcode] = type;
25474 return type;
25478 /* Codes for all the SSE/MMX builtins. */
25479 enum ix86_builtins
25481 IX86_BUILTIN_ADDPS,
25482 IX86_BUILTIN_ADDSS,
25483 IX86_BUILTIN_DIVPS,
25484 IX86_BUILTIN_DIVSS,
25485 IX86_BUILTIN_MULPS,
25486 IX86_BUILTIN_MULSS,
25487 IX86_BUILTIN_SUBPS,
25488 IX86_BUILTIN_SUBSS,
25490 IX86_BUILTIN_CMPEQPS,
25491 IX86_BUILTIN_CMPLTPS,
25492 IX86_BUILTIN_CMPLEPS,
25493 IX86_BUILTIN_CMPGTPS,
25494 IX86_BUILTIN_CMPGEPS,
25495 IX86_BUILTIN_CMPNEQPS,
25496 IX86_BUILTIN_CMPNLTPS,
25497 IX86_BUILTIN_CMPNLEPS,
25498 IX86_BUILTIN_CMPNGTPS,
25499 IX86_BUILTIN_CMPNGEPS,
25500 IX86_BUILTIN_CMPORDPS,
25501 IX86_BUILTIN_CMPUNORDPS,
25502 IX86_BUILTIN_CMPEQSS,
25503 IX86_BUILTIN_CMPLTSS,
25504 IX86_BUILTIN_CMPLESS,
25505 IX86_BUILTIN_CMPNEQSS,
25506 IX86_BUILTIN_CMPNLTSS,
25507 IX86_BUILTIN_CMPNLESS,
25508 IX86_BUILTIN_CMPNGTSS,
25509 IX86_BUILTIN_CMPNGESS,
25510 IX86_BUILTIN_CMPORDSS,
25511 IX86_BUILTIN_CMPUNORDSS,
25513 IX86_BUILTIN_COMIEQSS,
25514 IX86_BUILTIN_COMILTSS,
25515 IX86_BUILTIN_COMILESS,
25516 IX86_BUILTIN_COMIGTSS,
25517 IX86_BUILTIN_COMIGESS,
25518 IX86_BUILTIN_COMINEQSS,
25519 IX86_BUILTIN_UCOMIEQSS,
25520 IX86_BUILTIN_UCOMILTSS,
25521 IX86_BUILTIN_UCOMILESS,
25522 IX86_BUILTIN_UCOMIGTSS,
25523 IX86_BUILTIN_UCOMIGESS,
25524 IX86_BUILTIN_UCOMINEQSS,
25526 IX86_BUILTIN_CVTPI2PS,
25527 IX86_BUILTIN_CVTPS2PI,
25528 IX86_BUILTIN_CVTSI2SS,
25529 IX86_BUILTIN_CVTSI642SS,
25530 IX86_BUILTIN_CVTSS2SI,
25531 IX86_BUILTIN_CVTSS2SI64,
25532 IX86_BUILTIN_CVTTPS2PI,
25533 IX86_BUILTIN_CVTTSS2SI,
25534 IX86_BUILTIN_CVTTSS2SI64,
25536 IX86_BUILTIN_MAXPS,
25537 IX86_BUILTIN_MAXSS,
25538 IX86_BUILTIN_MINPS,
25539 IX86_BUILTIN_MINSS,
25541 IX86_BUILTIN_LOADUPS,
25542 IX86_BUILTIN_STOREUPS,
25543 IX86_BUILTIN_MOVSS,
25545 IX86_BUILTIN_MOVHLPS,
25546 IX86_BUILTIN_MOVLHPS,
25547 IX86_BUILTIN_LOADHPS,
25548 IX86_BUILTIN_LOADLPS,
25549 IX86_BUILTIN_STOREHPS,
25550 IX86_BUILTIN_STORELPS,
25552 IX86_BUILTIN_MASKMOVQ,
25553 IX86_BUILTIN_MOVMSKPS,
25554 IX86_BUILTIN_PMOVMSKB,
25556 IX86_BUILTIN_MOVNTPS,
25557 IX86_BUILTIN_MOVNTQ,
25559 IX86_BUILTIN_LOADDQU,
25560 IX86_BUILTIN_STOREDQU,
25562 IX86_BUILTIN_PACKSSWB,
25563 IX86_BUILTIN_PACKSSDW,
25564 IX86_BUILTIN_PACKUSWB,
25566 IX86_BUILTIN_PADDB,
25567 IX86_BUILTIN_PADDW,
25568 IX86_BUILTIN_PADDD,
25569 IX86_BUILTIN_PADDQ,
25570 IX86_BUILTIN_PADDSB,
25571 IX86_BUILTIN_PADDSW,
25572 IX86_BUILTIN_PADDUSB,
25573 IX86_BUILTIN_PADDUSW,
25574 IX86_BUILTIN_PSUBB,
25575 IX86_BUILTIN_PSUBW,
25576 IX86_BUILTIN_PSUBD,
25577 IX86_BUILTIN_PSUBQ,
25578 IX86_BUILTIN_PSUBSB,
25579 IX86_BUILTIN_PSUBSW,
25580 IX86_BUILTIN_PSUBUSB,
25581 IX86_BUILTIN_PSUBUSW,
25583 IX86_BUILTIN_PAND,
25584 IX86_BUILTIN_PANDN,
25585 IX86_BUILTIN_POR,
25586 IX86_BUILTIN_PXOR,
25588 IX86_BUILTIN_PAVGB,
25589 IX86_BUILTIN_PAVGW,
25591 IX86_BUILTIN_PCMPEQB,
25592 IX86_BUILTIN_PCMPEQW,
25593 IX86_BUILTIN_PCMPEQD,
25594 IX86_BUILTIN_PCMPGTB,
25595 IX86_BUILTIN_PCMPGTW,
25596 IX86_BUILTIN_PCMPGTD,
25598 IX86_BUILTIN_PMADDWD,
25600 IX86_BUILTIN_PMAXSW,
25601 IX86_BUILTIN_PMAXUB,
25602 IX86_BUILTIN_PMINSW,
25603 IX86_BUILTIN_PMINUB,
25605 IX86_BUILTIN_PMULHUW,
25606 IX86_BUILTIN_PMULHW,
25607 IX86_BUILTIN_PMULLW,
25609 IX86_BUILTIN_PSADBW,
25610 IX86_BUILTIN_PSHUFW,
25612 IX86_BUILTIN_PSLLW,
25613 IX86_BUILTIN_PSLLD,
25614 IX86_BUILTIN_PSLLQ,
25615 IX86_BUILTIN_PSRAW,
25616 IX86_BUILTIN_PSRAD,
25617 IX86_BUILTIN_PSRLW,
25618 IX86_BUILTIN_PSRLD,
25619 IX86_BUILTIN_PSRLQ,
25620 IX86_BUILTIN_PSLLWI,
25621 IX86_BUILTIN_PSLLDI,
25622 IX86_BUILTIN_PSLLQI,
25623 IX86_BUILTIN_PSRAWI,
25624 IX86_BUILTIN_PSRADI,
25625 IX86_BUILTIN_PSRLWI,
25626 IX86_BUILTIN_PSRLDI,
25627 IX86_BUILTIN_PSRLQI,
25629 IX86_BUILTIN_PUNPCKHBW,
25630 IX86_BUILTIN_PUNPCKHWD,
25631 IX86_BUILTIN_PUNPCKHDQ,
25632 IX86_BUILTIN_PUNPCKLBW,
25633 IX86_BUILTIN_PUNPCKLWD,
25634 IX86_BUILTIN_PUNPCKLDQ,
25636 IX86_BUILTIN_SHUFPS,
25638 IX86_BUILTIN_RCPPS,
25639 IX86_BUILTIN_RCPSS,
25640 IX86_BUILTIN_RSQRTPS,
25641 IX86_BUILTIN_RSQRTPS_NR,
25642 IX86_BUILTIN_RSQRTSS,
25643 IX86_BUILTIN_RSQRTF,
25644 IX86_BUILTIN_SQRTPS,
25645 IX86_BUILTIN_SQRTPS_NR,
25646 IX86_BUILTIN_SQRTSS,
25648 IX86_BUILTIN_UNPCKHPS,
25649 IX86_BUILTIN_UNPCKLPS,
25651 IX86_BUILTIN_ANDPS,
25652 IX86_BUILTIN_ANDNPS,
25653 IX86_BUILTIN_ORPS,
25654 IX86_BUILTIN_XORPS,
25656 IX86_BUILTIN_EMMS,
25657 IX86_BUILTIN_LDMXCSR,
25658 IX86_BUILTIN_STMXCSR,
25659 IX86_BUILTIN_SFENCE,
25661 IX86_BUILTIN_FXSAVE,
25662 IX86_BUILTIN_FXRSTOR,
25663 IX86_BUILTIN_FXSAVE64,
25664 IX86_BUILTIN_FXRSTOR64,
25666 IX86_BUILTIN_XSAVE,
25667 IX86_BUILTIN_XRSTOR,
25668 IX86_BUILTIN_XSAVE64,
25669 IX86_BUILTIN_XRSTOR64,
25671 IX86_BUILTIN_XSAVEOPT,
25672 IX86_BUILTIN_XSAVEOPT64,
25674 /* 3DNow! Original */
25675 IX86_BUILTIN_FEMMS,
25676 IX86_BUILTIN_PAVGUSB,
25677 IX86_BUILTIN_PF2ID,
25678 IX86_BUILTIN_PFACC,
25679 IX86_BUILTIN_PFADD,
25680 IX86_BUILTIN_PFCMPEQ,
25681 IX86_BUILTIN_PFCMPGE,
25682 IX86_BUILTIN_PFCMPGT,
25683 IX86_BUILTIN_PFMAX,
25684 IX86_BUILTIN_PFMIN,
25685 IX86_BUILTIN_PFMUL,
25686 IX86_BUILTIN_PFRCP,
25687 IX86_BUILTIN_PFRCPIT1,
25688 IX86_BUILTIN_PFRCPIT2,
25689 IX86_BUILTIN_PFRSQIT1,
25690 IX86_BUILTIN_PFRSQRT,
25691 IX86_BUILTIN_PFSUB,
25692 IX86_BUILTIN_PFSUBR,
25693 IX86_BUILTIN_PI2FD,
25694 IX86_BUILTIN_PMULHRW,
25696 /* 3DNow! Athlon Extensions */
25697 IX86_BUILTIN_PF2IW,
25698 IX86_BUILTIN_PFNACC,
25699 IX86_BUILTIN_PFPNACC,
25700 IX86_BUILTIN_PI2FW,
25701 IX86_BUILTIN_PSWAPDSI,
25702 IX86_BUILTIN_PSWAPDSF,
25704 /* SSE2 */
25705 IX86_BUILTIN_ADDPD,
25706 IX86_BUILTIN_ADDSD,
25707 IX86_BUILTIN_DIVPD,
25708 IX86_BUILTIN_DIVSD,
25709 IX86_BUILTIN_MULPD,
25710 IX86_BUILTIN_MULSD,
25711 IX86_BUILTIN_SUBPD,
25712 IX86_BUILTIN_SUBSD,
25714 IX86_BUILTIN_CMPEQPD,
25715 IX86_BUILTIN_CMPLTPD,
25716 IX86_BUILTIN_CMPLEPD,
25717 IX86_BUILTIN_CMPGTPD,
25718 IX86_BUILTIN_CMPGEPD,
25719 IX86_BUILTIN_CMPNEQPD,
25720 IX86_BUILTIN_CMPNLTPD,
25721 IX86_BUILTIN_CMPNLEPD,
25722 IX86_BUILTIN_CMPNGTPD,
25723 IX86_BUILTIN_CMPNGEPD,
25724 IX86_BUILTIN_CMPORDPD,
25725 IX86_BUILTIN_CMPUNORDPD,
25726 IX86_BUILTIN_CMPEQSD,
25727 IX86_BUILTIN_CMPLTSD,
25728 IX86_BUILTIN_CMPLESD,
25729 IX86_BUILTIN_CMPNEQSD,
25730 IX86_BUILTIN_CMPNLTSD,
25731 IX86_BUILTIN_CMPNLESD,
25732 IX86_BUILTIN_CMPORDSD,
25733 IX86_BUILTIN_CMPUNORDSD,
25735 IX86_BUILTIN_COMIEQSD,
25736 IX86_BUILTIN_COMILTSD,
25737 IX86_BUILTIN_COMILESD,
25738 IX86_BUILTIN_COMIGTSD,
25739 IX86_BUILTIN_COMIGESD,
25740 IX86_BUILTIN_COMINEQSD,
25741 IX86_BUILTIN_UCOMIEQSD,
25742 IX86_BUILTIN_UCOMILTSD,
25743 IX86_BUILTIN_UCOMILESD,
25744 IX86_BUILTIN_UCOMIGTSD,
25745 IX86_BUILTIN_UCOMIGESD,
25746 IX86_BUILTIN_UCOMINEQSD,
25748 IX86_BUILTIN_MAXPD,
25749 IX86_BUILTIN_MAXSD,
25750 IX86_BUILTIN_MINPD,
25751 IX86_BUILTIN_MINSD,
25753 IX86_BUILTIN_ANDPD,
25754 IX86_BUILTIN_ANDNPD,
25755 IX86_BUILTIN_ORPD,
25756 IX86_BUILTIN_XORPD,
25758 IX86_BUILTIN_SQRTPD,
25759 IX86_BUILTIN_SQRTSD,
25761 IX86_BUILTIN_UNPCKHPD,
25762 IX86_BUILTIN_UNPCKLPD,
25764 IX86_BUILTIN_SHUFPD,
25766 IX86_BUILTIN_LOADUPD,
25767 IX86_BUILTIN_STOREUPD,
25768 IX86_BUILTIN_MOVSD,
25770 IX86_BUILTIN_LOADHPD,
25771 IX86_BUILTIN_LOADLPD,
25773 IX86_BUILTIN_CVTDQ2PD,
25774 IX86_BUILTIN_CVTDQ2PS,
25776 IX86_BUILTIN_CVTPD2DQ,
25777 IX86_BUILTIN_CVTPD2PI,
25778 IX86_BUILTIN_CVTPD2PS,
25779 IX86_BUILTIN_CVTTPD2DQ,
25780 IX86_BUILTIN_CVTTPD2PI,
25782 IX86_BUILTIN_CVTPI2PD,
25783 IX86_BUILTIN_CVTSI2SD,
25784 IX86_BUILTIN_CVTSI642SD,
25786 IX86_BUILTIN_CVTSD2SI,
25787 IX86_BUILTIN_CVTSD2SI64,
25788 IX86_BUILTIN_CVTSD2SS,
25789 IX86_BUILTIN_CVTSS2SD,
25790 IX86_BUILTIN_CVTTSD2SI,
25791 IX86_BUILTIN_CVTTSD2SI64,
25793 IX86_BUILTIN_CVTPS2DQ,
25794 IX86_BUILTIN_CVTPS2PD,
25795 IX86_BUILTIN_CVTTPS2DQ,
25797 IX86_BUILTIN_MOVNTI,
25798 IX86_BUILTIN_MOVNTI64,
25799 IX86_BUILTIN_MOVNTPD,
25800 IX86_BUILTIN_MOVNTDQ,
25802 IX86_BUILTIN_MOVQ128,
25804 /* SSE2 MMX */
25805 IX86_BUILTIN_MASKMOVDQU,
25806 IX86_BUILTIN_MOVMSKPD,
25807 IX86_BUILTIN_PMOVMSKB128,
25809 IX86_BUILTIN_PACKSSWB128,
25810 IX86_BUILTIN_PACKSSDW128,
25811 IX86_BUILTIN_PACKUSWB128,
25813 IX86_BUILTIN_PADDB128,
25814 IX86_BUILTIN_PADDW128,
25815 IX86_BUILTIN_PADDD128,
25816 IX86_BUILTIN_PADDQ128,
25817 IX86_BUILTIN_PADDSB128,
25818 IX86_BUILTIN_PADDSW128,
25819 IX86_BUILTIN_PADDUSB128,
25820 IX86_BUILTIN_PADDUSW128,
25821 IX86_BUILTIN_PSUBB128,
25822 IX86_BUILTIN_PSUBW128,
25823 IX86_BUILTIN_PSUBD128,
25824 IX86_BUILTIN_PSUBQ128,
25825 IX86_BUILTIN_PSUBSB128,
25826 IX86_BUILTIN_PSUBSW128,
25827 IX86_BUILTIN_PSUBUSB128,
25828 IX86_BUILTIN_PSUBUSW128,
25830 IX86_BUILTIN_PAND128,
25831 IX86_BUILTIN_PANDN128,
25832 IX86_BUILTIN_POR128,
25833 IX86_BUILTIN_PXOR128,
25835 IX86_BUILTIN_PAVGB128,
25836 IX86_BUILTIN_PAVGW128,
25838 IX86_BUILTIN_PCMPEQB128,
25839 IX86_BUILTIN_PCMPEQW128,
25840 IX86_BUILTIN_PCMPEQD128,
25841 IX86_BUILTIN_PCMPGTB128,
25842 IX86_BUILTIN_PCMPGTW128,
25843 IX86_BUILTIN_PCMPGTD128,
25845 IX86_BUILTIN_PMADDWD128,
25847 IX86_BUILTIN_PMAXSW128,
25848 IX86_BUILTIN_PMAXUB128,
25849 IX86_BUILTIN_PMINSW128,
25850 IX86_BUILTIN_PMINUB128,
25852 IX86_BUILTIN_PMULUDQ,
25853 IX86_BUILTIN_PMULUDQ128,
25854 IX86_BUILTIN_PMULHUW128,
25855 IX86_BUILTIN_PMULHW128,
25856 IX86_BUILTIN_PMULLW128,
25858 IX86_BUILTIN_PSADBW128,
25859 IX86_BUILTIN_PSHUFHW,
25860 IX86_BUILTIN_PSHUFLW,
25861 IX86_BUILTIN_PSHUFD,
25863 IX86_BUILTIN_PSLLDQI128,
25864 IX86_BUILTIN_PSLLWI128,
25865 IX86_BUILTIN_PSLLDI128,
25866 IX86_BUILTIN_PSLLQI128,
25867 IX86_BUILTIN_PSRAWI128,
25868 IX86_BUILTIN_PSRADI128,
25869 IX86_BUILTIN_PSRLDQI128,
25870 IX86_BUILTIN_PSRLWI128,
25871 IX86_BUILTIN_PSRLDI128,
25872 IX86_BUILTIN_PSRLQI128,
25874 IX86_BUILTIN_PSLLDQ128,
25875 IX86_BUILTIN_PSLLW128,
25876 IX86_BUILTIN_PSLLD128,
25877 IX86_BUILTIN_PSLLQ128,
25878 IX86_BUILTIN_PSRAW128,
25879 IX86_BUILTIN_PSRAD128,
25880 IX86_BUILTIN_PSRLW128,
25881 IX86_BUILTIN_PSRLD128,
25882 IX86_BUILTIN_PSRLQ128,
25884 IX86_BUILTIN_PUNPCKHBW128,
25885 IX86_BUILTIN_PUNPCKHWD128,
25886 IX86_BUILTIN_PUNPCKHDQ128,
25887 IX86_BUILTIN_PUNPCKHQDQ128,
25888 IX86_BUILTIN_PUNPCKLBW128,
25889 IX86_BUILTIN_PUNPCKLWD128,
25890 IX86_BUILTIN_PUNPCKLDQ128,
25891 IX86_BUILTIN_PUNPCKLQDQ128,
25893 IX86_BUILTIN_CLFLUSH,
25894 IX86_BUILTIN_MFENCE,
25895 IX86_BUILTIN_LFENCE,
25896 IX86_BUILTIN_PAUSE,
25898 IX86_BUILTIN_BSRSI,
25899 IX86_BUILTIN_BSRDI,
25900 IX86_BUILTIN_RDPMC,
25901 IX86_BUILTIN_RDTSC,
25902 IX86_BUILTIN_RDTSCP,
25903 IX86_BUILTIN_ROLQI,
25904 IX86_BUILTIN_ROLHI,
25905 IX86_BUILTIN_RORQI,
25906 IX86_BUILTIN_RORHI,
25908 /* SSE3. */
25909 IX86_BUILTIN_ADDSUBPS,
25910 IX86_BUILTIN_HADDPS,
25911 IX86_BUILTIN_HSUBPS,
25912 IX86_BUILTIN_MOVSHDUP,
25913 IX86_BUILTIN_MOVSLDUP,
25914 IX86_BUILTIN_ADDSUBPD,
25915 IX86_BUILTIN_HADDPD,
25916 IX86_BUILTIN_HSUBPD,
25917 IX86_BUILTIN_LDDQU,
25919 IX86_BUILTIN_MONITOR,
25920 IX86_BUILTIN_MWAIT,
25922 /* SSSE3. */
25923 IX86_BUILTIN_PHADDW,
25924 IX86_BUILTIN_PHADDD,
25925 IX86_BUILTIN_PHADDSW,
25926 IX86_BUILTIN_PHSUBW,
25927 IX86_BUILTIN_PHSUBD,
25928 IX86_BUILTIN_PHSUBSW,
25929 IX86_BUILTIN_PMADDUBSW,
25930 IX86_BUILTIN_PMULHRSW,
25931 IX86_BUILTIN_PSHUFB,
25932 IX86_BUILTIN_PSIGNB,
25933 IX86_BUILTIN_PSIGNW,
25934 IX86_BUILTIN_PSIGND,
25935 IX86_BUILTIN_PALIGNR,
25936 IX86_BUILTIN_PABSB,
25937 IX86_BUILTIN_PABSW,
25938 IX86_BUILTIN_PABSD,
25940 IX86_BUILTIN_PHADDW128,
25941 IX86_BUILTIN_PHADDD128,
25942 IX86_BUILTIN_PHADDSW128,
25943 IX86_BUILTIN_PHSUBW128,
25944 IX86_BUILTIN_PHSUBD128,
25945 IX86_BUILTIN_PHSUBSW128,
25946 IX86_BUILTIN_PMADDUBSW128,
25947 IX86_BUILTIN_PMULHRSW128,
25948 IX86_BUILTIN_PSHUFB128,
25949 IX86_BUILTIN_PSIGNB128,
25950 IX86_BUILTIN_PSIGNW128,
25951 IX86_BUILTIN_PSIGND128,
25952 IX86_BUILTIN_PALIGNR128,
25953 IX86_BUILTIN_PABSB128,
25954 IX86_BUILTIN_PABSW128,
25955 IX86_BUILTIN_PABSD128,
25957 /* AMDFAM10 - SSE4A New Instructions. */
25958 IX86_BUILTIN_MOVNTSD,
25959 IX86_BUILTIN_MOVNTSS,
25960 IX86_BUILTIN_EXTRQI,
25961 IX86_BUILTIN_EXTRQ,
25962 IX86_BUILTIN_INSERTQI,
25963 IX86_BUILTIN_INSERTQ,
25965 /* SSE4.1. */
25966 IX86_BUILTIN_BLENDPD,
25967 IX86_BUILTIN_BLENDPS,
25968 IX86_BUILTIN_BLENDVPD,
25969 IX86_BUILTIN_BLENDVPS,
25970 IX86_BUILTIN_PBLENDVB128,
25971 IX86_BUILTIN_PBLENDW128,
25973 IX86_BUILTIN_DPPD,
25974 IX86_BUILTIN_DPPS,
25976 IX86_BUILTIN_INSERTPS128,
25978 IX86_BUILTIN_MOVNTDQA,
25979 IX86_BUILTIN_MPSADBW128,
25980 IX86_BUILTIN_PACKUSDW128,
25981 IX86_BUILTIN_PCMPEQQ,
25982 IX86_BUILTIN_PHMINPOSUW128,
25984 IX86_BUILTIN_PMAXSB128,
25985 IX86_BUILTIN_PMAXSD128,
25986 IX86_BUILTIN_PMAXUD128,
25987 IX86_BUILTIN_PMAXUW128,
25989 IX86_BUILTIN_PMINSB128,
25990 IX86_BUILTIN_PMINSD128,
25991 IX86_BUILTIN_PMINUD128,
25992 IX86_BUILTIN_PMINUW128,
25994 IX86_BUILTIN_PMOVSXBW128,
25995 IX86_BUILTIN_PMOVSXBD128,
25996 IX86_BUILTIN_PMOVSXBQ128,
25997 IX86_BUILTIN_PMOVSXWD128,
25998 IX86_BUILTIN_PMOVSXWQ128,
25999 IX86_BUILTIN_PMOVSXDQ128,
26001 IX86_BUILTIN_PMOVZXBW128,
26002 IX86_BUILTIN_PMOVZXBD128,
26003 IX86_BUILTIN_PMOVZXBQ128,
26004 IX86_BUILTIN_PMOVZXWD128,
26005 IX86_BUILTIN_PMOVZXWQ128,
26006 IX86_BUILTIN_PMOVZXDQ128,
26008 IX86_BUILTIN_PMULDQ128,
26009 IX86_BUILTIN_PMULLD128,
26011 IX86_BUILTIN_ROUNDSD,
26012 IX86_BUILTIN_ROUNDSS,
26014 IX86_BUILTIN_ROUNDPD,
26015 IX86_BUILTIN_ROUNDPS,
26017 IX86_BUILTIN_FLOORPD,
26018 IX86_BUILTIN_CEILPD,
26019 IX86_BUILTIN_TRUNCPD,
26020 IX86_BUILTIN_RINTPD,
26021 IX86_BUILTIN_ROUNDPD_AZ,
26023 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
26024 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
26025 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
26027 IX86_BUILTIN_FLOORPS,
26028 IX86_BUILTIN_CEILPS,
26029 IX86_BUILTIN_TRUNCPS,
26030 IX86_BUILTIN_RINTPS,
26031 IX86_BUILTIN_ROUNDPS_AZ,
26033 IX86_BUILTIN_FLOORPS_SFIX,
26034 IX86_BUILTIN_CEILPS_SFIX,
26035 IX86_BUILTIN_ROUNDPS_AZ_SFIX,
26037 IX86_BUILTIN_PTESTZ,
26038 IX86_BUILTIN_PTESTC,
26039 IX86_BUILTIN_PTESTNZC,
26041 IX86_BUILTIN_VEC_INIT_V2SI,
26042 IX86_BUILTIN_VEC_INIT_V4HI,
26043 IX86_BUILTIN_VEC_INIT_V8QI,
26044 IX86_BUILTIN_VEC_EXT_V2DF,
26045 IX86_BUILTIN_VEC_EXT_V2DI,
26046 IX86_BUILTIN_VEC_EXT_V4SF,
26047 IX86_BUILTIN_VEC_EXT_V4SI,
26048 IX86_BUILTIN_VEC_EXT_V8HI,
26049 IX86_BUILTIN_VEC_EXT_V2SI,
26050 IX86_BUILTIN_VEC_EXT_V4HI,
26051 IX86_BUILTIN_VEC_EXT_V16QI,
26052 IX86_BUILTIN_VEC_SET_V2DI,
26053 IX86_BUILTIN_VEC_SET_V4SF,
26054 IX86_BUILTIN_VEC_SET_V4SI,
26055 IX86_BUILTIN_VEC_SET_V8HI,
26056 IX86_BUILTIN_VEC_SET_V4HI,
26057 IX86_BUILTIN_VEC_SET_V16QI,
26059 IX86_BUILTIN_VEC_PACK_SFIX,
26060 IX86_BUILTIN_VEC_PACK_SFIX256,
26062 /* SSE4.2. */
26063 IX86_BUILTIN_CRC32QI,
26064 IX86_BUILTIN_CRC32HI,
26065 IX86_BUILTIN_CRC32SI,
26066 IX86_BUILTIN_CRC32DI,
26068 IX86_BUILTIN_PCMPESTRI128,
26069 IX86_BUILTIN_PCMPESTRM128,
26070 IX86_BUILTIN_PCMPESTRA128,
26071 IX86_BUILTIN_PCMPESTRC128,
26072 IX86_BUILTIN_PCMPESTRO128,
26073 IX86_BUILTIN_PCMPESTRS128,
26074 IX86_BUILTIN_PCMPESTRZ128,
26075 IX86_BUILTIN_PCMPISTRI128,
26076 IX86_BUILTIN_PCMPISTRM128,
26077 IX86_BUILTIN_PCMPISTRA128,
26078 IX86_BUILTIN_PCMPISTRC128,
26079 IX86_BUILTIN_PCMPISTRO128,
26080 IX86_BUILTIN_PCMPISTRS128,
26081 IX86_BUILTIN_PCMPISTRZ128,
26083 IX86_BUILTIN_PCMPGTQ,
26085 /* AES instructions */
26086 IX86_BUILTIN_AESENC128,
26087 IX86_BUILTIN_AESENCLAST128,
26088 IX86_BUILTIN_AESDEC128,
26089 IX86_BUILTIN_AESDECLAST128,
26090 IX86_BUILTIN_AESIMC128,
26091 IX86_BUILTIN_AESKEYGENASSIST128,
26093 /* PCLMUL instruction */
26094 IX86_BUILTIN_PCLMULQDQ128,
26096 /* AVX */
26097 IX86_BUILTIN_ADDPD256,
26098 IX86_BUILTIN_ADDPS256,
26099 IX86_BUILTIN_ADDSUBPD256,
26100 IX86_BUILTIN_ADDSUBPS256,
26101 IX86_BUILTIN_ANDPD256,
26102 IX86_BUILTIN_ANDPS256,
26103 IX86_BUILTIN_ANDNPD256,
26104 IX86_BUILTIN_ANDNPS256,
26105 IX86_BUILTIN_BLENDPD256,
26106 IX86_BUILTIN_BLENDPS256,
26107 IX86_BUILTIN_BLENDVPD256,
26108 IX86_BUILTIN_BLENDVPS256,
26109 IX86_BUILTIN_DIVPD256,
26110 IX86_BUILTIN_DIVPS256,
26111 IX86_BUILTIN_DPPS256,
26112 IX86_BUILTIN_HADDPD256,
26113 IX86_BUILTIN_HADDPS256,
26114 IX86_BUILTIN_HSUBPD256,
26115 IX86_BUILTIN_HSUBPS256,
26116 IX86_BUILTIN_MAXPD256,
26117 IX86_BUILTIN_MAXPS256,
26118 IX86_BUILTIN_MINPD256,
26119 IX86_BUILTIN_MINPS256,
26120 IX86_BUILTIN_MULPD256,
26121 IX86_BUILTIN_MULPS256,
26122 IX86_BUILTIN_ORPD256,
26123 IX86_BUILTIN_ORPS256,
26124 IX86_BUILTIN_SHUFPD256,
26125 IX86_BUILTIN_SHUFPS256,
26126 IX86_BUILTIN_SUBPD256,
26127 IX86_BUILTIN_SUBPS256,
26128 IX86_BUILTIN_XORPD256,
26129 IX86_BUILTIN_XORPS256,
26130 IX86_BUILTIN_CMPSD,
26131 IX86_BUILTIN_CMPSS,
26132 IX86_BUILTIN_CMPPD,
26133 IX86_BUILTIN_CMPPS,
26134 IX86_BUILTIN_CMPPD256,
26135 IX86_BUILTIN_CMPPS256,
26136 IX86_BUILTIN_CVTDQ2PD256,
26137 IX86_BUILTIN_CVTDQ2PS256,
26138 IX86_BUILTIN_CVTPD2PS256,
26139 IX86_BUILTIN_CVTPS2DQ256,
26140 IX86_BUILTIN_CVTPS2PD256,
26141 IX86_BUILTIN_CVTTPD2DQ256,
26142 IX86_BUILTIN_CVTPD2DQ256,
26143 IX86_BUILTIN_CVTTPS2DQ256,
26144 IX86_BUILTIN_EXTRACTF128PD256,
26145 IX86_BUILTIN_EXTRACTF128PS256,
26146 IX86_BUILTIN_EXTRACTF128SI256,
26147 IX86_BUILTIN_VZEROALL,
26148 IX86_BUILTIN_VZEROUPPER,
26149 IX86_BUILTIN_VPERMILVARPD,
26150 IX86_BUILTIN_VPERMILVARPS,
26151 IX86_BUILTIN_VPERMILVARPD256,
26152 IX86_BUILTIN_VPERMILVARPS256,
26153 IX86_BUILTIN_VPERMILPD,
26154 IX86_BUILTIN_VPERMILPS,
26155 IX86_BUILTIN_VPERMILPD256,
26156 IX86_BUILTIN_VPERMILPS256,
26157 IX86_BUILTIN_VPERMIL2PD,
26158 IX86_BUILTIN_VPERMIL2PS,
26159 IX86_BUILTIN_VPERMIL2PD256,
26160 IX86_BUILTIN_VPERMIL2PS256,
26161 IX86_BUILTIN_VPERM2F128PD256,
26162 IX86_BUILTIN_VPERM2F128PS256,
26163 IX86_BUILTIN_VPERM2F128SI256,
26164 IX86_BUILTIN_VBROADCASTSS,
26165 IX86_BUILTIN_VBROADCASTSD256,
26166 IX86_BUILTIN_VBROADCASTSS256,
26167 IX86_BUILTIN_VBROADCASTPD256,
26168 IX86_BUILTIN_VBROADCASTPS256,
26169 IX86_BUILTIN_VINSERTF128PD256,
26170 IX86_BUILTIN_VINSERTF128PS256,
26171 IX86_BUILTIN_VINSERTF128SI256,
26172 IX86_BUILTIN_LOADUPD256,
26173 IX86_BUILTIN_LOADUPS256,
26174 IX86_BUILTIN_STOREUPD256,
26175 IX86_BUILTIN_STOREUPS256,
26176 IX86_BUILTIN_LDDQU256,
26177 IX86_BUILTIN_MOVNTDQ256,
26178 IX86_BUILTIN_MOVNTPD256,
26179 IX86_BUILTIN_MOVNTPS256,
26180 IX86_BUILTIN_LOADDQU256,
26181 IX86_BUILTIN_STOREDQU256,
26182 IX86_BUILTIN_MASKLOADPD,
26183 IX86_BUILTIN_MASKLOADPS,
26184 IX86_BUILTIN_MASKSTOREPD,
26185 IX86_BUILTIN_MASKSTOREPS,
26186 IX86_BUILTIN_MASKLOADPD256,
26187 IX86_BUILTIN_MASKLOADPS256,
26188 IX86_BUILTIN_MASKSTOREPD256,
26189 IX86_BUILTIN_MASKSTOREPS256,
26190 IX86_BUILTIN_MOVSHDUP256,
26191 IX86_BUILTIN_MOVSLDUP256,
26192 IX86_BUILTIN_MOVDDUP256,
26194 IX86_BUILTIN_SQRTPD256,
26195 IX86_BUILTIN_SQRTPS256,
26196 IX86_BUILTIN_SQRTPS_NR256,
26197 IX86_BUILTIN_RSQRTPS256,
26198 IX86_BUILTIN_RSQRTPS_NR256,
26200 IX86_BUILTIN_RCPPS256,
26202 IX86_BUILTIN_ROUNDPD256,
26203 IX86_BUILTIN_ROUNDPS256,
26205 IX86_BUILTIN_FLOORPD256,
26206 IX86_BUILTIN_CEILPD256,
26207 IX86_BUILTIN_TRUNCPD256,
26208 IX86_BUILTIN_RINTPD256,
26209 IX86_BUILTIN_ROUNDPD_AZ256,
26211 IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
26212 IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
26213 IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
26215 IX86_BUILTIN_FLOORPS256,
26216 IX86_BUILTIN_CEILPS256,
26217 IX86_BUILTIN_TRUNCPS256,
26218 IX86_BUILTIN_RINTPS256,
26219 IX86_BUILTIN_ROUNDPS_AZ256,
26221 IX86_BUILTIN_FLOORPS_SFIX256,
26222 IX86_BUILTIN_CEILPS_SFIX256,
26223 IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
26225 IX86_BUILTIN_UNPCKHPD256,
26226 IX86_BUILTIN_UNPCKLPD256,
26227 IX86_BUILTIN_UNPCKHPS256,
26228 IX86_BUILTIN_UNPCKLPS256,
26230 IX86_BUILTIN_SI256_SI,
26231 IX86_BUILTIN_PS256_PS,
26232 IX86_BUILTIN_PD256_PD,
26233 IX86_BUILTIN_SI_SI256,
26234 IX86_BUILTIN_PS_PS256,
26235 IX86_BUILTIN_PD_PD256,
26237 IX86_BUILTIN_VTESTZPD,
26238 IX86_BUILTIN_VTESTCPD,
26239 IX86_BUILTIN_VTESTNZCPD,
26240 IX86_BUILTIN_VTESTZPS,
26241 IX86_BUILTIN_VTESTCPS,
26242 IX86_BUILTIN_VTESTNZCPS,
26243 IX86_BUILTIN_VTESTZPD256,
26244 IX86_BUILTIN_VTESTCPD256,
26245 IX86_BUILTIN_VTESTNZCPD256,
26246 IX86_BUILTIN_VTESTZPS256,
26247 IX86_BUILTIN_VTESTCPS256,
26248 IX86_BUILTIN_VTESTNZCPS256,
26249 IX86_BUILTIN_PTESTZ256,
26250 IX86_BUILTIN_PTESTC256,
26251 IX86_BUILTIN_PTESTNZC256,
26253 IX86_BUILTIN_MOVMSKPD256,
26254 IX86_BUILTIN_MOVMSKPS256,
26256 /* AVX2 */
26257 IX86_BUILTIN_MPSADBW256,
26258 IX86_BUILTIN_PABSB256,
26259 IX86_BUILTIN_PABSW256,
26260 IX86_BUILTIN_PABSD256,
26261 IX86_BUILTIN_PACKSSDW256,
26262 IX86_BUILTIN_PACKSSWB256,
26263 IX86_BUILTIN_PACKUSDW256,
26264 IX86_BUILTIN_PACKUSWB256,
26265 IX86_BUILTIN_PADDB256,
26266 IX86_BUILTIN_PADDW256,
26267 IX86_BUILTIN_PADDD256,
26268 IX86_BUILTIN_PADDQ256,
26269 IX86_BUILTIN_PADDSB256,
26270 IX86_BUILTIN_PADDSW256,
26271 IX86_BUILTIN_PADDUSB256,
26272 IX86_BUILTIN_PADDUSW256,
26273 IX86_BUILTIN_PALIGNR256,
26274 IX86_BUILTIN_AND256I,
26275 IX86_BUILTIN_ANDNOT256I,
26276 IX86_BUILTIN_PAVGB256,
26277 IX86_BUILTIN_PAVGW256,
26278 IX86_BUILTIN_PBLENDVB256,
26279 IX86_BUILTIN_PBLENDVW256,
26280 IX86_BUILTIN_PCMPEQB256,
26281 IX86_BUILTIN_PCMPEQW256,
26282 IX86_BUILTIN_PCMPEQD256,
26283 IX86_BUILTIN_PCMPEQQ256,
26284 IX86_BUILTIN_PCMPGTB256,
26285 IX86_BUILTIN_PCMPGTW256,
26286 IX86_BUILTIN_PCMPGTD256,
26287 IX86_BUILTIN_PCMPGTQ256,
26288 IX86_BUILTIN_PHADDW256,
26289 IX86_BUILTIN_PHADDD256,
26290 IX86_BUILTIN_PHADDSW256,
26291 IX86_BUILTIN_PHSUBW256,
26292 IX86_BUILTIN_PHSUBD256,
26293 IX86_BUILTIN_PHSUBSW256,
26294 IX86_BUILTIN_PMADDUBSW256,
26295 IX86_BUILTIN_PMADDWD256,
26296 IX86_BUILTIN_PMAXSB256,
26297 IX86_BUILTIN_PMAXSW256,
26298 IX86_BUILTIN_PMAXSD256,
26299 IX86_BUILTIN_PMAXUB256,
26300 IX86_BUILTIN_PMAXUW256,
26301 IX86_BUILTIN_PMAXUD256,
26302 IX86_BUILTIN_PMINSB256,
26303 IX86_BUILTIN_PMINSW256,
26304 IX86_BUILTIN_PMINSD256,
26305 IX86_BUILTIN_PMINUB256,
26306 IX86_BUILTIN_PMINUW256,
26307 IX86_BUILTIN_PMINUD256,
26308 IX86_BUILTIN_PMOVMSKB256,
26309 IX86_BUILTIN_PMOVSXBW256,
26310 IX86_BUILTIN_PMOVSXBD256,
26311 IX86_BUILTIN_PMOVSXBQ256,
26312 IX86_BUILTIN_PMOVSXWD256,
26313 IX86_BUILTIN_PMOVSXWQ256,
26314 IX86_BUILTIN_PMOVSXDQ256,
26315 IX86_BUILTIN_PMOVZXBW256,
26316 IX86_BUILTIN_PMOVZXBD256,
26317 IX86_BUILTIN_PMOVZXBQ256,
26318 IX86_BUILTIN_PMOVZXWD256,
26319 IX86_BUILTIN_PMOVZXWQ256,
26320 IX86_BUILTIN_PMOVZXDQ256,
26321 IX86_BUILTIN_PMULDQ256,
26322 IX86_BUILTIN_PMULHRSW256,
26323 IX86_BUILTIN_PMULHUW256,
26324 IX86_BUILTIN_PMULHW256,
26325 IX86_BUILTIN_PMULLW256,
26326 IX86_BUILTIN_PMULLD256,
26327 IX86_BUILTIN_PMULUDQ256,
26328 IX86_BUILTIN_POR256,
26329 IX86_BUILTIN_PSADBW256,
26330 IX86_BUILTIN_PSHUFB256,
26331 IX86_BUILTIN_PSHUFD256,
26332 IX86_BUILTIN_PSHUFHW256,
26333 IX86_BUILTIN_PSHUFLW256,
26334 IX86_BUILTIN_PSIGNB256,
26335 IX86_BUILTIN_PSIGNW256,
26336 IX86_BUILTIN_PSIGND256,
26337 IX86_BUILTIN_PSLLDQI256,
26338 IX86_BUILTIN_PSLLWI256,
26339 IX86_BUILTIN_PSLLW256,
26340 IX86_BUILTIN_PSLLDI256,
26341 IX86_BUILTIN_PSLLD256,
26342 IX86_BUILTIN_PSLLQI256,
26343 IX86_BUILTIN_PSLLQ256,
26344 IX86_BUILTIN_PSRAWI256,
26345 IX86_BUILTIN_PSRAW256,
26346 IX86_BUILTIN_PSRADI256,
26347 IX86_BUILTIN_PSRAD256,
26348 IX86_BUILTIN_PSRLDQI256,
26349 IX86_BUILTIN_PSRLWI256,
26350 IX86_BUILTIN_PSRLW256,
26351 IX86_BUILTIN_PSRLDI256,
26352 IX86_BUILTIN_PSRLD256,
26353 IX86_BUILTIN_PSRLQI256,
26354 IX86_BUILTIN_PSRLQ256,
26355 IX86_BUILTIN_PSUBB256,
26356 IX86_BUILTIN_PSUBW256,
26357 IX86_BUILTIN_PSUBD256,
26358 IX86_BUILTIN_PSUBQ256,
26359 IX86_BUILTIN_PSUBSB256,
26360 IX86_BUILTIN_PSUBSW256,
26361 IX86_BUILTIN_PSUBUSB256,
26362 IX86_BUILTIN_PSUBUSW256,
26363 IX86_BUILTIN_PUNPCKHBW256,
26364 IX86_BUILTIN_PUNPCKHWD256,
26365 IX86_BUILTIN_PUNPCKHDQ256,
26366 IX86_BUILTIN_PUNPCKHQDQ256,
26367 IX86_BUILTIN_PUNPCKLBW256,
26368 IX86_BUILTIN_PUNPCKLWD256,
26369 IX86_BUILTIN_PUNPCKLDQ256,
26370 IX86_BUILTIN_PUNPCKLQDQ256,
26371 IX86_BUILTIN_PXOR256,
26372 IX86_BUILTIN_MOVNTDQA256,
26373 IX86_BUILTIN_VBROADCASTSS_PS,
26374 IX86_BUILTIN_VBROADCASTSS_PS256,
26375 IX86_BUILTIN_VBROADCASTSD_PD256,
26376 IX86_BUILTIN_VBROADCASTSI256,
26377 IX86_BUILTIN_PBLENDD256,
26378 IX86_BUILTIN_PBLENDD128,
26379 IX86_BUILTIN_PBROADCASTB256,
26380 IX86_BUILTIN_PBROADCASTW256,
26381 IX86_BUILTIN_PBROADCASTD256,
26382 IX86_BUILTIN_PBROADCASTQ256,
26383 IX86_BUILTIN_PBROADCASTB128,
26384 IX86_BUILTIN_PBROADCASTW128,
26385 IX86_BUILTIN_PBROADCASTD128,
26386 IX86_BUILTIN_PBROADCASTQ128,
26387 IX86_BUILTIN_VPERMVARSI256,
26388 IX86_BUILTIN_VPERMDF256,
26389 IX86_BUILTIN_VPERMVARSF256,
26390 IX86_BUILTIN_VPERMDI256,
26391 IX86_BUILTIN_VPERMTI256,
26392 IX86_BUILTIN_VEXTRACT128I256,
26393 IX86_BUILTIN_VINSERT128I256,
26394 IX86_BUILTIN_MASKLOADD,
26395 IX86_BUILTIN_MASKLOADQ,
26396 IX86_BUILTIN_MASKLOADD256,
26397 IX86_BUILTIN_MASKLOADQ256,
26398 IX86_BUILTIN_MASKSTORED,
26399 IX86_BUILTIN_MASKSTOREQ,
26400 IX86_BUILTIN_MASKSTORED256,
26401 IX86_BUILTIN_MASKSTOREQ256,
26402 IX86_BUILTIN_PSLLVV4DI,
26403 IX86_BUILTIN_PSLLVV2DI,
26404 IX86_BUILTIN_PSLLVV8SI,
26405 IX86_BUILTIN_PSLLVV4SI,
26406 IX86_BUILTIN_PSRAVV8SI,
26407 IX86_BUILTIN_PSRAVV4SI,
26408 IX86_BUILTIN_PSRLVV4DI,
26409 IX86_BUILTIN_PSRLVV2DI,
26410 IX86_BUILTIN_PSRLVV8SI,
26411 IX86_BUILTIN_PSRLVV4SI,
26413 IX86_BUILTIN_GATHERSIV2DF,
26414 IX86_BUILTIN_GATHERSIV4DF,
26415 IX86_BUILTIN_GATHERDIV2DF,
26416 IX86_BUILTIN_GATHERDIV4DF,
26417 IX86_BUILTIN_GATHERSIV4SF,
26418 IX86_BUILTIN_GATHERSIV8SF,
26419 IX86_BUILTIN_GATHERDIV4SF,
26420 IX86_BUILTIN_GATHERDIV8SF,
26421 IX86_BUILTIN_GATHERSIV2DI,
26422 IX86_BUILTIN_GATHERSIV4DI,
26423 IX86_BUILTIN_GATHERDIV2DI,
26424 IX86_BUILTIN_GATHERDIV4DI,
26425 IX86_BUILTIN_GATHERSIV4SI,
26426 IX86_BUILTIN_GATHERSIV8SI,
26427 IX86_BUILTIN_GATHERDIV4SI,
26428 IX86_BUILTIN_GATHERDIV8SI,
26430 /* Alternate 4-element gather builtins for the vectorizer, where
26431 all operands are 32 bytes wide. */
26432 IX86_BUILTIN_GATHERALTSIV4DF,
26433 IX86_BUILTIN_GATHERALTDIV8SF,
26434 IX86_BUILTIN_GATHERALTSIV4DI,
26435 IX86_BUILTIN_GATHERALTDIV8SI,
26437 /* TFmode support builtins. */
26438 IX86_BUILTIN_INFQ,
26439 IX86_BUILTIN_HUGE_VALQ,
26440 IX86_BUILTIN_FABSQ,
26441 IX86_BUILTIN_COPYSIGNQ,
26443 /* Vectorizer support builtins. */
26444 IX86_BUILTIN_CPYSGNPS,
26445 IX86_BUILTIN_CPYSGNPD,
26446 IX86_BUILTIN_CPYSGNPS256,
26447 IX86_BUILTIN_CPYSGNPD256,
26449 /* FMA4 instructions. */
26450 IX86_BUILTIN_VFMADDSS,
26451 IX86_BUILTIN_VFMADDSD,
26452 IX86_BUILTIN_VFMADDPS,
26453 IX86_BUILTIN_VFMADDPD,
26454 IX86_BUILTIN_VFMADDPS256,
26455 IX86_BUILTIN_VFMADDPD256,
26456 IX86_BUILTIN_VFMADDSUBPS,
26457 IX86_BUILTIN_VFMADDSUBPD,
26458 IX86_BUILTIN_VFMADDSUBPS256,
26459 IX86_BUILTIN_VFMADDSUBPD256,
26461 /* FMA3 instructions. */
26462 IX86_BUILTIN_VFMADDSS3,
26463 IX86_BUILTIN_VFMADDSD3,
26465 /* XOP instructions. */
26466 IX86_BUILTIN_VPCMOV,
26467 IX86_BUILTIN_VPCMOV_V2DI,
26468 IX86_BUILTIN_VPCMOV_V4SI,
26469 IX86_BUILTIN_VPCMOV_V8HI,
26470 IX86_BUILTIN_VPCMOV_V16QI,
26471 IX86_BUILTIN_VPCMOV_V4SF,
26472 IX86_BUILTIN_VPCMOV_V2DF,
26473 IX86_BUILTIN_VPCMOV256,
26474 IX86_BUILTIN_VPCMOV_V4DI256,
26475 IX86_BUILTIN_VPCMOV_V8SI256,
26476 IX86_BUILTIN_VPCMOV_V16HI256,
26477 IX86_BUILTIN_VPCMOV_V32QI256,
26478 IX86_BUILTIN_VPCMOV_V8SF256,
26479 IX86_BUILTIN_VPCMOV_V4DF256,
26481 IX86_BUILTIN_VPPERM,
26483 IX86_BUILTIN_VPMACSSWW,
26484 IX86_BUILTIN_VPMACSWW,
26485 IX86_BUILTIN_VPMACSSWD,
26486 IX86_BUILTIN_VPMACSWD,
26487 IX86_BUILTIN_VPMACSSDD,
26488 IX86_BUILTIN_VPMACSDD,
26489 IX86_BUILTIN_VPMACSSDQL,
26490 IX86_BUILTIN_VPMACSSDQH,
26491 IX86_BUILTIN_VPMACSDQL,
26492 IX86_BUILTIN_VPMACSDQH,
26493 IX86_BUILTIN_VPMADCSSWD,
26494 IX86_BUILTIN_VPMADCSWD,
26496 IX86_BUILTIN_VPHADDBW,
26497 IX86_BUILTIN_VPHADDBD,
26498 IX86_BUILTIN_VPHADDBQ,
26499 IX86_BUILTIN_VPHADDWD,
26500 IX86_BUILTIN_VPHADDWQ,
26501 IX86_BUILTIN_VPHADDDQ,
26502 IX86_BUILTIN_VPHADDUBW,
26503 IX86_BUILTIN_VPHADDUBD,
26504 IX86_BUILTIN_VPHADDUBQ,
26505 IX86_BUILTIN_VPHADDUWD,
26506 IX86_BUILTIN_VPHADDUWQ,
26507 IX86_BUILTIN_VPHADDUDQ,
26508 IX86_BUILTIN_VPHSUBBW,
26509 IX86_BUILTIN_VPHSUBWD,
26510 IX86_BUILTIN_VPHSUBDQ,
26512 IX86_BUILTIN_VPROTB,
26513 IX86_BUILTIN_VPROTW,
26514 IX86_BUILTIN_VPROTD,
26515 IX86_BUILTIN_VPROTQ,
26516 IX86_BUILTIN_VPROTB_IMM,
26517 IX86_BUILTIN_VPROTW_IMM,
26518 IX86_BUILTIN_VPROTD_IMM,
26519 IX86_BUILTIN_VPROTQ_IMM,
26521 IX86_BUILTIN_VPSHLB,
26522 IX86_BUILTIN_VPSHLW,
26523 IX86_BUILTIN_VPSHLD,
26524 IX86_BUILTIN_VPSHLQ,
26525 IX86_BUILTIN_VPSHAB,
26526 IX86_BUILTIN_VPSHAW,
26527 IX86_BUILTIN_VPSHAD,
26528 IX86_BUILTIN_VPSHAQ,
26530 IX86_BUILTIN_VFRCZSS,
26531 IX86_BUILTIN_VFRCZSD,
26532 IX86_BUILTIN_VFRCZPS,
26533 IX86_BUILTIN_VFRCZPD,
26534 IX86_BUILTIN_VFRCZPS256,
26535 IX86_BUILTIN_VFRCZPD256,
26537 IX86_BUILTIN_VPCOMEQUB,
26538 IX86_BUILTIN_VPCOMNEUB,
26539 IX86_BUILTIN_VPCOMLTUB,
26540 IX86_BUILTIN_VPCOMLEUB,
26541 IX86_BUILTIN_VPCOMGTUB,
26542 IX86_BUILTIN_VPCOMGEUB,
26543 IX86_BUILTIN_VPCOMFALSEUB,
26544 IX86_BUILTIN_VPCOMTRUEUB,
26546 IX86_BUILTIN_VPCOMEQUW,
26547 IX86_BUILTIN_VPCOMNEUW,
26548 IX86_BUILTIN_VPCOMLTUW,
26549 IX86_BUILTIN_VPCOMLEUW,
26550 IX86_BUILTIN_VPCOMGTUW,
26551 IX86_BUILTIN_VPCOMGEUW,
26552 IX86_BUILTIN_VPCOMFALSEUW,
26553 IX86_BUILTIN_VPCOMTRUEUW,
26555 IX86_BUILTIN_VPCOMEQUD,
26556 IX86_BUILTIN_VPCOMNEUD,
26557 IX86_BUILTIN_VPCOMLTUD,
26558 IX86_BUILTIN_VPCOMLEUD,
26559 IX86_BUILTIN_VPCOMGTUD,
26560 IX86_BUILTIN_VPCOMGEUD,
26561 IX86_BUILTIN_VPCOMFALSEUD,
26562 IX86_BUILTIN_VPCOMTRUEUD,
26564 IX86_BUILTIN_VPCOMEQUQ,
26565 IX86_BUILTIN_VPCOMNEUQ,
26566 IX86_BUILTIN_VPCOMLTUQ,
26567 IX86_BUILTIN_VPCOMLEUQ,
26568 IX86_BUILTIN_VPCOMGTUQ,
26569 IX86_BUILTIN_VPCOMGEUQ,
26570 IX86_BUILTIN_VPCOMFALSEUQ,
26571 IX86_BUILTIN_VPCOMTRUEUQ,
26573 IX86_BUILTIN_VPCOMEQB,
26574 IX86_BUILTIN_VPCOMNEB,
26575 IX86_BUILTIN_VPCOMLTB,
26576 IX86_BUILTIN_VPCOMLEB,
26577 IX86_BUILTIN_VPCOMGTB,
26578 IX86_BUILTIN_VPCOMGEB,
26579 IX86_BUILTIN_VPCOMFALSEB,
26580 IX86_BUILTIN_VPCOMTRUEB,
26582 IX86_BUILTIN_VPCOMEQW,
26583 IX86_BUILTIN_VPCOMNEW,
26584 IX86_BUILTIN_VPCOMLTW,
26585 IX86_BUILTIN_VPCOMLEW,
26586 IX86_BUILTIN_VPCOMGTW,
26587 IX86_BUILTIN_VPCOMGEW,
26588 IX86_BUILTIN_VPCOMFALSEW,
26589 IX86_BUILTIN_VPCOMTRUEW,
26591 IX86_BUILTIN_VPCOMEQD,
26592 IX86_BUILTIN_VPCOMNED,
26593 IX86_BUILTIN_VPCOMLTD,
26594 IX86_BUILTIN_VPCOMLED,
26595 IX86_BUILTIN_VPCOMGTD,
26596 IX86_BUILTIN_VPCOMGED,
26597 IX86_BUILTIN_VPCOMFALSED,
26598 IX86_BUILTIN_VPCOMTRUED,
26600 IX86_BUILTIN_VPCOMEQQ,
26601 IX86_BUILTIN_VPCOMNEQ,
26602 IX86_BUILTIN_VPCOMLTQ,
26603 IX86_BUILTIN_VPCOMLEQ,
26604 IX86_BUILTIN_VPCOMGTQ,
26605 IX86_BUILTIN_VPCOMGEQ,
26606 IX86_BUILTIN_VPCOMFALSEQ,
26607 IX86_BUILTIN_VPCOMTRUEQ,
26609 /* LWP instructions. */
26610 IX86_BUILTIN_LLWPCB,
26611 IX86_BUILTIN_SLWPCB,
26612 IX86_BUILTIN_LWPVAL32,
26613 IX86_BUILTIN_LWPVAL64,
26614 IX86_BUILTIN_LWPINS32,
26615 IX86_BUILTIN_LWPINS64,
26617 IX86_BUILTIN_CLZS,
26619 /* RTM */
26620 IX86_BUILTIN_XBEGIN,
26621 IX86_BUILTIN_XEND,
26622 IX86_BUILTIN_XABORT,
26623 IX86_BUILTIN_XTEST,
26625 /* BMI instructions. */
26626 IX86_BUILTIN_BEXTR32,
26627 IX86_BUILTIN_BEXTR64,
26628 IX86_BUILTIN_CTZS,
26630 /* TBM instructions. */
26631 IX86_BUILTIN_BEXTRI32,
26632 IX86_BUILTIN_BEXTRI64,
26634 /* BMI2 instructions. */
26635 IX86_BUILTIN_BZHI32,
26636 IX86_BUILTIN_BZHI64,
26637 IX86_BUILTIN_PDEP32,
26638 IX86_BUILTIN_PDEP64,
26639 IX86_BUILTIN_PEXT32,
26640 IX86_BUILTIN_PEXT64,
26642 /* ADX instructions. */
26643 IX86_BUILTIN_ADDCARRYX32,
26644 IX86_BUILTIN_ADDCARRYX64,
26646 /* FSGSBASE instructions. */
26647 IX86_BUILTIN_RDFSBASE32,
26648 IX86_BUILTIN_RDFSBASE64,
26649 IX86_BUILTIN_RDGSBASE32,
26650 IX86_BUILTIN_RDGSBASE64,
26651 IX86_BUILTIN_WRFSBASE32,
26652 IX86_BUILTIN_WRFSBASE64,
26653 IX86_BUILTIN_WRGSBASE32,
26654 IX86_BUILTIN_WRGSBASE64,
26656 /* RDRND instructions. */
26657 IX86_BUILTIN_RDRAND16_STEP,
26658 IX86_BUILTIN_RDRAND32_STEP,
26659 IX86_BUILTIN_RDRAND64_STEP,
26661 /* RDSEED instructions. */
26662 IX86_BUILTIN_RDSEED16_STEP,
26663 IX86_BUILTIN_RDSEED32_STEP,
26664 IX86_BUILTIN_RDSEED64_STEP,
26666 /* F16C instructions. */
26667 IX86_BUILTIN_CVTPH2PS,
26668 IX86_BUILTIN_CVTPH2PS256,
26669 IX86_BUILTIN_CVTPS2PH,
26670 IX86_BUILTIN_CVTPS2PH256,
26672 /* CFString built-in for darwin */
26673 IX86_BUILTIN_CFSTRING,
26675 /* Builtins to get CPU type and supported features. */
26676 IX86_BUILTIN_CPU_INIT,
26677 IX86_BUILTIN_CPU_IS,
26678 IX86_BUILTIN_CPU_SUPPORTS,
26680 IX86_BUILTIN_MAX
26683 /* Table for the ix86 builtin decls. */
26684 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
26686 /* Table of all of the builtin functions that are possible with different ISAs
26687 but are waiting to be built until a function is declared to use that
26688 ISA. */
26689 struct builtin_isa {
26690 const char *name; /* function name */
26691 enum ix86_builtin_func_type tcode; /* type to use in the declaration */
26692 HOST_WIDE_INT isa; /* isa_flags this builtin is defined for */
26693 bool const_p; /* true if the declaration is constant */
26694 bool set_and_not_built_p;
26697 static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];
26700 /* Add an ix86 target builtin function with CODE, NAME and TYPE. Save MASK,
26701 the set of isa_flags the builtin requires, in the ix86_builtins_isa array.
26702 Stores the function decl in the ix86_builtins array. Returns the function
26703 decl, or NULL_TREE if the builtin was not added.
26705 If the front end has a special hook for builtin functions, delay adding
26706 builtin functions that aren't in the current ISA until the ISA is changed
26707 with function-specific optimization. Doing so can save about 300K for the
26708 default compiler. When the builtin is expanded, check at that time whether
26709 it is valid.
26711 If the front end doesn't have a special hook, record all builtins, even if
26712 they aren't in the current ISA, in case the user uses function-specific
26713 options for a different ISA; that way we don't get scope errors if a
26714 builtin is added in the middle of a function scope. */
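/* Hedged usage sketch (the mask, name, type and code below are placeholders
   chosen by the editor for illustration, not a claim about how any specific
   builtin is registered elsewhere in this file):

     tree decl = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_example",
                              V4SF_FTYPE_V4SF_V4SF, IX86_BUILTIN_ADDPS);

   If OPTION_MASK_ISA_SSE2 is not enabled in ix86_isa_flags and the front end
   has no extended-scope builtin hook, the decl is deferred and later created
   by ix86_add_new_builtins once a function opts into that ISA.  */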
26716 static inline tree
26717 def_builtin (HOST_WIDE_INT mask, const char *name,
26718 enum ix86_builtin_func_type tcode,
26719 enum ix86_builtins code)
26721 tree decl = NULL_TREE;
26723 if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
26725 ix86_builtins_isa[(int) code].isa = mask;
26727 mask &= ~OPTION_MASK_ISA_64BIT;
26728 if (mask == 0
26729 || (mask & ix86_isa_flags) != 0
26730 || (lang_hooks.builtin_function
26731 == lang_hooks.builtin_function_ext_scope))
26733 {
26734 tree type = ix86_get_builtin_func_type (tcode);
26735 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
26736 NULL, NULL_TREE);
26737 ix86_builtins[(int) code] = decl;
26738 ix86_builtins_isa[(int) code].set_and_not_built_p = false;
26739 }
26740 else
26741 {
26742 ix86_builtins[(int) code] = NULL_TREE;
26743 ix86_builtins_isa[(int) code].tcode = tcode;
26744 ix86_builtins_isa[(int) code].name = name;
26745 ix86_builtins_isa[(int) code].const_p = false;
26746 ix86_builtins_isa[(int) code].set_and_not_built_p = true;
26747 }
26748 }
26750 return decl;
26751 }
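/* For illustration, a sketch of a typical use of def_builtin (the real call
   sites, which walk the bdesc_* tables below, live elsewhere in this file):

     def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_lfence",
                  VOID_FTYPE_VOID, IX86_BUILTIN_LFENCE);

   With -msse2 already in effect the decl is created right away and stored in
   ix86_builtins; otherwise only the name, type and isa mask are recorded in
   ix86_builtins_isa, and the decl is built later by ix86_add_new_builtins
   once the ISA becomes available.  */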
26753 /* Like def_builtin, but also marks the function decl "const". */
26755 static inline tree
26756 def_builtin_const (HOST_WIDE_INT mask, const char *name,
26757 enum ix86_builtin_func_type tcode, enum ix86_builtins code)
26758 {
26759 tree decl = def_builtin (mask, name, tcode, code);
26760 if (decl)
26761 TREE_READONLY (decl) = 1;
26762 else
26763 ix86_builtins_isa[(int) code].const_p = true;
26765 return decl;
26766 }
26768 /* Add any new builtin functions for a given ISA that may not have been
26769 declared. This saves a bit of space compared to adding all of the
26770 declarations to the tree, even if we didn't use them. */
26772 static void
26773 ix86_add_new_builtins (HOST_WIDE_INT isa)
26774 {
26775 int i;
26777 for (i = 0; i < (int)IX86_BUILTIN_MAX; i++)
26778 {
26779 if ((ix86_builtins_isa[i].isa & isa) != 0
26780 && ix86_builtins_isa[i].set_and_not_built_p)
26781 {
26782 tree decl, type;
26784 /* Don't define the builtin again. */
26785 ix86_builtins_isa[i].set_and_not_built_p = false;
26787 type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
26788 decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
26789 type, i, BUILT_IN_MD, NULL,
26790 NULL_TREE);
26792 ix86_builtins[i] = decl;
26793 if (ix86_builtins_isa[i].const_p)
26794 TREE_READONLY (decl) = 1;
26795 }
26796 }
26797 }
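/* For illustration (a sketch; the actual call site is elsewhere in this
   file): when function specific options switch in a new ISA, for example a
   function compiled with __attribute__((target ("avx"))) in a unit built
   without -mavx, the option handling code then re-invokes
   ix86_add_new_builtins with the updated ix86_isa_flags, and the builtins
   that def_builtin only recorded in ix86_builtins_isa get their decls
   created at that point.  */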
26799 /* Bits for builtin_description.flag. */
26801 /* Set when we don't support the comparison natively, and should
26802 swap the comparison operands in order to support it. */
26803 #define BUILTIN_DESC_SWAP_OPERANDS 1
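/* For example (a sketch): a builtin for a GE comparison whose insn pattern
   only implements LE natively can list LE as its comparison and set
   BUILTIN_DESC_SWAP_OPERANDS in the flag field; the expander then swaps the
   two operands before emitting the comparison.  */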
26805 struct builtin_description
26806 {
26807 const HOST_WIDE_INT mask;
26808 const enum insn_code icode;
26809 const char *const name;
26810 const enum ix86_builtins code;
26811 const enum rtx_code comparison;
26812 const int flag;
26813 };
26815 static const struct builtin_description bdesc_comi[] =
26816 {
26817 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
26818 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
26819 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
26820 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
26821 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
26822 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
26823 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
26824 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
26825 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
26826 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
26827 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
26828 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
26829 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
26830 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
26831 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
26832 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
26833 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
26834 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
26835 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
26836 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
26837 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
26838 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
26839 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
26840 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
26841 };
26843 static const struct builtin_description bdesc_pcmpestr[] =
26844 {
26845 /* SSE4.2 */
26846 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
26847 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
26848 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
26849 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
26850 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
26851 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
26852 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
26853 };
26855 static const struct builtin_description bdesc_pcmpistr[] =
26856 {
26857 /* SSE4.2 */
26858 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
26859 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
26860 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
26861 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
26862 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
26863 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
26864 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
26865 };
26867 /* Special builtins with variable number of arguments. */
26868 static const struct builtin_description bdesc_special_args[] =
26869 {
26870 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
26871 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
26872 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
26874 /* MMX */
26875 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26877 /* 3DNow! */
26878 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
26880 /* FXSR, XSAVE and XSAVEOPT */
26881 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxsave", IX86_BUILTIN_FXSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID },
26882 { OPTION_MASK_ISA_FXSR, CODE_FOR_nothing, "__builtin_ia32_fxrstor", IX86_BUILTIN_FXRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID },
26883 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xsave", IX86_BUILTIN_XSAVE, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26884 { OPTION_MASK_ISA_XSAVE, CODE_FOR_nothing, "__builtin_ia32_xrstor", IX86_BUILTIN_XRSTOR, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26885 { OPTION_MASK_ISA_XSAVEOPT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt", IX86_BUILTIN_XSAVEOPT, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26887 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxsave64", IX86_BUILTIN_FXSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26888 { OPTION_MASK_ISA_FXSR | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_fxrstor64", IX86_BUILTIN_FXRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID },
26889 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsave64", IX86_BUILTIN_XSAVE64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26890 { OPTION_MASK_ISA_XSAVE | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xrstor64", IX86_BUILTIN_XRSTOR64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26891 { OPTION_MASK_ISA_XSAVEOPT | OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_xsaveopt64", IX86_BUILTIN_XSAVEOPT64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64 },
26893 /* SSE */
26894 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26895 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26896 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26898 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26899 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
26900 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26901 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
26903 /* SSE or 3DNow!A */
26904 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26905 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
26907 /* SSE2 */
26908 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26909 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
26910 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26911 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
26912 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26913 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
26914 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
26915 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
26916 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
26917 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26919 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26920 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
26922 /* SSE3 */
26923 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
26925 /* SSE4.1 */
26926 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
26928 /* SSE4A */
26929 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
26930 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
26932 /* AVX */
26933 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
26934 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
26936 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
26937 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26938 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26939 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
26940 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
26942 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
26943 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
26944 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26945 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26946 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26947 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
26948 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
26950 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
26951 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
26952 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
26954 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
26955 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
26956 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
26957 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
26958 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
26959 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
26960 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
26961 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
26963 /* AVX2 */
26964 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_movntdqa, "__builtin_ia32_movntdqa256", IX86_BUILTIN_MOVNTDQA256, UNKNOWN, (int) V4DI_FTYPE_PV4DI },
26965 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd, "__builtin_ia32_maskloadd", IX86_BUILTIN_MASKLOADD, UNKNOWN, (int) V4SI_FTYPE_PCV4SI_V4SI },
26966 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq, "__builtin_ia32_maskloadq", IX86_BUILTIN_MASKLOADQ, UNKNOWN, (int) V2DI_FTYPE_PCV2DI_V2DI },
26967 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadd256, "__builtin_ia32_maskloadd256", IX86_BUILTIN_MASKLOADD256, UNKNOWN, (int) V8SI_FTYPE_PCV8SI_V8SI },
26968 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskloadq256, "__builtin_ia32_maskloadq256", IX86_BUILTIN_MASKLOADQ256, UNKNOWN, (int) V4DI_FTYPE_PCV4DI_V4DI },
26969 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored, "__builtin_ia32_maskstored", IX86_BUILTIN_MASKSTORED, UNKNOWN, (int) VOID_FTYPE_PV4SI_V4SI_V4SI },
26970 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq, "__builtin_ia32_maskstoreq", IX86_BUILTIN_MASKSTOREQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI_V2DI },
26971 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstored256, "__builtin_ia32_maskstored256", IX86_BUILTIN_MASKSTORED256, UNKNOWN, (int) VOID_FTYPE_PV8SI_V8SI_V8SI },
26972 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_maskstoreq256, "__builtin_ia32_maskstoreq256", IX86_BUILTIN_MASKSTOREQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI_V4DI },
26974 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
26975 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
26976 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
26977 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
26978 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
26979 { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
26981 /* FSGSBASE */
26982 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26983 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26984 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26985 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
26986 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26987 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26988 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
26989 { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
26991 /* RTM */
26992 { OPTION_MASK_ISA_RTM, CODE_FOR_xbegin, "__builtin_ia32_xbegin", IX86_BUILTIN_XBEGIN, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
26993 { OPTION_MASK_ISA_RTM, CODE_FOR_xend, "__builtin_ia32_xend", IX86_BUILTIN_XEND, UNKNOWN, (int) VOID_FTYPE_VOID },
26994 { OPTION_MASK_ISA_RTM, CODE_FOR_xtest, "__builtin_ia32_xtest", IX86_BUILTIN_XTEST, UNKNOWN, (int) INT_FTYPE_VOID },
26995 };
26997 /* Builtins with variable number of arguments. */
26998 static const struct builtin_description bdesc_args[] =
26999 {
27000 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
27001 { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
27002 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_nothing, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
27003 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
27004 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
27005 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
27006 { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
27008 /* MMX */
27009 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27010 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27011 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27012 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27013 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27014 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27016 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27017 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27018 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27019 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27020 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27021 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27022 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27023 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27025 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27026 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27028 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27029 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27030 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27031 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27033 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27034 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27035 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27036 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27037 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27038 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27040 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27041 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27042 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27043 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27044 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI},
27045 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI},
27047 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27048 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
27049 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
27051 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },
27053 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27054 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27055 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27056 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27057 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27058 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27060 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27061 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27062 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
27063 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27064 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27065 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },
27067 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
27068 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
27069 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
27070 { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
27072 /* 3DNow! */
27073 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27074 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27075 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27076 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27078 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27079 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27080 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27081 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27082 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27083 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
27084 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27085 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27086 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27087 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27088 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27089 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27090 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27091 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27092 { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27094 /* 3DNow!A */
27095 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
27096 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
27097 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27098 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
27099 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27100 { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
27102 /* SSE */
27103 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
27104 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27105 { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27106 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27107 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27108 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27109 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27110 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27111 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27112 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
27113 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
27114 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
27116 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27118 { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27119 { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27120 { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27121 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27122 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27123 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27124 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27125 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27127 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27128 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27129 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27130 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27131 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27132 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27133 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27134 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27135 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27136 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27137 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP},
27138 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27139 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
27140 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
27141 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
27142 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27143 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
27144 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
27145 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
27146 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27147 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
27148 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
27150 { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27151 { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27152 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27153 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27155 { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27156 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27157 { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27158 { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27160 { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27162 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27163 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27164 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27165 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27166 { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27168 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
27169 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
27170 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },
27172 { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },
27174 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27175 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27176 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
27178 { OPTION_MASK_ISA_SSE, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
27179 { OPTION_MASK_ISA_SSE, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },
27181 /* SSE MMX or 3DNow!A */
27182 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27183 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27184 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27186 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27187 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27188 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27189 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27191 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
27192 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },
27194 { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
27196 /* SSE2 */
27197 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27199 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
27200 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
27201 { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27202 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
27203 { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
27205 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27206 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27207 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
27208 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
27209 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
27211 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },
27213 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27214 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
27215 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27216 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
27218 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27219 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
27220 { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
27222 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27223 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27224 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27225 { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27226 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27227 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27228 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27229 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27231 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27232 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27233 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27234 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27235 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP},
27236 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27237 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27238 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27239 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27240 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27241 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
27242 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27243 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
27244 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
27245 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
27246 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27247 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
27248 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
27249 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
27250 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
27252 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27253 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27254 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27255 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27257 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27258 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27259 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27260 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27262 { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27264 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27265 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27266 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27268 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27270 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27271 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27272 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27273 { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27274 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27275 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27276 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27277 { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27279 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27280 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27281 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27282 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27283 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27284 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27285 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27286 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27288 { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27289 { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN,(int) V8HI_FTYPE_V8HI_V8HI },
27291 { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27292 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27293 { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27294 { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27296 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27297 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27299 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27300 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27301 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27302 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27303 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27304 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27306 { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27307 { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27308 { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27309 { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27311 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27312 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27313 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27314 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27315 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27316 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27317 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27318 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27320 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27321 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27322 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
27324 { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27325 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },
27327 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
27328 { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_even_v4si, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27330 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
27332 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
27333 { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
27334 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
27335 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
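/* In the shift rows below, a _COUNT suffix on the prototype marks the last
   argument as a shift count: for the psllwi/pslldi/psllqi style builtins it
   is a scalar (e.g. V8HI_FTYPE_V8HI_SI_COUNT), for the psllw/pslld/psllq
   style builtins it is the low quadword of an XMM operand
   (e.g. V8HI_FTYPE_V8HI_V8HI_COUNT).  An _INT_CONVERT suffix (pslldqi128,
   psrldqi128) means the operands are reinterpreted in the insn pattern's
   mode (V1TImode here) before expansion.  A rough sketch of the expected
   emmintrin.h mapping, for illustration only:
       _mm_slli_epi16 (a, n)  ->  __builtin_ia32_psllwi128 ((__v8hi) a, n)
       _mm_slli_si128 (a, n)  ->  __builtin_ia32_pslldqi128 (a, (n) * 8)
   (the byte-shift builtin takes its count in bits).  */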
27337 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27338 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27339 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27340 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27341 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27342 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27343 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27345 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
27346 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27347 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27348 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
27349 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27350 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27351 { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
27353 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
27354 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
27355 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
27356 { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
27358 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
27359 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27360 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
27362 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },
27364 { OPTION_MASK_ISA_SSE, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27366 /* SSE2 MMX */
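/* PADDQ/PSUBQ on the 64-bit MMX registers were introduced with SSE2, hence
   the OPTION_MASK_ISA_SSE2 guard on these V1DImode rows even though they
   operate on MMX operands.  */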
27367 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27368 { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
27370 /* SSE3 */
27371 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27372 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27374 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27375 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27376 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27377 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27378 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
27379 { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
27381 /* SSSE3 */
27382 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27383 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
27384 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27385 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
27386 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27387 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
27389 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27390 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27391 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27392 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27393 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27394 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27395 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27396 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27397 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27398 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27399 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27400 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27401 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
27402 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
27403 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27404 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27405 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27406 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27407 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27408 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
27409 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27410 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
27411 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27412 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
27414 /* SSSE3 palignr builtins. */
27415 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
27416 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
27418 /* SSE4.1 */
27419 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27420 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27421 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
27422 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
27423 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27424 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27425 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27426 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
27427 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
27428 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },
27430 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27431 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27432 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27433 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27434 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27435 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27436 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
27437 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
27438 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
27439 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
27440 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
27441 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
27442 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27444 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
27445 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27446 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27447 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27448 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27449 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27450 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
27451 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27452 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27453 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
27454 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
27455 { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27457 /* SSE4.1 rounding and ptest builtins. */
27458 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27459 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27460 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27461 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
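/* For the floor/ceil/trunc/rint rows below, the "comparison" slot carries a
   ROUND_* rounding-mode constant rather than an rtx code; the expander feeds
   it as the immediate to the same sse4_1_roundpd/roundps patterns used by
   the explicit-immediate rows above (ROUND_MXCSR for the rint variants,
   i.e. use the current rounding mode).  Roughly speaking,
   __builtin_ia32_floorpd (x) behaves like
   __builtin_ia32_roundpd (x, ROUND_FLOOR).  */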
27463 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
27464 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
27465 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
27466 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
27468 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27469 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
27471 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
27472 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
27474 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
27475 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
27476 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
27477 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
27479 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
27480 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
27482 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27483 { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
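/* For the ptest rows the comparison code selects which flag the expander
   tests: EQ -> ZF set (ptestz), LTU -> CF set (ptestc), GTU -> neither flag
   set (ptestnzc).  The smmintrin.h wrappers are expected to map onto these,
   e.g. _mm_testz_si128 (m, v) -> __builtin_ia32_ptestz128 ((__v2di) m,
   (__v2di) v).  The vtestpd/vtestps rows in the AVX block below follow the
   same convention.  */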
27485 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27486 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27487 { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
27489 /* SSE4.2 */
27490 { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27491 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
27492 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
27493 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27494 { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27496 /* SSE4A */
27497 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
27498 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
27499 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
27500 { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27502 /* AES */
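/* The AES rows (and the PCLMUL row below) deliberately carry a null name:
   the user-visible builtins appear to be registered separately under
   OPTION_MASK_ISA_AES / OPTION_MASK_ISA_PCLMUL, so only the insn code and
   prototype are taken from these rows when expanding them.  */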
27503 { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
27504 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27506 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27507 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27508 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27509 { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27511 /* PCLMUL */
27512 { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
27514 /* AVX */
27515 { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27516 { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27517 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27518 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27519 { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27520 { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27521 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27522 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27523 { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27524 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27525 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27526 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27527 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27528 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27529 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27530 { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27531 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27532 { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27533 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27534 { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27535 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27536 { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27537 { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27538 { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27539 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27540 { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27542 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
27543 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
27544 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
27545 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27547 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27548 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27549 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
27550 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
27551 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27552 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27553 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
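/* The __builtin_ia32_cmp* rows below take the full five-bit VEX comparison
   predicate as their last argument; a rough sketch of the expected
   avxintrin.h mapping, for illustration only:
       _mm256_cmp_pd (a, b, _CMP_EQ_OQ)
         -> (__m256d) __builtin_ia32_cmppd256 ((__v4df) a, (__v4df) b,
                                               _CMP_EQ_OQ).  */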
27554 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27555 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27556 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
27557 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
27558 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27559 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27560 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
27561 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
27562 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
27563 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
27564 { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
27565 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
27566 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27567 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
27568 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27569 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
27570 { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27571 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
27572 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
27573 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27574 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
27575 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
27576 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27577 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27578 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
27579 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
27580 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },
27582 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27583 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27584 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27586 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27587 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27588 { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27589 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27590 { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27592 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27594 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27595 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
27597 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
27598 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
27599 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
27600 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
27602 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
27603 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27605 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27606 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
27608 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
27609 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
27610 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
27611 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
27613 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
27614 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
27616 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
27617 { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
27619 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27620 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27621 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27622 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
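/* The next six rows appear to back the _mm256_cast* intrinsics: the
   si256_si/ps256_ps/pd256_pd forms widen a 128-bit value (leaving the upper
   half undefined), and the vec_extract_lo forms return the low 128 bits of
   a 256-bit value, e.g. _mm256_castpd128_pd256 (x)
     -> (__m256d) __builtin_ia32_pd256_pd ((__v2df) x).  */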
27624 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27625 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27626 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27627 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
27628 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
27629 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },
27631 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27632 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27633 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
27634 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27635 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27636 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
27637 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27638 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27639 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
27640 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27641 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27642 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
27643 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27644 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27645 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
27647 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
27648 { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
27650 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
27651 { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
27653 { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256 ", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
27655 /* AVX2 */
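/* The AVX2 rows largely mirror the 128-bit SSE2/SSSE3/SSE4.1 integer rows
   above at 256-bit width and go through the same argument-expansion
   machinery (ix86_expand_args_builtin).  A rough sketch of the expected
   avx2intrin.h mapping, for illustration only:
       _mm256_add_epi8 (a, b)
         -> (__m256i) __builtin_ia32_paddb256 ((__v32qi) a, (__v32qi) b).  */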
27656 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
27657 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
27658 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI },
27659 { OPTION_MASK_ISA_AVX2, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI },
27660 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packssdw, "__builtin_ia32_packssdw256", IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27661 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packsswb, "__builtin_ia32_packsswb256", IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27662 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packusdw, "__builtin_ia32_packusdw256", IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int) V16HI_FTYPE_V8SI_V8SI },
27663 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_packuswb, "__builtin_ia32_packuswb256", IX86_BUILTIN_PACKUSWB256, UNKNOWN, (int) V32QI_FTYPE_V16HI_V16HI },
27664 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv32qi3, "__builtin_ia32_paddb256", IX86_BUILTIN_PADDB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27665 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv16hi3, "__builtin_ia32_paddw256", IX86_BUILTIN_PADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27666 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv8si3, "__builtin_ia32_paddd256", IX86_BUILTIN_PADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27667 { OPTION_MASK_ISA_AVX2, CODE_FOR_addv4di3, "__builtin_ia32_paddq256", IX86_BUILTIN_PADDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27668 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv32qi3, "__builtin_ia32_paddsb256", IX86_BUILTIN_PADDSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27669 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ssaddv16hi3, "__builtin_ia32_paddsw256", IX86_BUILTIN_PADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27670 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv32qi3, "__builtin_ia32_paddusb256", IX86_BUILTIN_PADDUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27671 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_usaddv16hi3, "__builtin_ia32_paddusw256", IX86_BUILTIN_PADDUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27672 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_palignrv2ti, "__builtin_ia32_palignr256", IX86_BUILTIN_PALIGNR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT_CONVERT },
27673 { OPTION_MASK_ISA_AVX2, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX86_BUILTIN_AND256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27674 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27675 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27676 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27677 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI },
27678 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT },
27679 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv32qi3, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27680 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv16hi3, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27681 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv8si3, "__builtin_ia32_pcmpeqd256", IX86_BUILTIN_PCMPEQD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27682 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_eqv4di3, "__builtin_ia32_pcmpeqq256", IX86_BUILTIN_PCMPEQQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27683 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv32qi3, "__builtin_ia32_pcmpgtb256", IX86_BUILTIN_PCMPGTB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27684 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv16hi3, "__builtin_ia32_pcmpgtw256", IX86_BUILTIN_PCMPGTW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27685 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv8si3, "__builtin_ia32_pcmpgtd256", IX86_BUILTIN_PCMPGTD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27686 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_gtv4di3, "__builtin_ia32_pcmpgtq256", IX86_BUILTIN_PCMPGTQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27687 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddwv16hi3, "__builtin_ia32_phaddw256", IX86_BUILTIN_PHADDW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27688 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phadddv8si3, "__builtin_ia32_phaddd256", IX86_BUILTIN_PHADDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27689 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phaddswv16hi3, "__builtin_ia32_phaddsw256", IX86_BUILTIN_PHADDSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27690 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubwv16hi3, "__builtin_ia32_phsubw256", IX86_BUILTIN_PHSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27691 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubdv8si3, "__builtin_ia32_phsubd256", IX86_BUILTIN_PHSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27692 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_phsubswv16hi3, "__builtin_ia32_phsubsw256", IX86_BUILTIN_PHSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27693 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddubsw256, "__builtin_ia32_pmaddubsw256", IX86_BUILTIN_PMADDUBSW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27694 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmaddwd, "__builtin_ia32_pmaddwd256", IX86_BUILTIN_PMADDWD256, UNKNOWN, (int) V8SI_FTYPE_V16HI_V16HI },
27695 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv32qi3, "__builtin_ia32_pmaxsb256", IX86_BUILTIN_PMAXSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27696 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv16hi3, "__builtin_ia32_pmaxsw256", IX86_BUILTIN_PMAXSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27697 { OPTION_MASK_ISA_AVX2, CODE_FOR_smaxv8si3 , "__builtin_ia32_pmaxsd256", IX86_BUILTIN_PMAXSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27698 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv32qi3, "__builtin_ia32_pmaxub256", IX86_BUILTIN_PMAXUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27699 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv16hi3, "__builtin_ia32_pmaxuw256", IX86_BUILTIN_PMAXUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27700 { OPTION_MASK_ISA_AVX2, CODE_FOR_umaxv8si3 , "__builtin_ia32_pmaxud256", IX86_BUILTIN_PMAXUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27701 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv32qi3, "__builtin_ia32_pminsb256", IX86_BUILTIN_PMINSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27702 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv16hi3, "__builtin_ia32_pminsw256", IX86_BUILTIN_PMINSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27703 { OPTION_MASK_ISA_AVX2, CODE_FOR_sminv8si3 , "__builtin_ia32_pminsd256", IX86_BUILTIN_PMINSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27704 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv32qi3, "__builtin_ia32_pminub256", IX86_BUILTIN_PMINUB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27705 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv16hi3, "__builtin_ia32_pminuw256", IX86_BUILTIN_PMINUW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27706 { OPTION_MASK_ISA_AVX2, CODE_FOR_uminv8si3 , "__builtin_ia32_pminud256", IX86_BUILTIN_PMINUD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27707 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmovmskb, "__builtin_ia32_pmovmskb256", IX86_BUILTIN_PMOVMSKB256, UNKNOWN, (int) INT_FTYPE_V32QI },
27708 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv16qiv16hi2, "__builtin_ia32_pmovsxbw256", IX86_BUILTIN_PMOVSXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27709 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8qiv8si2 , "__builtin_ia32_pmovsxbd256", IX86_BUILTIN_PMOVSXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27710 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4qiv4di2 , "__builtin_ia32_pmovsxbq256", IX86_BUILTIN_PMOVSXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27711 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv8hiv8si2 , "__builtin_ia32_pmovsxwd256", IX86_BUILTIN_PMOVSXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27712 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4hiv4di2 , "__builtin_ia32_pmovsxwq256", IX86_BUILTIN_PMOVSXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27713 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sign_extendv4siv4di2 , "__builtin_ia32_pmovsxdq256", IX86_BUILTIN_PMOVSXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27714 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv16qiv16hi2, "__builtin_ia32_pmovzxbw256", IX86_BUILTIN_PMOVZXBW256, UNKNOWN, (int) V16HI_FTYPE_V16QI },
27715 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8qiv8si2 , "__builtin_ia32_pmovzxbd256", IX86_BUILTIN_PMOVZXBD256, UNKNOWN, (int) V8SI_FTYPE_V16QI },
27716 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4qiv4di2 , "__builtin_ia32_pmovzxbq256", IX86_BUILTIN_PMOVZXBQ256, UNKNOWN, (int) V4DI_FTYPE_V16QI },
27717 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv8hiv8si2 , "__builtin_ia32_pmovzxwd256", IX86_BUILTIN_PMOVZXWD256, UNKNOWN, (int) V8SI_FTYPE_V8HI },
27718 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
27719 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
27720 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_even_v8si, "__builtin_ia32_pmuldq256", IX86_BUILTIN_PMULDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27721 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pmulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27722 { OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27723 { OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27724 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27725 { OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27726 { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_even_v8si, "__builtin_ia32_pmuludq256", IX86_BUILTIN_PMULUDQ256, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
27727 { OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27728 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
27729 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27730 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufdv3, "__builtin_ia32_pshufd256", IX86_BUILTIN_PSHUFD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_INT },
27731 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufhwv3, "__builtin_ia32_pshufhw256", IX86_BUILTIN_PSHUFHW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27732 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshuflwv3, "__builtin_ia32_pshuflw256", IX86_BUILTIN_PSHUFLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_INT },
27733 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv32qi3, "__builtin_ia32_psignb256", IX86_BUILTIN_PSIGNB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27734 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27735 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27736 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27737 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27738 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27739 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27740 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27741 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27742 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27743 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27744 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27745 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27746 { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psrad256", IX86_BUILTIN_PSRAD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27747 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrv2ti3, "__builtin_ia32_psrldqi256", IX86_BUILTIN_PSRLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
27748 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlwi256", IX86_BUILTIN_PSRLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
27749 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv16hi3, "__builtin_ia32_psrlw256", IX86_BUILTIN_PSRLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
27750 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrldi256", IX86_BUILTIN_PSRLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
27751 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv8si3, "__builtin_ia32_psrld256", IX86_BUILTIN_PSRLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
27752 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlqi256", IX86_BUILTIN_PSRLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
27753 { OPTION_MASK_ISA_AVX2, CODE_FOR_lshrv4di3, "__builtin_ia32_psrlq256", IX86_BUILTIN_PSRLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
27754 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv32qi3, "__builtin_ia32_psubb256", IX86_BUILTIN_PSUBB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27755 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv16hi3, "__builtin_ia32_psubw256", IX86_BUILTIN_PSUBW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27756 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv8si3, "__builtin_ia32_psubd256", IX86_BUILTIN_PSUBD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27757 { OPTION_MASK_ISA_AVX2, CODE_FOR_subv4di3, "__builtin_ia32_psubq256", IX86_BUILTIN_PSUBQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27758 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv32qi3, "__builtin_ia32_psubsb256", IX86_BUILTIN_PSUBSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27759 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_sssubv16hi3, "__builtin_ia32_psubsw256", IX86_BUILTIN_PSUBSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27760 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv32qi3, "__builtin_ia32_psubusb256", IX86_BUILTIN_PSUBUSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27761 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ussubv16hi3, "__builtin_ia32_psubusw256", IX86_BUILTIN_PSUBUSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27762 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv32qi, "__builtin_ia32_punpckhbw256", IX86_BUILTIN_PUNPCKHBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27763 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv16hi, "__builtin_ia32_punpckhwd256", IX86_BUILTIN_PUNPCKHWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27764 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv8si, "__builtin_ia32_punpckhdq256", IX86_BUILTIN_PUNPCKHDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27765 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_highv4di, "__builtin_ia32_punpckhqdq256", IX86_BUILTIN_PUNPCKHQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27766 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv32qi, "__builtin_ia32_punpcklbw256", IX86_BUILTIN_PUNPCKLBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
27767 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv16hi, "__builtin_ia32_punpcklwd256", IX86_BUILTIN_PUNPCKLWD256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
27768 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv8si, "__builtin_ia32_punpckldq256", IX86_BUILTIN_PUNPCKLDQ256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27769 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_interleave_lowv4di, "__builtin_ia32_punpcklqdq256", IX86_BUILTIN_PUNPCKLQDQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27770 { OPTION_MASK_ISA_AVX2, CODE_FOR_xorv4di3, "__builtin_ia32_pxor256", IX86_BUILTIN_PXOR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27771 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4sf, "__builtin_ia32_vbroadcastss_ps", IX86_BUILTIN_VBROADCASTSS_PS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
27772 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv8sf, "__builtin_ia32_vbroadcastss_ps256", IX86_BUILTIN_VBROADCASTSS_PS256, UNKNOWN, (int) V8SF_FTYPE_V4SF },
27773 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vec_dupv4df, "__builtin_ia32_vbroadcastsd_pd256", IX86_BUILTIN_VBROADCASTSD_PD256, UNKNOWN, (int) V4DF_FTYPE_V2DF },
27774 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_vbroadcasti128_v4di, "__builtin_ia32_vbroadcastsi256", IX86_BUILTIN_VBROADCASTSI256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27775 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv4si, "__builtin_ia32_pblendd128", IX86_BUILTIN_PBLENDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_INT },
27776 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pblenddv8si, "__builtin_ia32_pblendd256", IX86_BUILTIN_PBLENDD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
27777 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv32qi, "__builtin_ia32_pbroadcastb256", IX86_BUILTIN_PBROADCASTB256, UNKNOWN, (int) V32QI_FTYPE_V16QI },
27778 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16hi, "__builtin_ia32_pbroadcastw256", IX86_BUILTIN_PBROADCASTW256, UNKNOWN, (int) V16HI_FTYPE_V8HI },
27779 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8si, "__builtin_ia32_pbroadcastd256", IX86_BUILTIN_PBROADCASTD256, UNKNOWN, (int) V8SI_FTYPE_V4SI },
27780 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4di, "__builtin_ia32_pbroadcastq256", IX86_BUILTIN_PBROADCASTQ256, UNKNOWN, (int) V4DI_FTYPE_V2DI },
27781 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv16qi, "__builtin_ia32_pbroadcastb128", IX86_BUILTIN_PBROADCASTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
27782 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv8hi, "__builtin_ia32_pbroadcastw128", IX86_BUILTIN_PBROADCASTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
27783 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
27784 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
27785 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27786 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
27787 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
27788 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
27789 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
27790 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
27791 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
27792 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27793 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27794 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27795 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27796 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27797 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
27798 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
27799 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv2di, "__builtin_ia32_psrlv2di", IX86_BUILTIN_PSRLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
27800 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv8si, "__builtin_ia32_psrlv8si", IX86_BUILTIN_PSRLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
27801 { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4si, "__builtin_ia32_psrlv4si", IX86_BUILTIN_PSRLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
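/* Illustrative note (informal): each row here pairs an insn expander with a
   user-visible builtin name.  For instance, the CODE_FOR_avx2_ashlvv8si row
   above registers __builtin_ia32_psllv8si, which avx2intrin.h typically wraps
   as _mm256_sllv_epi32, so with -mavx2 a per-element variable shift can be
   written as:

     __m256i shift_each (__m256i x, __m256i count)
     {
       return _mm256_sllv_epi32 (x, count);
     }
   */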
27803 { OPTION_MASK_ISA_LZCNT, CODE_FOR_clzhi2_lzcnt, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27805 /* BMI */
27806 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27807 { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27808 { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },
27810 /* TBM */
27811 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27812 { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27814 /* F16C */
27815 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
27816 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
27817 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
27818 { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
27820 /* BMI2 */
27821 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_si3, "__builtin_ia32_bzhi_si", IX86_BUILTIN_BZHI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27822 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_bzhi_di3, "__builtin_ia32_bzhi_di", IX86_BUILTIN_BZHI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27823 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_si3, "__builtin_ia32_pdep_si", IX86_BUILTIN_PDEP32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27824 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pdep_di3, "__builtin_ia32_pdep_di", IX86_BUILTIN_PDEP64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27825 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_si3, "__builtin_ia32_pext_si", IX86_BUILTIN_PEXT32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
27826 { OPTION_MASK_ISA_BMI2, CODE_FOR_bmi2_pext_di3, "__builtin_ia32_pext_di", IX86_BUILTIN_PEXT64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
27827 };
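/* Illustrative note: ix86_init_mmx_sse_builtins below walks these description
   tables and registers every row, so e.g. the BMI2 PDEP entry makes
   __builtin_ia32_pdep_si callable directly when compiling with -mbmi2:

     unsigned int scatter_bits (unsigned int src, unsigned int mask)
     {
       return __builtin_ia32_pdep_si (src, mask);
     }

   (bmi2intrin.h normally exposes the same builtin as _pdep_u32.)  */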
27829 /* FMA4 and XOP. */
27830 #define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT
27831 #define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT
27832 #define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT
27833 #define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT
27834 #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF
27835 #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF
27836 #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF
27837 #define MULTI_ARG_3_DF2 V4DF_FTYPE_V4DF_V4DF_V4DF
27838 #define MULTI_ARG_3_DI V2DI_FTYPE_V2DI_V2DI_V2DI
27839 #define MULTI_ARG_3_SI V4SI_FTYPE_V4SI_V4SI_V4SI
27840 #define MULTI_ARG_3_SI_DI V4SI_FTYPE_V4SI_V4SI_V2DI
27841 #define MULTI_ARG_3_HI V8HI_FTYPE_V8HI_V8HI_V8HI
27842 #define MULTI_ARG_3_HI_SI V8HI_FTYPE_V8HI_V8HI_V4SI
27843 #define MULTI_ARG_3_QI V16QI_FTYPE_V16QI_V16QI_V16QI
27844 #define MULTI_ARG_3_DI2 V4DI_FTYPE_V4DI_V4DI_V4DI
27845 #define MULTI_ARG_3_SI2 V8SI_FTYPE_V8SI_V8SI_V8SI
27846 #define MULTI_ARG_3_HI2 V16HI_FTYPE_V16HI_V16HI_V16HI
27847 #define MULTI_ARG_3_QI2 V32QI_FTYPE_V32QI_V32QI_V32QI
27848 #define MULTI_ARG_2_SF V4SF_FTYPE_V4SF_V4SF
27849 #define MULTI_ARG_2_DF V2DF_FTYPE_V2DF_V2DF
27850 #define MULTI_ARG_2_DI V2DI_FTYPE_V2DI_V2DI
27851 #define MULTI_ARG_2_SI V4SI_FTYPE_V4SI_V4SI
27852 #define MULTI_ARG_2_HI V8HI_FTYPE_V8HI_V8HI
27853 #define MULTI_ARG_2_QI V16QI_FTYPE_V16QI_V16QI
27854 #define MULTI_ARG_2_DI_IMM V2DI_FTYPE_V2DI_SI
27855 #define MULTI_ARG_2_SI_IMM V4SI_FTYPE_V4SI_SI
27856 #define MULTI_ARG_2_HI_IMM V8HI_FTYPE_V8HI_SI
27857 #define MULTI_ARG_2_QI_IMM V16QI_FTYPE_V16QI_SI
27858 #define MULTI_ARG_2_DI_CMP V2DI_FTYPE_V2DI_V2DI_CMP
27859 #define MULTI_ARG_2_SI_CMP V4SI_FTYPE_V4SI_V4SI_CMP
27860 #define MULTI_ARG_2_HI_CMP V8HI_FTYPE_V8HI_V8HI_CMP
27861 #define MULTI_ARG_2_QI_CMP V16QI_FTYPE_V16QI_V16QI_CMP
27862 #define MULTI_ARG_2_SF_TF V4SF_FTYPE_V4SF_V4SF_TF
27863 #define MULTI_ARG_2_DF_TF V2DF_FTYPE_V2DF_V2DF_TF
27864 #define MULTI_ARG_2_DI_TF V2DI_FTYPE_V2DI_V2DI_TF
27865 #define MULTI_ARG_2_SI_TF V4SI_FTYPE_V4SI_V4SI_TF
27866 #define MULTI_ARG_2_HI_TF V8HI_FTYPE_V8HI_V8HI_TF
27867 #define MULTI_ARG_2_QI_TF V16QI_FTYPE_V16QI_V16QI_TF
27868 #define MULTI_ARG_1_SF V4SF_FTYPE_V4SF
27869 #define MULTI_ARG_1_DF V2DF_FTYPE_V2DF
27870 #define MULTI_ARG_1_SF2 V8SF_FTYPE_V8SF
27871 #define MULTI_ARG_1_DF2 V4DF_FTYPE_V4DF
27872 #define MULTI_ARG_1_DI V2DI_FTYPE_V2DI
27873 #define MULTI_ARG_1_SI V4SI_FTYPE_V4SI
27874 #define MULTI_ARG_1_HI V8HI_FTYPE_V8HI
27875 #define MULTI_ARG_1_QI V16QI_FTYPE_V16QI
27876 #define MULTI_ARG_1_SI_DI V2DI_FTYPE_V4SI
27877 #define MULTI_ARG_1_HI_DI V2DI_FTYPE_V8HI
27878 #define MULTI_ARG_1_HI_SI V4SI_FTYPE_V8HI
27879 #define MULTI_ARG_1_QI_DI V2DI_FTYPE_V16QI
27880 #define MULTI_ARG_1_QI_SI V4SI_FTYPE_V16QI
27881 #define MULTI_ARG_1_QI_HI V8HI_FTYPE_V16QI
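/* Note: the MULTI_ARG_* names above are simply shorthands for
   ix86_builtin_func_type values used in bdesc_multi_arg below; e.g.
   MULTI_ARG_3_SF stands for V4SF_FTYPE_V4SF_V4SF_V4SF, i.e. a builtin taking
   three V4SF operands and returning a V4SF, as used by
   __builtin_ia32_vfmaddss.  */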
27883 static const struct builtin_description bdesc_multi_arg[] =
27884 {
27885 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
27886 "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
27887 UNKNOWN, (int)MULTI_ARG_3_SF },
27888 { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
27889 "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
27890 UNKNOWN, (int)MULTI_ARG_3_DF },
27892 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v4sf,
27893 "__builtin_ia32_vfmaddss3", IX86_BUILTIN_VFMADDSS3,
27894 UNKNOWN, (int)MULTI_ARG_3_SF },
27895 { OPTION_MASK_ISA_FMA, CODE_FOR_fmai_vmfmadd_v2df,
27896 "__builtin_ia32_vfmaddsd3", IX86_BUILTIN_VFMADDSD3,
27897 UNKNOWN, (int)MULTI_ARG_3_DF },
27899 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
27900 "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
27901 UNKNOWN, (int)MULTI_ARG_3_SF },
27902 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
27903 "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
27904 UNKNOWN, (int)MULTI_ARG_3_DF },
27905 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
27906 "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
27907 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27908 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
27909 "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
27910 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27912 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
27913 "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
27914 UNKNOWN, (int)MULTI_ARG_3_SF },
27915 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
27916 "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
27917 UNKNOWN, (int)MULTI_ARG_3_DF },
27918 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
27919 "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
27920 UNKNOWN, (int)MULTI_ARG_3_SF2 },
27921 { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
27922 "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
27923 UNKNOWN, (int)MULTI_ARG_3_DF2 },
27925 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
27926 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
27927 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
27928 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
27929 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi",IX86_BUILTIN_VPCMOV_V16QI,UNKNOWN, (int)MULTI_ARG_3_QI },
27930 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
27931 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },
27933 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27934 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
27935 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
27936 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
27937 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
27938 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
27939 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },
27941 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },
27943 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27944 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
27945 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27946 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27947 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27948 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
27949 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27950 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27951 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27952 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
27953 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27954 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
27956 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27957 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
27958 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
27959 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
27960 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
27961 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
27962 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
27963 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
27964 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27965 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
27966 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
27967 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
27968 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
27969 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
27970 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
27971 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },
27973 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_1_SF },
27974 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_1_DF },
27975 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
27976 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
27977 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
27978 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },
27980 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27981 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27982 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27983 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27984 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27985 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27986 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27987 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
27988 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
27989 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27990 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
27991 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27992 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
27993 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
27994 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
27996 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
27997 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27998 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
27999 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
28000 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
28001 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
28002 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },
28004 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
28005 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
28006 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
28007 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
28008 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
28009 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
28010 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },
28012 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
28013 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
28014 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
28015 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
28016 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
28017 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
28018 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },
28020 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28021 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28022 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
28023 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
28024 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
28025 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
28026 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },
28028 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
28029 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28030 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3,"__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
28031 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
28032 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
28033 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
28034 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },
28036 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
28037 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28038 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
28039 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
28040 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
28041 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
28042 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },
28044 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
28045 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28046 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
28047 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
28048 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
28049 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
28050 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },
28052 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
28053 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28054 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
28055 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
28056 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
28057 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
28058 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },
28060 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28061 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28062 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28063 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28064 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub",IX86_BUILTIN_VPCOMFALSEUB,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
28065 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw",IX86_BUILTIN_VPCOMFALSEUW,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
28066 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud",IX86_BUILTIN_VPCOMFALSEUD,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
28067 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq",IX86_BUILTIN_VPCOMFALSEUQ,(enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
28069 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28070 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28071 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28072 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28073 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
28074 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
28075 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
28076 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
28078 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
28079 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
28080 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
28081 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
28082 };
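/* Illustrative note: many VPCOM* rows above share a single insn code
   (e.g. CODE_FOR_xop_maskcmpv16qi3) and differ only in the rtx comparison
   code column (EQ, NE, LT, ...); the multi-arg expander uses that code to
   select the condition, which is how __builtin_ia32_vpcomltb and
   __builtin_ia32_vpcomgeb map onto the same pattern.  */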
28085 /* TM vector builtins. */
28087 /* Reuse the existing x86-specific `struct builtin_description' because
28088 we're lazy.  Add casts to make them fit.  */
28089 static const struct builtin_description bdesc_tm[] =
28090 {
28091 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28092 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28093 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
28094 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28095 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28096 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28097 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
28099 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28100 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28101 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
28102 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28103 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28104 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28105 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
28107 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28108 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28109 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
28110 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28111 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28112 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28113 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
28115 { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
28116 { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
28117 { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
28118 };
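/* Illustrative note: each row above binds a libitm-style name to a generic
   transactional-memory builtin code; e.g. "__builtin__ITM_WM128" becomes
   BUILT_IN_TM_STORE_M128 with type VOID_FTYPE_PV4SF_V4SF.  As registered by
   ix86_init_tm_builtins below, the name is also callable without the
   "__builtin_" prefix (see the d->name + strlen ("__builtin_") argument), so
   -fgnu-tm code can reference _ITM_WM128 directly.  */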
28120 /* TM callbacks. */
28122 /* Return the builtin decl needed to load a vector of TYPE. */
28124 static tree
28125 ix86_builtin_tm_load (tree type)
28126 {
28127 if (TREE_CODE (type) == VECTOR_TYPE)
28128 {
28129 switch (tree_low_cst (TYPE_SIZE (type), 1))
28130 {
28131 case 64:
28132 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
28133 case 128:
28134 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
28135 case 256:
28136 return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
28137 }
28138 }
28139 return NULL_TREE;
28140 }
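/* Example (informal): for a 128-bit vector type such as V4SF,
   tree_low_cst (TYPE_SIZE (type), 1) evaluates to 128, so the hook above
   returns builtin_decl_explicit (BUILT_IN_TM_LOAD_M128), i.e. the
   __builtin__ITM_RM128 entry from bdesc_tm; for non-vector types or other
   sizes it returns NULL_TREE and no vector TM load builtin is used.  */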
28142 /* Return the builtin decl needed to store a vector of TYPE. */
28144 static tree
28145 ix86_builtin_tm_store (tree type)
28146 {
28147 if (TREE_CODE (type) == VECTOR_TYPE)
28148 {
28149 switch (tree_low_cst (TYPE_SIZE (type), 1))
28150 {
28151 case 64:
28152 return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
28153 case 128:
28154 return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
28155 case 256:
28156 return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
28157 }
28158 }
28159 return NULL_TREE;
28160 }
28162 /* Initialize the transactional memory vector load/store builtins. */
28164 static void
28165 ix86_init_tm_builtins (void)
28167 enum ix86_builtin_func_type ftype;
28168 const struct builtin_description *d;
28169 size_t i;
28170 tree decl;
28171 tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
28172 tree attrs_log, attrs_type_log;
28174 if (!flag_tm)
28175 return;
28177 /* If there are no builtins defined, we must be compiling in a
28178 language without trans-mem support. */
28179 if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
28180 return;
28182 /* Use whatever attributes a normal TM load has. */
28183 decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
28184 attrs_load = DECL_ATTRIBUTES (decl);
28185 attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28186 /* Use whatever attributes a normal TM store has. */
28187 decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
28188 attrs_store = DECL_ATTRIBUTES (decl);
28189 attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28190 /* Use whatever attributes a normal TM log has. */
28191 decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
28192 attrs_log = DECL_ATTRIBUTES (decl);
28193 attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
28195 for (i = 0, d = bdesc_tm;
28196 i < ARRAY_SIZE (bdesc_tm);
28197 i++, d++)
28199 if ((d->mask & ix86_isa_flags) != 0
28200 || (lang_hooks.builtin_function
28201 == lang_hooks.builtin_function_ext_scope))
28203 tree type, attrs, attrs_type;
28204 enum built_in_function code = (enum built_in_function) d->code;
28206 ftype = (enum ix86_builtin_func_type) d->flag;
28207 type = ix86_get_builtin_func_type (ftype);
28209 if (BUILTIN_TM_LOAD_P (code))
28211 attrs = attrs_load;
28212 attrs_type = attrs_type_load;
28214 else if (BUILTIN_TM_STORE_P (code))
28216 attrs = attrs_store;
28217 attrs_type = attrs_type_store;
28219 else
28221 attrs = attrs_log;
28222 attrs_type = attrs_type_log;
28224 decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
28225 /* The builtin without the prefix for
28226 calling it directly. */
28227 d->name + strlen ("__builtin_"),
28228 attrs);
28229 /* add_builtin_function () will set the DECL_ATTRIBUTES; now
28230 set the TYPE_ATTRIBUTES.  */
28231 decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
28233 set_builtin_decl (code, decl, false);
28238 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
28239 in the current target ISA, so that the user can compile particular modules
28240 with target-specific options that differ from the command-line
28241 options.  */
28242 static void
28243 ix86_init_mmx_sse_builtins (void)
28245 const struct builtin_description * d;
28246 enum ix86_builtin_func_type ftype;
28247 size_t i;
28249 /* Add all special builtins with variable number of operands. */
28250 for (i = 0, d = bdesc_special_args;
28251 i < ARRAY_SIZE (bdesc_special_args);
28252 i++, d++)
28254 if (d->name == 0)
28255 continue;
28257 ftype = (enum ix86_builtin_func_type) d->flag;
28258 def_builtin (d->mask, d->name, ftype, d->code);
28261 /* Add all builtins with variable number of operands. */
28262 for (i = 0, d = bdesc_args;
28263 i < ARRAY_SIZE (bdesc_args);
28264 i++, d++)
28266 if (d->name == 0)
28267 continue;
28269 ftype = (enum ix86_builtin_func_type) d->flag;
28270 def_builtin_const (d->mask, d->name, ftype, d->code);
28273 /* pcmpestr[im] insns. */
28274 for (i = 0, d = bdesc_pcmpestr;
28275 i < ARRAY_SIZE (bdesc_pcmpestr);
28276 i++, d++)
28278 if (d->code == IX86_BUILTIN_PCMPESTRM128)
28279 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
28280 else
28281 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
28282 def_builtin_const (d->mask, d->name, ftype, d->code);
28285 /* pcmpistr[im] insns. */
28286 for (i = 0, d = bdesc_pcmpistr;
28287 i < ARRAY_SIZE (bdesc_pcmpistr);
28288 i++, d++)
28290 if (d->code == IX86_BUILTIN_PCMPISTRM128)
28291 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
28292 else
28293 ftype = INT_FTYPE_V16QI_V16QI_INT;
28294 def_builtin_const (d->mask, d->name, ftype, d->code);
28297 /* comi/ucomi insns. */
28298 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
28300 if (d->mask == OPTION_MASK_ISA_SSE2)
28301 ftype = INT_FTYPE_V2DF_V2DF;
28302 else
28303 ftype = INT_FTYPE_V4SF_V4SF;
28304 def_builtin_const (d->mask, d->name, ftype, d->code);
28307 /* SSE */
28308 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
28309 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
28310 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
28311 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
28313 /* SSE or 3DNow!A */
28314 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28315 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
28316 IX86_BUILTIN_MASKMOVQ);
28318 /* SSE2 */
28319 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
28320 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
28322 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
28323 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
28324 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
28325 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
28327 /* SSE3. */
28328 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
28329 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
28330 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
28331 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
28333 /* AES */
28334 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
28335 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
28336 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
28337 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
28338 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
28339 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
28340 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
28341 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
28342 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
28343 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
28344 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
28345 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
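/* Usage sketch (informal): __builtin_ia32_aesenc128 performs one AES round on
   a pair of V2DI values; wmmintrin.h typically exposes it as _mm_aesenc_si128,
   so with -maes one can write:

     __m128i aes_round (__m128i state, __m128i round_key)
     {
       return _mm_aesenc_si128 (state, round_key);
     }
   */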
28347 /* PCLMUL */
28348 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
28349 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
28351 /* RDRND */
28352 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
28353 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
28354 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
28355 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
28356 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
28357 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
28358 IX86_BUILTIN_RDRAND64_STEP);
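/* Usage sketch (informal): the RDRAND *_step builtins store a random value
   through their pointer argument and return nonzero on success, e.g.:

     unsigned int v;
     if (__builtin_ia32_rdrand32_step (&v))
       use (v);

   immintrin.h normally exposes this as _rdrand32_step; "use" above is just a
   placeholder.  */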
28360 /* AVX2 */
28361 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2df",
28362 V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT,
28363 IX86_BUILTIN_GATHERSIV2DF);
28365 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4df",
28366 V4DF_FTYPE_V4DF_PCDOUBLE_V4SI_V4DF_INT,
28367 IX86_BUILTIN_GATHERSIV4DF);
28369 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2df",
28370 V2DF_FTYPE_V2DF_PCDOUBLE_V2DI_V2DF_INT,
28371 IX86_BUILTIN_GATHERDIV2DF);
28373 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4df",
28374 V4DF_FTYPE_V4DF_PCDOUBLE_V4DI_V4DF_INT,
28375 IX86_BUILTIN_GATHERDIV4DF);
28377 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4sf",
28378 V4SF_FTYPE_V4SF_PCFLOAT_V4SI_V4SF_INT,
28379 IX86_BUILTIN_GATHERSIV4SF);
28381 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8sf",
28382 V8SF_FTYPE_V8SF_PCFLOAT_V8SI_V8SF_INT,
28383 IX86_BUILTIN_GATHERSIV8SF);
28385 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf",
28386 V4SF_FTYPE_V4SF_PCFLOAT_V2DI_V4SF_INT,
28387 IX86_BUILTIN_GATHERDIV4SF);
28389 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4sf256",
28390 V4SF_FTYPE_V4SF_PCFLOAT_V4DI_V4SF_INT,
28391 IX86_BUILTIN_GATHERDIV8SF);
28393 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv2di",
28394 V2DI_FTYPE_V2DI_PCINT64_V4SI_V2DI_INT,
28395 IX86_BUILTIN_GATHERSIV2DI);
28397 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4di",
28398 V4DI_FTYPE_V4DI_PCINT64_V4SI_V4DI_INT,
28399 IX86_BUILTIN_GATHERSIV4DI);
28401 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv2di",
28402 V2DI_FTYPE_V2DI_PCINT64_V2DI_V2DI_INT,
28403 IX86_BUILTIN_GATHERDIV2DI);
28405 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4di",
28406 V4DI_FTYPE_V4DI_PCINT64_V4DI_V4DI_INT,
28407 IX86_BUILTIN_GATHERDIV4DI);
28409 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv4si",
28410 V4SI_FTYPE_V4SI_PCINT_V4SI_V4SI_INT,
28411 IX86_BUILTIN_GATHERSIV4SI);
28413 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gathersiv8si",
28414 V8SI_FTYPE_V8SI_PCINT_V8SI_V8SI_INT,
28415 IX86_BUILTIN_GATHERSIV8SI);
28417 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si",
28418 V4SI_FTYPE_V4SI_PCINT_V2DI_V4SI_INT,
28419 IX86_BUILTIN_GATHERDIV4SI);
28421 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatherdiv4si256",
28422 V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
28423 IX86_BUILTIN_GATHERDIV8SI);
28425 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
28426 V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
28427 IX86_BUILTIN_GATHERALTSIV4DF);
28429 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
28430 V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
28431 IX86_BUILTIN_GATHERALTDIV8SF);
28433 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
28434 V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
28435 IX86_BUILTIN_GATHERALTSIV4DI);
28437 def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
28438 V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
28439 IX86_BUILTIN_GATHERALTDIV8SI);
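/* Note on the gather signatures above: the function types read as
   (source, base pointer, index vector, mask, scale).  E.g.
   V2DF_FTYPE_V2DF_PCDOUBLE_V4SI_V2DF_INT for __builtin_ia32_gathersiv2df is a
   V2DF pass-through source, a const double * base, a V4SI index vector, a
   V2DF element mask and an integer scale, matching the VGATHERDPD operands.  */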
28441 /* RTM. */
28442 def_builtin (OPTION_MASK_ISA_RTM, "__builtin_ia32_xabort",
28443 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_XABORT);
28445 /* MMX access to the vec_init patterns. */
28446 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
28447 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
28449 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
28450 V4HI_FTYPE_HI_HI_HI_HI,
28451 IX86_BUILTIN_VEC_INIT_V4HI);
28453 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
28454 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
28455 IX86_BUILTIN_VEC_INIT_V8QI);
28457 /* Access to the vec_extract patterns. */
28458 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
28459 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
28460 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
28461 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
28462 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
28463 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
28464 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
28465 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
28466 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
28467 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
28469 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28470 "__builtin_ia32_vec_ext_v4hi",
28471 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
28473 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
28474 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
28476 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
28477 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
28479 /* Access to the vec_set patterns. */
28480 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
28481 "__builtin_ia32_vec_set_v2di",
28482 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
28484 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
28485 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
28487 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
28488 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
28490 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
28491 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
28493 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
28494 "__builtin_ia32_vec_set_v4hi",
28495 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
28497 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
28498 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
28500 /* RDSEED */
28501 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_hi_step",
28502 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDSEED16_STEP);
28503 def_builtin (OPTION_MASK_ISA_RDSEED, "__builtin_ia32_rdseed_si_step",
28504 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDSEED32_STEP);
28505 def_builtin (OPTION_MASK_ISA_RDSEED | OPTION_MASK_ISA_64BIT,
28506 "__builtin_ia32_rdseed_di_step",
28507 INT_FTYPE_PULONGLONG, IX86_BUILTIN_RDSEED64_STEP);
28509 /* ADCX */
28510 def_builtin (0, "__builtin_ia32_addcarryx_u32",
28511 UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED, IX86_BUILTIN_ADDCARRYX32);
28512 def_builtin (OPTION_MASK_ISA_64BIT,
28513 "__builtin_ia32_addcarryx_u64",
28514 UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG,
28515 IX86_BUILTIN_ADDCARRYX64);
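/* Usage sketch (informal): __builtin_ia32_addcarryx_u32 takes a carry-in, two
   32-bit addends and a pointer for the sum, returning the carry-out, e.g.:

     unsigned int sum;
     unsigned char carry = __builtin_ia32_addcarryx_u32 (0, a, b, &sum);

   where a and b stand for arbitrary unsigned ints; adxintrin.h normally
   exposes this as _addcarryx_u32.  */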
28517 /* Add the multi-arg builtin instructions (FMA, FMA4 and XOP).  */
28518 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
28520 if (d->name == 0)
28521 continue;
28523 ftype = (enum ix86_builtin_func_type) d->flag;
28524 def_builtin_const (d->mask, d->name, ftype, d->code);
28528 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
28529 to return a pointer to VERSION_DECL if the outcome of the expression
28530 formed by PREDICATE_CHAIN is true. This function will be called during
28531 version dispatch to decide which function version to execute. It returns
28532 the basic block at the end, to which more conditions can be added. */
28534 static basic_block
28535 add_condition_to_bb (tree function_decl, tree version_decl,
28536 tree predicate_chain, basic_block new_bb)
28538 gimple return_stmt;
28539 tree convert_expr, result_var;
28540 gimple convert_stmt;
28541 gimple call_cond_stmt;
28542 gimple if_else_stmt;
28544 basic_block bb1, bb2, bb3;
28545 edge e12, e23;
28547 tree cond_var, and_expr_var = NULL_TREE;
28548 gimple_seq gseq;
28550 tree predicate_decl, predicate_arg;
28552 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
28554 gcc_assert (new_bb != NULL);
28555 gseq = bb_seq (new_bb);
28558 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
28559 build_fold_addr_expr (version_decl));
28560 result_var = create_tmp_var (ptr_type_node, NULL);
28561 convert_stmt = gimple_build_assign (result_var, convert_expr);
28562 return_stmt = gimple_build_return (result_var);
28564 if (predicate_chain == NULL_TREE)
28566 gimple_seq_add_stmt (&gseq, convert_stmt);
28567 gimple_seq_add_stmt (&gseq, return_stmt);
28568 set_bb_seq (new_bb, gseq);
28569 gimple_set_bb (convert_stmt, new_bb);
28570 gimple_set_bb (return_stmt, new_bb);
28571 pop_cfun ();
28572 return new_bb;
28575 while (predicate_chain != NULL)
28577 cond_var = create_tmp_var (integer_type_node, NULL);
28578 predicate_decl = TREE_PURPOSE (predicate_chain);
28579 predicate_arg = TREE_VALUE (predicate_chain);
28580 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
28581 gimple_call_set_lhs (call_cond_stmt, cond_var);
28583 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
28584 gimple_set_bb (call_cond_stmt, new_bb);
28585 gimple_seq_add_stmt (&gseq, call_cond_stmt);
28587 predicate_chain = TREE_CHAIN (predicate_chain);
28589 if (and_expr_var == NULL)
28590 and_expr_var = cond_var;
28591 else
28593 gimple assign_stmt;
28594 /* Use MIN_EXPR to check whether any of the predicate results is zero:
28595 and_expr_var = MIN_EXPR <cond_var, and_expr_var>.  */
28596 assign_stmt = gimple_build_assign (and_expr_var,
28597 build2 (MIN_EXPR, integer_type_node,
28598 cond_var, and_expr_var));
28600 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
28601 gimple_set_bb (assign_stmt, new_bb);
28602 gimple_seq_add_stmt (&gseq, assign_stmt);
28606 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
28607 integer_zero_node,
28608 NULL_TREE, NULL_TREE);
28609 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
28610 gimple_set_bb (if_else_stmt, new_bb);
28611 gimple_seq_add_stmt (&gseq, if_else_stmt);
28613 gimple_seq_add_stmt (&gseq, convert_stmt);
28614 gimple_seq_add_stmt (&gseq, return_stmt);
28615 set_bb_seq (new_bb, gseq);
28617 bb1 = new_bb;
28618 e12 = split_block (bb1, if_else_stmt);
28619 bb2 = e12->dest;
28620 e12->flags &= ~EDGE_FALLTHRU;
28621 e12->flags |= EDGE_TRUE_VALUE;
28623 e23 = split_block (bb2, return_stmt);
28625 gimple_set_bb (convert_stmt, bb2);
28626 gimple_set_bb (return_stmt, bb2);
28628 bb3 = e23->dest;
28629 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
28631 remove_edge (e23);
28632 make_edge (bb2, EXIT_BLOCK_PTR, 0);
28634 pop_cfun ();
28636 return bb3;
28637 }
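/* Informal sketch of the GIMPLE emitted above for one predicate in the chain:

     cond_var = predicate_decl (predicate_arg);
     and_expr_var = MIN_EXPR <cond_var, and_expr_var>;
     if (and_expr_var > 0)
       {
         result_var = (void *) &version_decl;
         return result_var;
       }

   with the false edge leading to bb3, where the next condition (if any) is
   appended by a later call.  */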
28639 /* This parses the attribute arguments to target in DECL and determines
28640 the right builtin to use to match the platform specification.
28641 It returns the priority value for this version decl. If PREDICATE_LIST
28642 is not NULL, it stores the list of cpu features that need to be checked
28643 before dispatching this function. */
28645 static unsigned int
28646 get_builtin_code_for_version (tree decl, tree *predicate_list)
28648 tree attrs;
28649 struct cl_target_option cur_target;
28650 tree target_node;
28651 struct cl_target_option *new_target;
28652 const char *arg_str = NULL;
28653 const char *attrs_str = NULL;
28654 char *tok_str = NULL;
28655 char *token;
28657 /* Priority of i386 features, greater value is higher priority. This is
28658 used to decide the order in which function dispatch must happen. For
28659 instance, a version specialized for SSE4.2 should be checked for dispatch
28660 before a version for SSE3, as SSE4.2 implies SSE3. */
28661 enum feature_priority
28663 P_ZERO = 0,
28664 P_MMX,
28665 P_SSE,
28666 P_SSE2,
28667 P_SSE3,
28668 P_SSSE3,
28669 P_PROC_SSSE3,
28670 P_SSE4_a,
28671 P_PROC_SSE4_a,
28672 P_SSE4_1,
28673 P_SSE4_2,
28674 P_PROC_SSE4_2,
28675 P_POPCNT,
28676 P_AVX,
28677 P_AVX2,
28678 P_FMA,
28679 P_PROC_FMA
28682 enum feature_priority priority = P_ZERO;
28684 /* These are the target attribute strings for which a dispatcher is
28685 available, from fold_builtin_cpu. */
28687 static struct _feature_list
28689 const char *const name;
28690 const enum feature_priority priority;
28692 const feature_list[] =
28694 {"mmx", P_MMX},
28695 {"sse", P_SSE},
28696 {"sse2", P_SSE2},
28697 {"sse3", P_SSE3},
28698 {"ssse3", P_SSSE3},
28699 {"sse4.1", P_SSE4_1},
28700 {"sse4.2", P_SSE4_2},
28701 {"popcnt", P_POPCNT},
28702 {"avx", P_AVX},
28703 {"avx2", P_AVX2}
28707 static unsigned int NUM_FEATURES
28708 = sizeof (feature_list) / sizeof (struct _feature_list);
28710 unsigned int i;
28712 tree predicate_chain = NULL_TREE;
28713 tree predicate_decl, predicate_arg;
28715 attrs = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
28716 gcc_assert (attrs != NULL);
28718 attrs = TREE_VALUE (TREE_VALUE (attrs));
28720 gcc_assert (TREE_CODE (attrs) == STRING_CST);
28721 attrs_str = TREE_STRING_POINTER (attrs);
28723 /* Return priority zero for default function. */
28724 if (strcmp (attrs_str, "default") == 0)
28725 return 0;
28727 /* Handle arch= if specified. For priority, set it to be 1 more than
28728 the best instruction set the processor can handle. For instance, if
28729 there is a version for atom and a version for ssse3 (the highest ISA
28730 priority for atom), the atom version must be checked for dispatch
28731 before the ssse3 version. */
28732 if (strstr (attrs_str, "arch=") != NULL)
28734 cl_target_option_save (&cur_target, &global_options);
28735 target_node = ix86_valid_target_attribute_tree (attrs);
28737 gcc_assert (target_node);
28738 new_target = TREE_TARGET_OPTION (target_node);
28739 gcc_assert (new_target);
28741 if (new_target->arch_specified && new_target->arch > 0)
28743 switch (new_target->arch)
28745 case PROCESSOR_CORE2:
28746 arg_str = "core2";
28747 priority = P_PROC_SSSE3;
28748 break;
28749 case PROCESSOR_COREI7:
28750 arg_str = "corei7";
28751 priority = P_PROC_SSE4_2;
28752 break;
28753 case PROCESSOR_ATOM:
28754 arg_str = "atom";
28755 priority = P_PROC_SSSE3;
28756 break;
28757 case PROCESSOR_AMDFAM10:
28758 arg_str = "amdfam10h";
28759 priority = P_PROC_SSE4_a;
28760 break;
28761 case PROCESSOR_BDVER1:
28762 arg_str = "bdver1";
28763 priority = P_PROC_FMA;
28764 break;
28765 case PROCESSOR_BDVER2:
28766 arg_str = "bdver2";
28767 priority = P_PROC_FMA;
28768 break;
28772 cl_target_option_restore (&global_options, &cur_target);
28774 if (predicate_list && arg_str == NULL)
28776 error_at (DECL_SOURCE_LOCATION (decl),
28777 "No dispatcher found for the versioning attributes");
28778 return 0;
28781 if (predicate_list)
28783 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS];
28784 /* For a C string literal the length includes the trailing NULL. */
28785 predicate_arg = build_string_literal (strlen (arg_str) + 1, arg_str);
28786 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28787 predicate_chain);
28791 /* Process feature name. */
28792 tok_str = (char *) xmalloc (strlen (attrs_str) + 1);
28793 strcpy (tok_str, attrs_str);
28794 token = strtok (tok_str, ",");
28795 predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_SUPPORTS];
28797 while (token != NULL)
28799 /* Do not process "arch=" */
28800 if (strncmp (token, "arch=", 5) == 0)
28802 token = strtok (NULL, ",");
28803 continue;
28805 for (i = 0; i < NUM_FEATURES; ++i)
28807 if (strcmp (token, feature_list[i].name) == 0)
28809 if (predicate_list)
28811 predicate_arg = build_string_literal (
28812 strlen (feature_list[i].name) + 1,
28813 feature_list[i].name);
28814 predicate_chain = tree_cons (predicate_decl, predicate_arg,
28815 predicate_chain);
28817 /* Find the maximum priority feature. */
28818 if (feature_list[i].priority > priority)
28819 priority = feature_list[i].priority;
28821 break;
28824 if (predicate_list && i == NUM_FEATURES)
28826 error_at (DECL_SOURCE_LOCATION (decl),
28827 "No dispatcher found for %s", token);
28828 return 0;
28830 token = strtok (NULL, ",");
28832 free (tok_str);
28834 if (predicate_list && predicate_chain == NULL_TREE)
28836 error_at (DECL_SOURCE_LOCATION (decl),
28837 "No dispatcher found for the versioning attributes : %s",
28838 attrs_str);
28839 return 0;
28841 else if (predicate_list)
28843 predicate_chain = nreverse (predicate_chain);
28844 *predicate_list = predicate_chain;
28847 return priority;
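/* Illustrative source that this parsing serves (function multiversioning
   via the "target" attribute, a C++-only feature in this release).  The
   strings map to the priorities above, e.g. "arch=corei7" gives
   P_PROC_SSE4_2 and "sse4.2" gives P_SSE4_2, so the arch=corei7 version is
   checked for dispatch before the sse4.2 one:

     __attribute__ ((target ("default")))      int foo () { return 0; }
     __attribute__ ((target ("sse4.2")))       int foo () { return 1; }
     __attribute__ ((target ("arch=corei7")))  int foo () { return 2; }
*/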
28850 /* This compares the priority of target features in function DECL1
28851 and DECL2. It returns positive value if DECL1 is higher priority,
28852 negative value if DECL2 is higher priority and 0 if they are the
28853 same. */
28855 static int
28856 ix86_compare_version_priority (tree decl1, tree decl2)
28858 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
28859 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
28861 return (int)priority1 - (int)priority2;
28864 /* V1 and V2 point to function versions with different priorities
28865 based on the target ISA. This function compares their priorities. */
28867 static int
28868 feature_compare (const void *v1, const void *v2)
28870 typedef struct _function_version_info
28872 tree version_decl;
28873 tree predicate_chain;
28874 unsigned int dispatch_priority;
28875 } function_version_info;
28877 const function_version_info c1 = *(const function_version_info *)v1;
28878 const function_version_info c2 = *(const function_version_info *)v2;
28879 return (c2.dispatch_priority - c1.dispatch_priority);
28882 /* This function generates the dispatch function for
28883 multi-versioned functions. DISPATCH_DECL is the function which will
28884 contain the dispatch logic. FNDECLS are the function choices for
28885 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
28886 in DISPATCH_DECL in which the dispatch code is generated. */
28888 static int
28889 dispatch_function_versions (tree dispatch_decl,
28890 void *fndecls_p,
28891 basic_block *empty_bb)
28893 tree default_decl;
28894 gimple ifunc_cpu_init_stmt;
28895 gimple_seq gseq;
28896 int ix;
28897 tree ele;
28898 vec<tree> *fndecls;
28899 unsigned int num_versions = 0;
28900 unsigned int actual_versions = 0;
28901 unsigned int i;
28903 struct _function_version_info
28905 tree version_decl;
28906 tree predicate_chain;
28907 unsigned int dispatch_priority;
28908 }*function_version_info;
28910 gcc_assert (dispatch_decl != NULL
28911 && fndecls_p != NULL
28912 && empty_bb != NULL);
28914 /* fndecls_p is actually a vector. */
28915 fndecls = static_cast<vec<tree> *> (fndecls_p);
28917 /* There must be at least one version other than the default. */
28918 num_versions = fndecls->length ();
28919 gcc_assert (num_versions >= 2);
28921 function_version_info = (struct _function_version_info *)
28922 XNEWVEC (struct _function_version_info, (num_versions - 1));
28924 /* The first version in the vector is the default decl. */
28925 default_decl = (*fndecls)[0];
28927 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
28929 gseq = bb_seq (*empty_bb);
28930 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
28931 constructors, so explicitly call __builtin_cpu_init here. */
28932 ifunc_cpu_init_stmt = gimple_build_call_vec (
28933 ix86_builtins [(int) IX86_BUILTIN_CPU_INIT], vNULL);
28934 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
28935 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
28936 set_bb_seq (*empty_bb, gseq);
28938 pop_cfun ();
28941 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
28943 tree version_decl = ele;
28944 tree predicate_chain = NULL_TREE;
28945 unsigned int priority;
28946 /* Get attribute string, parse it and find the right predicate decl.
28947 The predicate function could be a lengthy combination of many
28948 features, like arch-type and various isa-variants. */
28949 priority = get_builtin_code_for_version (version_decl,
28950 &predicate_chain);
28952 if (predicate_chain == NULL_TREE)
28953 continue;
28955 function_version_info [actual_versions].version_decl = version_decl;
28956 function_version_info [actual_versions].predicate_chain
28957 = predicate_chain;
28958 function_version_info [actual_versions].dispatch_priority = priority;
28959 actual_versions++;
28962 /* Sort the versions according to descending order of dispatch priority. The
28963 priority is based on the ISA. This is not a perfect solution. There
28964 could still be ambiguity. If more than one function version is suitable
28965 to execute, which one should be dispatched? In future, allow the user
28966 to specify a dispatch priority next to the version. */
28967 qsort (function_version_info, actual_versions,
28968 sizeof (struct _function_version_info), feature_compare);
28970 for (i = 0; i < actual_versions; ++i)
28971 *empty_bb = add_condition_to_bb (dispatch_decl,
28972 function_version_info[i].version_decl,
28973 function_version_info[i].predicate_chain,
28974 *empty_bb);
28976 /* dispatch default version at the end. */
28977 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
28978 NULL, *empty_bb);
28980 free (function_version_info);
28981 return 0;
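/* Rough shape of the resolver body assembled here (illustrative pseudo
   code; version names are hypothetical):

     __builtin_cpu_init ();
     if (predicates for the highest-priority version hold)
       return &version_1;
     if (predicates for the next version hold)
       return &version_2;
     return &default_version;

   The default version is appended last and is returned unconditionally.  */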
28984 /* Comparator function to be used in qsort routine to sort attribute
28985 specification strings to "target". */
28987 static int
28988 attr_strcmp (const void *v1, const void *v2)
28990 const char *c1 = *(char *const*)v1;
28991 const char *c2 = *(char *const*)v2;
28992 return strcmp (c1, c2);
28995 /* ARGLIST is the argument to target attribute. This function tokenizes
28996 the comma separated arguments, sorts them and returns a string which
28997 is a unique identifier for the comma separated arguments. It also
28998 replaces non-identifier characters "=,-" with "_". */
29000 static char *
29001 sorted_attr_string (tree arglist)
29003 tree arg;
29004 size_t str_len_sum = 0;
29005 char **args = NULL;
29006 char *attr_str, *ret_str;
29007 char *attr = NULL;
29008 unsigned int argnum = 1;
29009 unsigned int i;
29011 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29013 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29014 size_t len = strlen (str);
29015 str_len_sum += len + 1;
29016 if (arg != arglist)
29017 argnum++;
29018 for (i = 0; i < strlen (str); i++)
29019 if (str[i] == ',')
29020 argnum++;
29023 attr_str = XNEWVEC (char, str_len_sum);
29024 str_len_sum = 0;
29025 for (arg = arglist; arg; arg = TREE_CHAIN (arg))
29027 const char *str = TREE_STRING_POINTER (TREE_VALUE (arg));
29028 size_t len = strlen (str);
29029 memcpy (attr_str + str_len_sum, str, len);
29030 attr_str[str_len_sum + len] = TREE_CHAIN (arg) ? ',' : '\0';
29031 str_len_sum += len + 1;
29034 /* Replace "=,-" with "_". */
29035 for (i = 0; i < strlen (attr_str); i++)
29036 if (attr_str[i] == '=' || attr_str[i]== '-')
29037 attr_str[i] = '_';
29039 if (argnum == 1)
29040 return attr_str;
29042 args = XNEWVEC (char *, argnum);
29044 i = 0;
29045 attr = strtok (attr_str, ",");
29046 while (attr != NULL)
29048 args[i] = attr;
29049 i++;
29050 attr = strtok (NULL, ",");
29053 qsort (args, argnum, sizeof (char *), attr_strcmp);
29055 ret_str = XNEWVEC (char, str_len_sum);
29056 str_len_sum = 0;
29057 for (i = 0; i < argnum; i++)
29059 size_t len = strlen (args[i]);
29060 memcpy (ret_str + str_len_sum, args[i], len);
29061 ret_str[str_len_sum + len] = i < argnum - 1 ? '_' : '\0';
29062 str_len_sum += len + 1;
29065 XDELETEVEC (args);
29066 XDELETEVEC (attr_str);
29067 return ret_str;
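/* Worked example (hypothetical attribute values): target ("sse4.2,avx") is
   tokenized into {"sse4.2", "avx"}, sorted into {"avx", "sse4.2"} and
   rejoined with '_', giving "avx_sse4.2"; a single argument such as
   "arch=corei7" is returned directly as "arch_corei7" after the '='/'-'
   replacement.  */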
29070 /* This function changes the assembler name for functions that are
29071 versions. If DECL is a function version and has a "target"
29072 attribute, it appends the attribute string to its assembler name. */
29074 static tree
29075 ix86_mangle_function_version_assembler_name (tree decl, tree id)
29077 tree version_attr;
29078 const char *orig_name, *version_string;
29079 char *attr_str, *assembler_name;
29081 if (DECL_DECLARED_INLINE_P (decl)
29082 && lookup_attribute ("gnu_inline",
29083 DECL_ATTRIBUTES (decl)))
29084 error_at (DECL_SOURCE_LOCATION (decl),
29085 "Function versions cannot be marked as gnu_inline,"
29086 " bodies have to be generated");
29088 if (DECL_VIRTUAL_P (decl)
29089 || DECL_VINDEX (decl))
29090 sorry ("Virtual function multiversioning not supported");
29092 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29094 /* target attribute string cannot be NULL. */
29095 gcc_assert (version_attr != NULL_TREE);
29097 orig_name = IDENTIFIER_POINTER (id);
29098 version_string
29099 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
29101 if (strcmp (version_string, "default") == 0)
29102 return id;
29104 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
29105 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
29107 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
29109 /* Allow assembler name to be modified if already set. */
29110 if (DECL_ASSEMBLER_NAME_SET_P (decl))
29111 SET_DECL_RTL (decl, NULL);
29113 tree ret = get_identifier (assembler_name);
29114 XDELETEVEC (attr_str);
29115 XDELETEVEC (assembler_name);
29116 return ret;
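/* E.g. (hypothetical decl), a version of _Z3foov declared with
   target ("arch=corei7") is renamed to "_Z3foov.arch_corei7".  */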
29119 /* This function returns true if FN1 and FN2 are versions of the same function,
29120 that is, the target strings of the function decls are different. This assumes
29121 that FN1 and FN2 have the same signature. */
29123 static bool
29124 ix86_function_versions (tree fn1, tree fn2)
29126 tree attr1, attr2;
29127 char *target1, *target2;
29128 bool result;
29130 if (TREE_CODE (fn1) != FUNCTION_DECL
29131 || TREE_CODE (fn2) != FUNCTION_DECL)
29132 return false;
29134 attr1 = lookup_attribute ("target", DECL_ATTRIBUTES (fn1));
29135 attr2 = lookup_attribute ("target", DECL_ATTRIBUTES (fn2));
29137 /* At least one function decl should have the target attribute specified. */
29138 if (attr1 == NULL_TREE && attr2 == NULL_TREE)
29139 return false;
29141 /* Diagnose missing target attribute if one of the decls is already
29142 multi-versioned. */
29143 if (attr1 == NULL_TREE || attr2 == NULL_TREE)
29145 if (DECL_FUNCTION_VERSIONED (fn1) || DECL_FUNCTION_VERSIONED (fn2))
29147 if (attr2 != NULL_TREE)
29149 tree tem = fn1;
29150 fn1 = fn2;
29151 fn2 = tem;
29152 attr1 = attr2;
29154 error_at (DECL_SOURCE_LOCATION (fn2),
29155 "missing %<target%> attribute for multi-versioned %D",
29156 fn2);
29157 error_at (DECL_SOURCE_LOCATION (fn1),
29158 "previous declaration of %D", fn1);
29159 /* Prevent diagnosing of the same error multiple times. */
29160 DECL_ATTRIBUTES (fn2)
29161 = tree_cons (get_identifier ("target"),
29162 copy_node (TREE_VALUE (attr1)),
29163 DECL_ATTRIBUTES (fn2));
29165 return false;
29168 target1 = sorted_attr_string (TREE_VALUE (attr1));
29169 target2 = sorted_attr_string (TREE_VALUE (attr2));
29171 /* The sorted target strings must be different for fn1 and fn2
29172 to be versions. */
29173 if (strcmp (target1, target2) == 0)
29174 result = false;
29175 else
29176 result = true;
29178 XDELETEVEC (target1);
29179 XDELETEVEC (target2);
29181 return result;
29184 static tree
29185 ix86_mangle_decl_assembler_name (tree decl, tree id)
29187 /* For function version, add the target suffix to the assembler name. */
29188 if (TREE_CODE (decl) == FUNCTION_DECL
29189 && DECL_FUNCTION_VERSIONED (decl))
29190 id = ix86_mangle_function_version_assembler_name (decl, id);
29191 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
29192 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
29193 #endif
29195 return id;
29198 /* Return a new name by appending SUFFIX to the DECL name. If make_unique
29199 is true, append the full path name of the source file. */
29201 static char *
29202 make_name (tree decl, const char *suffix, bool make_unique)
29204 char *global_var_name;
29205 int name_len;
29206 const char *name;
29207 const char *unique_name = NULL;
29209 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
29211 /* Get a unique name that can be used globally without any chance
29212 of collision at link time. */
29213 if (make_unique)
29214 unique_name = IDENTIFIER_POINTER (get_file_function_name ("\0"));
29216 name_len = strlen (name) + strlen (suffix) + 2;
29218 if (make_unique)
29219 name_len += strlen (unique_name) + 1;
29220 global_var_name = XNEWVEC (char, name_len);
29222 /* Use '.' to concatenate names as it is demangler friendly. */
29223 if (make_unique)
29224 snprintf (global_var_name, name_len, "%s.%s.%s", name, unique_name,
29225 suffix);
29226 else
29227 snprintf (global_var_name, name_len, "%s.%s", name, suffix);
29229 return global_var_name;
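/* E.g. (hypothetical decl names), make_name (foo_decl, "resolver", false)
   yields "foo.resolver", while make_unique additionally splices in the
   translation-unit-unique string from get_file_function_name, giving
   "foo.<unique>.resolver".  */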
29232 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29234 /* Make a dispatcher declaration for the multi-versioned function DECL.
29235 Calls to DECL function will be replaced with calls to the dispatcher
29236 by the front-end. Return the decl created. */
29238 static tree
29239 make_dispatcher_decl (const tree decl)
29241 tree func_decl;
29242 char *func_name;
29243 tree fn_type, func_type;
29244 bool is_uniq = false;
29246 if (TREE_PUBLIC (decl) == 0)
29247 is_uniq = true;
29249 func_name = make_name (decl, "ifunc", is_uniq);
29251 fn_type = TREE_TYPE (decl);
29252 func_type = build_function_type (TREE_TYPE (fn_type),
29253 TYPE_ARG_TYPES (fn_type));
29255 func_decl = build_fn_decl (func_name, func_type);
29256 XDELETEVEC (func_name);
29257 TREE_USED (func_decl) = 1;
29258 DECL_CONTEXT (func_decl) = NULL_TREE;
29259 DECL_INITIAL (func_decl) = error_mark_node;
29260 DECL_ARTIFICIAL (func_decl) = 1;
29261 /* Mark this func as external, the resolver will flip it again if
29262 it gets generated. */
29263 DECL_EXTERNAL (func_decl) = 1;
29264 /* This will be an IFUNC, and IFUNCs have to be externally visible. */
29265 TREE_PUBLIC (func_decl) = 1;
29267 return func_decl;
29270 #endif
29272 /* Returns true if DECL is multi-versioned and is the default function,
29273 that is, it is not tagged with a target-specific optimization. */
29275 static bool
29276 is_function_default_version (const tree decl)
29278 if (TREE_CODE (decl) != FUNCTION_DECL
29279 || !DECL_FUNCTION_VERSIONED (decl))
29280 return false;
29281 tree attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
29282 gcc_assert (attr);
29283 attr = TREE_VALUE (TREE_VALUE (attr));
29284 return (TREE_CODE (attr) == STRING_CST
29285 && strcmp (TREE_STRING_POINTER (attr), "default") == 0);
29288 /* Make a dispatcher declaration for the multi-versioned function DECL.
29289 Calls to DECL function will be replaced with calls to the dispatcher
29290 by the front-end. Returns the decl of the dispatcher function. */
29292 static tree
29293 ix86_get_function_versions_dispatcher (void *decl)
29295 tree fn = (tree) decl;
29296 struct cgraph_node *node = NULL;
29297 struct cgraph_node *default_node = NULL;
29298 struct cgraph_function_version_info *node_v = NULL;
29299 struct cgraph_function_version_info *first_v = NULL;
29301 tree dispatch_decl = NULL;
29303 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29304 struct cgraph_function_version_info *it_v = NULL;
29305 struct cgraph_node *dispatcher_node = NULL;
29306 struct cgraph_function_version_info *dispatcher_version_info = NULL;
29307 #endif
29309 struct cgraph_function_version_info *default_version_info = NULL;
29311 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
29313 node = cgraph_get_node (fn);
29314 gcc_assert (node != NULL);
29316 node_v = get_cgraph_node_version (node);
29317 gcc_assert (node_v != NULL);
29319 if (node_v->dispatcher_resolver != NULL)
29320 return node_v->dispatcher_resolver;
29322 /* Find the default version and make it the first node. */
29323 first_v = node_v;
29324 /* Go to the beginning of the chain. */
29325 while (first_v->prev != NULL)
29326 first_v = first_v->prev;
29327 default_version_info = first_v;
29328 while (default_version_info != NULL)
29330 if (is_function_default_version
29331 (default_version_info->this_node->symbol.decl))
29332 break;
29333 default_version_info = default_version_info->next;
29336 /* If there is no default node, just return NULL. */
29337 if (default_version_info == NULL)
29338 return NULL;
29340 /* Make default info the first node. */
29341 if (first_v != default_version_info)
29343 default_version_info->prev->next = default_version_info->next;
29344 if (default_version_info->next)
29345 default_version_info->next->prev = default_version_info->prev;
29346 first_v->prev = default_version_info;
29347 default_version_info->next = first_v;
29348 default_version_info->prev = NULL;
29351 default_node = default_version_info->this_node;
29353 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE) && HAVE_GNU_INDIRECT_FUNCTION
29354 /* Right now, the dispatching is done via ifunc. */
29355 dispatch_decl = make_dispatcher_decl (default_node->symbol.decl);
29357 dispatcher_node = cgraph_get_create_node (dispatch_decl);
29358 gcc_assert (dispatcher_node != NULL);
29359 dispatcher_node->dispatcher_function = 1;
29360 dispatcher_version_info
29361 = insert_new_cgraph_node_version (dispatcher_node);
29362 dispatcher_version_info->next = default_version_info;
29363 dispatcher_node->local.finalized = 1;
29365 /* Set the dispatcher for all the versions. */
29366 it_v = default_version_info;
29367 while (it_v != NULL)
29369 it_v->dispatcher_resolver = dispatch_decl;
29370 it_v = it_v->next;
29372 #else
29373 error_at (DECL_SOURCE_LOCATION (default_node->symbol.decl),
29374 "multiversioning needs ifunc which is not supported "
29375 "in this configuration");
29376 #endif
29377 return dispatch_decl;
29380 /* Makes a function attribute of the form NAME(ARG_NAME) and chains
29381 it to CHAIN. */
29383 static tree
29384 make_attribute (const char *name, const char *arg_name, tree chain)
29386 tree attr_name;
29387 tree attr_arg_name;
29388 tree attr_args;
29389 tree attr;
29391 attr_name = get_identifier (name);
29392 attr_arg_name = build_string (strlen (arg_name), arg_name);
29393 attr_args = tree_cons (NULL_TREE, attr_arg_name, NULL_TREE);
29394 attr = tree_cons (attr_name, attr_args, chain);
29395 return attr;
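/* E.g. make_attribute ("ifunc", "foo.resolver", NULL_TREE) builds the tree
   equivalent of __attribute__ ((ifunc ("foo.resolver"))); that is how the
   dispatcher decl below is tied to its resolver ("foo.resolver" is a
   hypothetical name).  */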
29398 /* Make the resolver function decl to dispatch the versions of
29399 a multi-versioned function, DEFAULT_DECL. Create an
29400 empty basic block in the resolver and store the pointer in
29401 EMPTY_BB. Return the decl of the resolver function. */
29403 static tree
29404 make_resolver_func (const tree default_decl,
29405 const tree dispatch_decl,
29406 basic_block *empty_bb)
29408 char *resolver_name;
29409 tree decl, type, decl_name, t;
29410 bool is_uniq = false;
29412 /* IFUNC's have to be globally visible. So, if the default_decl is
29413 not, then the name of the IFUNC should be made unique. */
29414 if (TREE_PUBLIC (default_decl) == 0)
29415 is_uniq = true;
29417 /* Append the filename to the resolver function if the versions are
29418 not externally visible. This is because the resolver function has
29419 to be externally visible for the loader to find it. So, appending
29420 the filename will prevent conflicts with a resolver function from
29421 another module which is based on the same version name. */
29422 resolver_name = make_name (default_decl, "resolver", is_uniq);
29424 /* The resolver function should return a (void *). */
29425 type = build_function_type_list (ptr_type_node, NULL_TREE);
29427 decl = build_fn_decl (resolver_name, type);
29428 decl_name = get_identifier (resolver_name);
29429 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
29431 DECL_NAME (decl) = decl_name;
29432 TREE_USED (decl) = 1;
29433 DECL_ARTIFICIAL (decl) = 1;
29434 DECL_IGNORED_P (decl) = 0;
29435 /* IFUNC resolvers have to be externally visible. */
29436 TREE_PUBLIC (decl) = 1;
29437 DECL_UNINLINABLE (decl) = 0;
29439 /* Resolver is not external, body is generated. */
29440 DECL_EXTERNAL (decl) = 0;
29441 DECL_EXTERNAL (dispatch_decl) = 0;
29443 DECL_CONTEXT (decl) = NULL_TREE;
29444 DECL_INITIAL (decl) = make_node (BLOCK);
29445 DECL_STATIC_CONSTRUCTOR (decl) = 0;
29447 if (DECL_COMDAT_GROUP (default_decl)
29448 || TREE_PUBLIC (default_decl))
29450 /* In this case, each translation unit with a call to this
29451 versioned function will put out a resolver. Ensure it
29452 is comdat to keep just one copy. */
29453 DECL_COMDAT (decl) = 1;
29454 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
29456 /* Build result decl and add to function_decl. */
29457 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
29458 DECL_ARTIFICIAL (t) = 1;
29459 DECL_IGNORED_P (t) = 1;
29460 DECL_RESULT (decl) = t;
29462 gimplify_function_tree (decl);
29463 push_cfun (DECL_STRUCT_FUNCTION (decl));
29464 *empty_bb = init_lowered_empty_function (decl, false);
29466 cgraph_add_new_function (decl, true);
29467 cgraph_call_function_insertion_hooks (cgraph_get_create_node (decl));
29469 pop_cfun ();
29471 gcc_assert (dispatch_decl != NULL);
29472 /* Mark dispatch_decl as "ifunc" with resolver as resolver_name. */
29473 DECL_ATTRIBUTES (dispatch_decl)
29474 = make_attribute ("ifunc", resolver_name, DECL_ATTRIBUTES (dispatch_decl));
29476 /* Create the alias for dispatch to resolver here. */
29477 /*cgraph_create_function_alias (dispatch_decl, decl);*/
29478 cgraph_same_body_alias (NULL, dispatch_decl, decl);
29479 XDELETEVEC (resolver_name);
29480 return decl;
29483 /* Generate the dispatching code body to dispatch multi-versioned function
29484 DECL. The target hook is called to process the "target" attributes and
29485 provide the code to dispatch the right function at run-time. NODE points
29486 to the dispatcher decl whose body will be created. */
29488 static tree
29489 ix86_generate_version_dispatcher_body (void *node_p)
29491 tree resolver_decl;
29492 basic_block empty_bb;
29493 vec<tree> fn_ver_vec = vNULL;
29494 tree default_ver_decl;
29495 struct cgraph_node *versn;
29496 struct cgraph_node *node;
29498 struct cgraph_function_version_info *node_version_info = NULL;
29499 struct cgraph_function_version_info *versn_info = NULL;
29501 node = (cgraph_node *)node_p;
29503 node_version_info = get_cgraph_node_version (node);
29504 gcc_assert (node->dispatcher_function
29505 && node_version_info != NULL);
29507 if (node_version_info->dispatcher_resolver)
29508 return node_version_info->dispatcher_resolver;
29510 /* The first version in the chain corresponds to the default version. */
29511 default_ver_decl = node_version_info->next->this_node->symbol.decl;
29513 /* node is going to be an alias, so remove the finalized bit. */
29514 node->local.finalized = false;
29516 resolver_decl = make_resolver_func (default_ver_decl,
29517 node->symbol.decl, &empty_bb);
29519 node_version_info->dispatcher_resolver = resolver_decl;
29521 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
29523 fn_ver_vec.create (2);
29525 for (versn_info = node_version_info->next; versn_info;
29526 versn_info = versn_info->next)
29528 versn = versn_info->this_node;
29529 /* Check for virtual functions here again, as by this time it should
29530 have been determined if this function needs a vtable index or
29531 not. This happens for methods in derived classes that override
29532 virtual methods in base classes but are not explicitly marked as
29533 virtual. */
29534 if (DECL_VINDEX (versn->symbol.decl))
29535 sorry ("Virtual function multiversioning not supported");
29537 fn_ver_vec.safe_push (versn->symbol.decl);
29540 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
29541 fn_ver_vec.release ();
29542 rebuild_cgraph_edges ();
29543 pop_cfun ();
29544 return resolver_decl;
29546 /* This builds the processor_model struct type defined in
29547 libgcc/config/i386/cpuinfo.c */
29549 static tree
29550 build_processor_model_struct (void)
29552 const char *field_name[] = {"__cpu_vendor", "__cpu_type", "__cpu_subtype",
29553 "__cpu_features"};
29554 tree field = NULL_TREE, field_chain = NULL_TREE;
29555 int i;
29556 tree type = make_node (RECORD_TYPE);
29558 /* The first 3 fields are unsigned int. */
29559 for (i = 0; i < 3; ++i)
29561 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29562 get_identifier (field_name[i]), unsigned_type_node);
29563 if (field_chain != NULL_TREE)
29564 DECL_CHAIN (field) = field_chain;
29565 field_chain = field;
29568 /* The last field is an array of unsigned integers of size one. */
29569 field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
29570 get_identifier (field_name[3]),
29571 build_array_type (unsigned_type_node,
29572 build_index_type (size_one_node)));
29573 if (field_chain != NULL_TREE)
29574 DECL_CHAIN (field) = field_chain;
29575 field_chain = field;
29577 finish_builtin_struct (type, "__processor_model", field_chain, NULL_TREE);
29578 return type;
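/* The record built above mirrors the definition in libgcc's cpuinfo.c,
   which is roughly:

     struct __processor_model
     {
       unsigned int __cpu_vendor;
       unsigned int __cpu_type;
       unsigned int __cpu_subtype;
       unsigned int __cpu_features[1];
     };
*/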
29581 /* Returns an extern, comdat VAR_DECL of type TYPE and name NAME. */
29583 static tree
29584 make_var_decl (tree type, const char *name)
29586 tree new_decl;
29588 new_decl = build_decl (UNKNOWN_LOCATION,
29589 VAR_DECL,
29590 get_identifier(name),
29591 type);
29593 DECL_EXTERNAL (new_decl) = 1;
29594 TREE_STATIC (new_decl) = 1;
29595 TREE_PUBLIC (new_decl) = 1;
29596 DECL_INITIAL (new_decl) = 0;
29597 DECL_ARTIFICIAL (new_decl) = 0;
29598 DECL_PRESERVE_P (new_decl) = 1;
29600 make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
29601 assemble_variable (new_decl, 0, 0, 0);
29603 return new_decl;
29606 /* FNDECL is a __builtin_cpu_is or a __builtin_cpu_supports call that is folded
29607 into an integer defined in libgcc/config/i386/cpuinfo.c */
29609 static tree
29610 fold_builtin_cpu (tree fndecl, tree *args)
29612 unsigned int i;
29613 enum ix86_builtins fn_code = (enum ix86_builtins)
29614 DECL_FUNCTION_CODE (fndecl);
29615 tree param_string_cst = NULL;
29617 /* This is the order of bit-fields in __processor_features in cpuinfo.c */
29618 enum processor_features
29620 F_CMOV = 0,
29621 F_MMX,
29622 F_POPCNT,
29623 F_SSE,
29624 F_SSE2,
29625 F_SSE3,
29626 F_SSSE3,
29627 F_SSE4_1,
29628 F_SSE4_2,
29629 F_AVX,
29630 F_AVX2,
29631 F_MAX
29634 /* These are the values for vendor types and cpu types and subtypes
29635 in cpuinfo.c. CPU types and subtypes are stored with the
29636 corresponding start value subtracted. */
29637 enum processor_model
29639 M_INTEL = 1,
29640 M_AMD,
29641 M_CPU_TYPE_START,
29642 M_INTEL_ATOM,
29643 M_INTEL_CORE2,
29644 M_INTEL_COREI7,
29645 M_AMDFAM10H,
29646 M_AMDFAM15H,
29647 M_CPU_SUBTYPE_START,
29648 M_INTEL_COREI7_NEHALEM,
29649 M_INTEL_COREI7_WESTMERE,
29650 M_INTEL_COREI7_SANDYBRIDGE,
29651 M_AMDFAM10H_BARCELONA,
29652 M_AMDFAM10H_SHANGHAI,
29653 M_AMDFAM10H_ISTANBUL,
29654 M_AMDFAM15H_BDVER1,
29655 M_AMDFAM15H_BDVER2,
29656 M_AMDFAM15H_BDVER3
29659 static struct _arch_names_table
29661 const char *const name;
29662 const enum processor_model model;
29664 const arch_names_table[] =
29666 {"amd", M_AMD},
29667 {"intel", M_INTEL},
29668 {"atom", M_INTEL_ATOM},
29669 {"core2", M_INTEL_CORE2},
29670 {"corei7", M_INTEL_COREI7},
29671 {"nehalem", M_INTEL_COREI7_NEHALEM},
29672 {"westmere", M_INTEL_COREI7_WESTMERE},
29673 {"sandybridge", M_INTEL_COREI7_SANDYBRIDGE},
29674 {"amdfam10h", M_AMDFAM10H},
29675 {"barcelona", M_AMDFAM10H_BARCELONA},
29676 {"shanghai", M_AMDFAM10H_SHANGHAI},
29677 {"istanbul", M_AMDFAM10H_ISTANBUL},
29678 {"amdfam15h", M_AMDFAM15H},
29679 {"bdver1", M_AMDFAM15H_BDVER1},
29680 {"bdver2", M_AMDFAM15H_BDVER2},
29681 {"bdver3", M_AMDFAM15H_BDVER3},
29684 static struct _isa_names_table
29686 const char *const name;
29687 const enum processor_features feature;
29689 const isa_names_table[] =
29691 {"cmov", F_CMOV},
29692 {"mmx", F_MMX},
29693 {"popcnt", F_POPCNT},
29694 {"sse", F_SSE},
29695 {"sse2", F_SSE2},
29696 {"sse3", F_SSE3},
29697 {"ssse3", F_SSSE3},
29698 {"sse4.1", F_SSE4_1},
29699 {"sse4.2", F_SSE4_2},
29700 {"avx", F_AVX},
29701 {"avx2", F_AVX2}
29704 tree __processor_model_type = build_processor_model_struct ();
29705 tree __cpu_model_var = make_var_decl (__processor_model_type,
29706 "__cpu_model");
29708 gcc_assert ((args != NULL) && (*args != NULL));
29710 param_string_cst = *args;
29711 while (param_string_cst
29712 && TREE_CODE (param_string_cst) != STRING_CST)
29714 /* *args must be an expr that can contain other EXPRS leading to a
29715 STRING_CST. */
29716 if (!EXPR_P (param_string_cst))
29718 error ("Parameter to builtin must be a string constant or literal");
29719 return integer_zero_node;
29721 param_string_cst = TREE_OPERAND (EXPR_CHECK (param_string_cst), 0);
29724 gcc_assert (param_string_cst);
29726 if (fn_code == IX86_BUILTIN_CPU_IS)
29728 tree ref;
29729 tree field;
29730 tree final;
29732 unsigned int field_val = 0;
29733 unsigned int NUM_ARCH_NAMES
29734 = sizeof (arch_names_table) / sizeof (struct _arch_names_table);
29736 for (i = 0; i < NUM_ARCH_NAMES; i++)
29737 if (strcmp (arch_names_table[i].name,
29738 TREE_STRING_POINTER (param_string_cst)) == 0)
29739 break;
29741 if (i == NUM_ARCH_NAMES)
29743 error ("Parameter to builtin not valid: %s",
29744 TREE_STRING_POINTER (param_string_cst));
29745 return integer_zero_node;
29748 field = TYPE_FIELDS (__processor_model_type);
29749 field_val = arch_names_table[i].model;
29751 /* CPU types are stored in the next field. */
29752 if (field_val > M_CPU_TYPE_START
29753 && field_val < M_CPU_SUBTYPE_START)
29755 field = DECL_CHAIN (field);
29756 field_val -= M_CPU_TYPE_START;
29759 /* CPU subtypes are stored in the next field. */
29760 if (field_val > M_CPU_SUBTYPE_START)
29762 field = DECL_CHAIN (DECL_CHAIN (field));
29763 field_val -= M_CPU_SUBTYPE_START;
29766 /* Get the appropriate field in __cpu_model. */
29767 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29768 field, NULL_TREE);
29770 /* Check the value. */
29771 final = build2 (EQ_EXPR, unsigned_type_node, ref,
29772 build_int_cstu (unsigned_type_node, field_val));
29773 return build1 (CONVERT_EXPR, integer_type_node, final);
29775 else if (fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29777 tree ref;
29778 tree array_elt;
29779 tree field;
29780 tree final;
29782 unsigned int field_val = 0;
29783 unsigned int NUM_ISA_NAMES
29784 = sizeof (isa_names_table) / sizeof (struct _isa_names_table);
29786 for (i = 0; i < NUM_ISA_NAMES; i++)
29787 if (strcmp (isa_names_table[i].name,
29788 TREE_STRING_POINTER (param_string_cst)) == 0)
29789 break;
29791 if (i == NUM_ISA_NAMES)
29793 error ("Parameter to builtin not valid: %s",
29794 TREE_STRING_POINTER (param_string_cst));
29795 return integer_zero_node;
29798 field = TYPE_FIELDS (__processor_model_type);
29799 /* Get the last field, which is __cpu_features. */
29800 while (DECL_CHAIN (field))
29801 field = DECL_CHAIN (field);
29803 /* Get the appropriate field: __cpu_model.__cpu_features */
29804 ref = build3 (COMPONENT_REF, TREE_TYPE (field), __cpu_model_var,
29805 field, NULL_TREE);
29807 /* Access the 0th element of __cpu_features array. */
29808 array_elt = build4 (ARRAY_REF, unsigned_type_node, ref,
29809 integer_zero_node, NULL_TREE, NULL_TREE);
29811 field_val = (1 << isa_names_table[i].feature);
29812 /* Return __cpu_model.__cpu_features[0] & field_val */
29813 final = build2 (BIT_AND_EXPR, unsigned_type_node, array_elt,
29814 build_int_cstu (unsigned_type_node, field_val));
29815 return build1 (CONVERT_EXPR, integer_type_node, final);
29817 gcc_unreachable ();
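/* Illustrative folding results, using the enum values above:

     __builtin_cpu_is ("corei7")
       -> (int) (__cpu_model.__cpu_type == M_INTEL_COREI7 - M_CPU_TYPE_START)

     __builtin_cpu_supports ("sse4.2")
       -> (int) (__cpu_model.__cpu_features[0] & (1 << F_SSE4_2))
*/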
29820 static tree
29821 ix86_fold_builtin (tree fndecl, int n_args,
29822 tree *args, bool ignore ATTRIBUTE_UNUSED)
29824 if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
29826 enum ix86_builtins fn_code = (enum ix86_builtins)
29827 DECL_FUNCTION_CODE (fndecl);
29828 if (fn_code == IX86_BUILTIN_CPU_IS
29829 || fn_code == IX86_BUILTIN_CPU_SUPPORTS)
29831 gcc_assert (n_args == 1);
29832 return fold_builtin_cpu (fndecl, args);
29836 #ifdef SUBTARGET_FOLD_BUILTIN
29837 return SUBTARGET_FOLD_BUILTIN (fndecl, n_args, args, ignore);
29838 #endif
29840 return NULL_TREE;
29843 /* Make builtins to detect cpu type and features supported. NAME is
29844 the builtin name, CODE is the builtin code, and FTYPE is the function
29845 type of the builtin. */
29847 static void
29848 make_cpu_type_builtin (const char* name, int code,
29849 enum ix86_builtin_func_type ftype, bool is_const)
29851 tree decl;
29852 tree type;
29854 type = ix86_get_builtin_func_type (ftype);
29855 decl = add_builtin_function (name, type, code, BUILT_IN_MD,
29856 NULL, NULL_TREE);
29857 gcc_assert (decl != NULL_TREE);
29858 ix86_builtins[(int) code] = decl;
29859 TREE_READONLY (decl) = is_const;
29862 /* Make builtins to get CPU type and features supported. The created
29863 builtins are:
29865 __builtin_cpu_init (), to detect cpu type and features,
29866 __builtin_cpu_is ("<CPUNAME>"), to check if cpu is of type <CPUNAME>,
29867 __builtin_cpu_supports ("<FEATURE>"), to check if cpu supports <FEATURE>
29870 static void
29871 ix86_init_platform_type_builtins (void)
29873 make_cpu_type_builtin ("__builtin_cpu_init", IX86_BUILTIN_CPU_INIT,
29874 INT_FTYPE_VOID, false);
29875 make_cpu_type_builtin ("__builtin_cpu_is", IX86_BUILTIN_CPU_IS,
29876 INT_FTYPE_PCCHAR, true);
29877 make_cpu_type_builtin ("__builtin_cpu_supports", IX86_BUILTIN_CPU_SUPPORTS,
29878 INT_FTYPE_PCCHAR, true);
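/* A minimal user-level sketch of the builtins registered above; calling
   __builtin_cpu_init first fills in __cpu_model (the function name is
   illustrative):

     int
     pick_path (void)
     {
       __builtin_cpu_init ();
       if (__builtin_cpu_is ("corei7"))
         return 2;
       if (__builtin_cpu_supports ("sse4.2"))
         return 1;
       return 0;
     }
*/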
29881 /* Internal method for ix86_init_builtins. */
29883 static void
29884 ix86_init_builtins_va_builtins_abi (void)
29886 tree ms_va_ref, sysv_va_ref;
29887 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
29888 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
29889 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
29890 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
29892 if (!TARGET_64BIT)
29893 return;
29894 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
29895 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
29896 ms_va_ref = build_reference_type (ms_va_list_type_node);
29897 sysv_va_ref =
29898 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
29900 fnvoid_va_end_ms =
29901 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
29902 fnvoid_va_start_ms =
29903 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
29904 fnvoid_va_end_sysv =
29905 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
29906 fnvoid_va_start_sysv =
29907 build_varargs_function_type_list (void_type_node, sysv_va_ref,
29908 NULL_TREE);
29909 fnvoid_va_copy_ms =
29910 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
29911 NULL_TREE);
29912 fnvoid_va_copy_sysv =
29913 build_function_type_list (void_type_node, sysv_va_ref,
29914 sysv_va_ref, NULL_TREE);
29916 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
29917 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
29918 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
29919 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
29920 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
29921 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
29922 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
29923 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29924 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
29925 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
29926 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
29927 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
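/* Illustrative use of the ABI-specific varargs builtins registered above
   (64-bit targets only); the function name is hypothetical and argument
   consumption via va_arg is elided:

     void __attribute__ ((ms_abi))
     vconsume (int n, ...)
     {
       __builtin_ms_va_list ap;
       __builtin_ms_va_start (ap, n);
       __builtin_ms_va_end (ap);
     }
*/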
29930 static void
29931 ix86_init_builtin_types (void)
29933 tree float128_type_node, float80_type_node;
29935 /* The __float80 type. */
29936 float80_type_node = long_double_type_node;
29937 if (TYPE_MODE (float80_type_node) != XFmode)
29939 /* The __float80 type. */
29940 float80_type_node = make_node (REAL_TYPE);
29942 TYPE_PRECISION (float80_type_node) = 80;
29943 layout_type (float80_type_node);
29945 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
29947 /* The __float128 type. */
29948 float128_type_node = make_node (REAL_TYPE);
29949 TYPE_PRECISION (float128_type_node) = 128;
29950 layout_type (float128_type_node);
29951 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
29953 /* This macro is built by i386-builtin-types.awk. */
29954 DEFINE_BUILTIN_PRIMITIVE_TYPES;
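/* The types registered above are usable directly in user code, e.g.
   (illustrative; 'w'/'W' and 'q'/'Q' are the constant suffixes for
   __float80 and __float128 respectively):

     __float80  ext  = 1.5w;
     __float128 quad = 1.5q;
*/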
29957 static void
29958 ix86_init_builtins (void)
29960 tree t;
29962 ix86_init_builtin_types ();
29964 /* Builtins to get CPU type and features. */
29965 ix86_init_platform_type_builtins ();
29967 /* TFmode support builtins. */
29968 def_builtin_const (0, "__builtin_infq",
29969 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
29970 def_builtin_const (0, "__builtin_huge_valq",
29971 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
29973 /* We will expand them to a normal call if SSE isn't available since
29974 they are used by libgcc. */
29975 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
29976 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
29977 BUILT_IN_MD, "__fabstf2", NULL_TREE);
29978 TREE_READONLY (t) = 1;
29979 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
29981 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
29982 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
29983 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
29984 TREE_READONLY (t) = 1;
29985 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
29987 ix86_init_tm_builtins ();
29988 ix86_init_mmx_sse_builtins ();
29990 if (TARGET_LP64)
29991 ix86_init_builtins_va_builtins_abi ();
29993 #ifdef SUBTARGET_INIT_BUILTINS
29994 SUBTARGET_INIT_BUILTINS;
29995 #endif
29998 /* Return the ix86 builtin for CODE. */
30000 static tree
30001 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
30003 if (code >= IX86_BUILTIN_MAX)
30004 return error_mark_node;
30006 return ix86_builtins[code];
30009 /* Errors in the source file can cause expand_expr to return const0_rtx
30010 where we expect a vector. To avoid crashing, use one of the vector
30011 clear instructions. */
30012 static rtx
30013 safe_vector_operand (rtx x, enum machine_mode mode)
30015 if (x == const0_rtx)
30016 x = CONST0_RTX (mode);
30017 return x;
30020 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
30022 static rtx
30023 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
30025 rtx pat;
30026 tree arg0 = CALL_EXPR_ARG (exp, 0);
30027 tree arg1 = CALL_EXPR_ARG (exp, 1);
30028 rtx op0 = expand_normal (arg0);
30029 rtx op1 = expand_normal (arg1);
30030 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30031 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30032 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
30034 if (VECTOR_MODE_P (mode0))
30035 op0 = safe_vector_operand (op0, mode0);
30036 if (VECTOR_MODE_P (mode1))
30037 op1 = safe_vector_operand (op1, mode1);
30039 if (optimize || !target
30040 || GET_MODE (target) != tmode
30041 || !insn_data[icode].operand[0].predicate (target, tmode))
30042 target = gen_reg_rtx (tmode);
30044 if (GET_MODE (op1) == SImode && mode1 == TImode)
30046 rtx x = gen_reg_rtx (V4SImode);
30047 emit_insn (gen_sse2_loadd (x, op1));
30048 op1 = gen_lowpart (TImode, x);
30051 if (!insn_data[icode].operand[1].predicate (op0, mode0))
30052 op0 = copy_to_mode_reg (mode0, op0);
30053 if (!insn_data[icode].operand[2].predicate (op1, mode1))
30054 op1 = copy_to_mode_reg (mode1, op1);
30056 pat = GEN_FCN (icode) (target, op0, op1);
30057 if (! pat)
30058 return 0;
30060 emit_insn (pat);
30062 return target;
30065 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
30067 static rtx
30068 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
30069 enum ix86_builtin_func_type m_type,
30070 enum rtx_code sub_code)
30072 rtx pat;
30073 int i;
30074 int nargs;
30075 bool comparison_p = false;
30076 bool tf_p = false;
30077 bool last_arg_constant = false;
30078 int num_memory = 0;
30079 struct {
30080 rtx op;
30081 enum machine_mode mode;
30082 } args[4];
30084 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30086 switch (m_type)
30088 case MULTI_ARG_4_DF2_DI_I:
30089 case MULTI_ARG_4_DF2_DI_I1:
30090 case MULTI_ARG_4_SF2_SI_I:
30091 case MULTI_ARG_4_SF2_SI_I1:
30092 nargs = 4;
30093 last_arg_constant = true;
30094 break;
30096 case MULTI_ARG_3_SF:
30097 case MULTI_ARG_3_DF:
30098 case MULTI_ARG_3_SF2:
30099 case MULTI_ARG_3_DF2:
30100 case MULTI_ARG_3_DI:
30101 case MULTI_ARG_3_SI:
30102 case MULTI_ARG_3_SI_DI:
30103 case MULTI_ARG_3_HI:
30104 case MULTI_ARG_3_HI_SI:
30105 case MULTI_ARG_3_QI:
30106 case MULTI_ARG_3_DI2:
30107 case MULTI_ARG_3_SI2:
30108 case MULTI_ARG_3_HI2:
30109 case MULTI_ARG_3_QI2:
30110 nargs = 3;
30111 break;
30113 case MULTI_ARG_2_SF:
30114 case MULTI_ARG_2_DF:
30115 case MULTI_ARG_2_DI:
30116 case MULTI_ARG_2_SI:
30117 case MULTI_ARG_2_HI:
30118 case MULTI_ARG_2_QI:
30119 nargs = 2;
30120 break;
30122 case MULTI_ARG_2_DI_IMM:
30123 case MULTI_ARG_2_SI_IMM:
30124 case MULTI_ARG_2_HI_IMM:
30125 case MULTI_ARG_2_QI_IMM:
30126 nargs = 2;
30127 last_arg_constant = true;
30128 break;
30130 case MULTI_ARG_1_SF:
30131 case MULTI_ARG_1_DF:
30132 case MULTI_ARG_1_SF2:
30133 case MULTI_ARG_1_DF2:
30134 case MULTI_ARG_1_DI:
30135 case MULTI_ARG_1_SI:
30136 case MULTI_ARG_1_HI:
30137 case MULTI_ARG_1_QI:
30138 case MULTI_ARG_1_SI_DI:
30139 case MULTI_ARG_1_HI_DI:
30140 case MULTI_ARG_1_HI_SI:
30141 case MULTI_ARG_1_QI_DI:
30142 case MULTI_ARG_1_QI_SI:
30143 case MULTI_ARG_1_QI_HI:
30144 nargs = 1;
30145 break;
30147 case MULTI_ARG_2_DI_CMP:
30148 case MULTI_ARG_2_SI_CMP:
30149 case MULTI_ARG_2_HI_CMP:
30150 case MULTI_ARG_2_QI_CMP:
30151 nargs = 2;
30152 comparison_p = true;
30153 break;
30155 case MULTI_ARG_2_SF_TF:
30156 case MULTI_ARG_2_DF_TF:
30157 case MULTI_ARG_2_DI_TF:
30158 case MULTI_ARG_2_SI_TF:
30159 case MULTI_ARG_2_HI_TF:
30160 case MULTI_ARG_2_QI_TF:
30161 nargs = 2;
30162 tf_p = true;
30163 break;
30165 default:
30166 gcc_unreachable ();
30169 if (optimize || !target
30170 || GET_MODE (target) != tmode
30171 || !insn_data[icode].operand[0].predicate (target, tmode))
30172 target = gen_reg_rtx (tmode);
30174 gcc_assert (nargs <= 4);
30176 for (i = 0; i < nargs; i++)
30178 tree arg = CALL_EXPR_ARG (exp, i);
30179 rtx op = expand_normal (arg);
30180 int adjust = (comparison_p) ? 1 : 0;
30181 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
30183 if (last_arg_constant && i == nargs - 1)
30185 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
30187 enum insn_code new_icode = icode;
30188 switch (icode)
30190 case CODE_FOR_xop_vpermil2v2df3:
30191 case CODE_FOR_xop_vpermil2v4sf3:
30192 case CODE_FOR_xop_vpermil2v4df3:
30193 case CODE_FOR_xop_vpermil2v8sf3:
30194 error ("the last argument must be a 2-bit immediate");
30195 return gen_reg_rtx (tmode);
30196 case CODE_FOR_xop_rotlv2di3:
30197 new_icode = CODE_FOR_rotlv2di3;
30198 goto xop_rotl;
30199 case CODE_FOR_xop_rotlv4si3:
30200 new_icode = CODE_FOR_rotlv4si3;
30201 goto xop_rotl;
30202 case CODE_FOR_xop_rotlv8hi3:
30203 new_icode = CODE_FOR_rotlv8hi3;
30204 goto xop_rotl;
30205 case CODE_FOR_xop_rotlv16qi3:
30206 new_icode = CODE_FOR_rotlv16qi3;
30207 xop_rotl:
30208 if (CONST_INT_P (op))
30210 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
30211 op = GEN_INT (INTVAL (op) & mask);
30212 gcc_checking_assert
30213 (insn_data[icode].operand[i + 1].predicate (op, mode));
30215 else
30217 gcc_checking_assert
30218 (nargs == 2
30219 && insn_data[new_icode].operand[0].mode == tmode
30220 && insn_data[new_icode].operand[1].mode == tmode
30221 && insn_data[new_icode].operand[2].mode == mode
30222 && insn_data[new_icode].operand[0].predicate
30223 == insn_data[icode].operand[0].predicate
30224 && insn_data[new_icode].operand[1].predicate
30225 == insn_data[icode].operand[1].predicate);
30226 icode = new_icode;
30227 goto non_constant;
30229 break;
30230 default:
30231 gcc_unreachable ();
30235 else
30237 non_constant:
30238 if (VECTOR_MODE_P (mode))
30239 op = safe_vector_operand (op, mode);
30241 /* If we aren't optimizing, only allow one memory operand to be
30242 generated. */
30243 if (memory_operand (op, mode))
30244 num_memory++;
30246 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
30248 if (optimize
30249 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
30250 || num_memory > 1)
30251 op = force_reg (mode, op);
30254 args[i].op = op;
30255 args[i].mode = mode;
30258 switch (nargs)
30260 case 1:
30261 pat = GEN_FCN (icode) (target, args[0].op);
30262 break;
30264 case 2:
30265 if (tf_p)
30266 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
30267 GEN_INT ((int)sub_code));
30268 else if (! comparison_p)
30269 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
30270 else
30272 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
30273 args[0].op,
30274 args[1].op);
30276 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
30278 break;
30280 case 3:
30281 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
30282 break;
30284 case 4:
30285 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
30286 break;
30288 default:
30289 gcc_unreachable ();
30292 if (! pat)
30293 return 0;
30295 emit_insn (pat);
30296 return target;
30299 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
30300 insns with vec_merge. */
30302 static rtx
30303 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
30304 rtx target)
30306 rtx pat;
30307 tree arg0 = CALL_EXPR_ARG (exp, 0);
30308 rtx op1, op0 = expand_normal (arg0);
30309 enum machine_mode tmode = insn_data[icode].operand[0].mode;
30310 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
30312 if (optimize || !target
30313 || GET_MODE (target) != tmode
30314 || !insn_data[icode].operand[0].predicate (target, tmode))
30315 target = gen_reg_rtx (tmode);
30317 if (VECTOR_MODE_P (mode0))
30318 op0 = safe_vector_operand (op0, mode0);
30320 if ((optimize && !register_operand (op0, mode0))
30321 || !insn_data[icode].operand[1].predicate (op0, mode0))
30322 op0 = copy_to_mode_reg (mode0, op0);
30324 op1 = op0;
30325 if (!insn_data[icode].operand[2].predicate (op1, mode0))
30326 op1 = copy_to_mode_reg (mode0, op1);
30328 pat = GEN_FCN (icode) (target, op0, op1);
30329 if (! pat)
30330 return 0;
30331 emit_insn (pat);
30332 return target;
30335 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
30337 static rtx
30338 ix86_expand_sse_compare (const struct builtin_description *d,
30339 tree exp, rtx target, bool swap)
30341 rtx pat;
30342 tree arg0 = CALL_EXPR_ARG (exp, 0);
30343 tree arg1 = CALL_EXPR_ARG (exp, 1);
30344 rtx op0 = expand_normal (arg0);
30345 rtx op1 = expand_normal (arg1);
30346 rtx op2;
30347 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30348 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30349 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30350 enum rtx_code comparison = d->comparison;
30352 if (VECTOR_MODE_P (mode0))
30353 op0 = safe_vector_operand (op0, mode0);
30354 if (VECTOR_MODE_P (mode1))
30355 op1 = safe_vector_operand (op1, mode1);
30357 /* Swap operands if we have a comparison that isn't available in
30358 hardware. */
30359 if (swap)
30361 rtx tmp = gen_reg_rtx (mode1);
30362 emit_move_insn (tmp, op1);
30363 op1 = op0;
30364 op0 = tmp;
30367 if (optimize || !target
30368 || GET_MODE (target) != tmode
30369 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30370 target = gen_reg_rtx (tmode);
30372 if ((optimize && !register_operand (op0, mode0))
30373 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
30374 op0 = copy_to_mode_reg (mode0, op0);
30375 if ((optimize && !register_operand (op1, mode1))
30376 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
30377 op1 = copy_to_mode_reg (mode1, op1);
30379 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
30380 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30381 if (! pat)
30382 return 0;
30383 emit_insn (pat);
30384 return target;
30387 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
30389 static rtx
30390 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
30391 rtx target)
30393 rtx pat;
30394 tree arg0 = CALL_EXPR_ARG (exp, 0);
30395 tree arg1 = CALL_EXPR_ARG (exp, 1);
30396 rtx op0 = expand_normal (arg0);
30397 rtx op1 = expand_normal (arg1);
30398 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30399 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30400 enum rtx_code comparison = d->comparison;
30402 if (VECTOR_MODE_P (mode0))
30403 op0 = safe_vector_operand (op0, mode0);
30404 if (VECTOR_MODE_P (mode1))
30405 op1 = safe_vector_operand (op1, mode1);
30407 /* Swap operands if we have a comparison that isn't available in
30408 hardware. */
30409 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
30411 rtx tmp = op1;
30412 op1 = op0;
30413 op0 = tmp;
30416 target = gen_reg_rtx (SImode);
30417 emit_move_insn (target, const0_rtx);
30418 target = gen_rtx_SUBREG (QImode, target, 0);
30420 if ((optimize && !register_operand (op0, mode0))
30421 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30422 op0 = copy_to_mode_reg (mode0, op0);
30423 if ((optimize && !register_operand (op1, mode1))
30424 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30425 op1 = copy_to_mode_reg (mode1, op1);
30427 pat = GEN_FCN (d->icode) (op0, op1);
30428 if (! pat)
30429 return 0;
30430 emit_insn (pat);
30431 emit_insn (gen_rtx_SET (VOIDmode,
30432 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30433 gen_rtx_fmt_ee (comparison, QImode,
30434 SET_DEST (pat),
30435 const0_rtx)));
30437 return SUBREG_REG (target);
30440 /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
30442 static rtx
30443 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
30444 rtx target)
30446 rtx pat;
30447 tree arg0 = CALL_EXPR_ARG (exp, 0);
30448 rtx op1, op0 = expand_normal (arg0);
30449 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30450 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30452 if (optimize || target == 0
30453 || GET_MODE (target) != tmode
30454 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30455 target = gen_reg_rtx (tmode);
30457 if (VECTOR_MODE_P (mode0))
30458 op0 = safe_vector_operand (op0, mode0);
30460 if ((optimize && !register_operand (op0, mode0))
30461 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30462 op0 = copy_to_mode_reg (mode0, op0);
30464 op1 = GEN_INT (d->comparison);
30466 pat = GEN_FCN (d->icode) (target, op0, op1);
30467 if (! pat)
30468 return 0;
30469 emit_insn (pat);
30470 return target;
30473 static rtx
30474 ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
30475 tree exp, rtx target)
30477 rtx pat;
30478 tree arg0 = CALL_EXPR_ARG (exp, 0);
30479 tree arg1 = CALL_EXPR_ARG (exp, 1);
30480 rtx op0 = expand_normal (arg0);
30481 rtx op1 = expand_normal (arg1);
30482 rtx op2;
30483 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
30484 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
30485 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
30487 if (optimize || target == 0
30488 || GET_MODE (target) != tmode
30489 || !insn_data[d->icode].operand[0].predicate (target, tmode))
30490 target = gen_reg_rtx (tmode);
30492 op0 = safe_vector_operand (op0, mode0);
30493 op1 = safe_vector_operand (op1, mode1);
30495 if ((optimize && !register_operand (op0, mode0))
30496 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30497 op0 = copy_to_mode_reg (mode0, op0);
30498 if ((optimize && !register_operand (op1, mode1))
30499 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30500 op1 = copy_to_mode_reg (mode1, op1);
30502 op2 = GEN_INT (d->comparison);
30504 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
30505 if (! pat)
30506 return 0;
30507 emit_insn (pat);
30508 return target;
30511 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
30513 static rtx
30514 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
30515 rtx target)
30517 rtx pat;
30518 tree arg0 = CALL_EXPR_ARG (exp, 0);
30519 tree arg1 = CALL_EXPR_ARG (exp, 1);
30520 rtx op0 = expand_normal (arg0);
30521 rtx op1 = expand_normal (arg1);
30522 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
30523 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
30524 enum rtx_code comparison = d->comparison;
30526 if (VECTOR_MODE_P (mode0))
30527 op0 = safe_vector_operand (op0, mode0);
30528 if (VECTOR_MODE_P (mode1))
30529 op1 = safe_vector_operand (op1, mode1);
30531 target = gen_reg_rtx (SImode);
30532 emit_move_insn (target, const0_rtx);
30533 target = gen_rtx_SUBREG (QImode, target, 0);
30535 if ((optimize && !register_operand (op0, mode0))
30536 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
30537 op0 = copy_to_mode_reg (mode0, op0);
30538 if ((optimize && !register_operand (op1, mode1))
30539 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
30540 op1 = copy_to_mode_reg (mode1, op1);
30542 pat = GEN_FCN (d->icode) (op0, op1);
30543 if (! pat)
30544 return 0;
30545 emit_insn (pat);
30546 emit_insn (gen_rtx_SET (VOIDmode,
30547 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30548 gen_rtx_fmt_ee (comparison, QImode,
30549 SET_DEST (pat),
30550 const0_rtx)));
30552 return SUBREG_REG (target);
30555 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
30557 static rtx
30558 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
30559 tree exp, rtx target)
30561 rtx pat;
30562 tree arg0 = CALL_EXPR_ARG (exp, 0);
30563 tree arg1 = CALL_EXPR_ARG (exp, 1);
30564 tree arg2 = CALL_EXPR_ARG (exp, 2);
30565 tree arg3 = CALL_EXPR_ARG (exp, 3);
30566 tree arg4 = CALL_EXPR_ARG (exp, 4);
30567 rtx scratch0, scratch1;
30568 rtx op0 = expand_normal (arg0);
30569 rtx op1 = expand_normal (arg1);
30570 rtx op2 = expand_normal (arg2);
30571 rtx op3 = expand_normal (arg3);
30572 rtx op4 = expand_normal (arg4);
30573 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
30575 tmode0 = insn_data[d->icode].operand[0].mode;
30576 tmode1 = insn_data[d->icode].operand[1].mode;
30577 modev2 = insn_data[d->icode].operand[2].mode;
30578 modei3 = insn_data[d->icode].operand[3].mode;
30579 modev4 = insn_data[d->icode].operand[4].mode;
30580 modei5 = insn_data[d->icode].operand[5].mode;
30581 modeimm = insn_data[d->icode].operand[6].mode;
30583 if (VECTOR_MODE_P (modev2))
30584 op0 = safe_vector_operand (op0, modev2);
30585 if (VECTOR_MODE_P (modev4))
30586 op2 = safe_vector_operand (op2, modev4);
30588 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30589 op0 = copy_to_mode_reg (modev2, op0);
30590 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
30591 op1 = copy_to_mode_reg (modei3, op1);
30592 if ((optimize && !register_operand (op2, modev4))
30593 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
30594 op2 = copy_to_mode_reg (modev4, op2);
30595 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
30596 op3 = copy_to_mode_reg (modei5, op3);
30598 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
30600 error ("the fifth argument must be an 8-bit immediate");
30601 return const0_rtx;
30604 if (d->code == IX86_BUILTIN_PCMPESTRI128)
30606 if (optimize || !target
30607 || GET_MODE (target) != tmode0
30608 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30609 target = gen_reg_rtx (tmode0);
30611 scratch1 = gen_reg_rtx (tmode1);
30613 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
30615 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
30617 if (optimize || !target
30618 || GET_MODE (target) != tmode1
30619 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30620 target = gen_reg_rtx (tmode1);
30622 scratch0 = gen_reg_rtx (tmode0);
30624 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
30626 else
30628 gcc_assert (d->flag);
30630 scratch0 = gen_reg_rtx (tmode0);
30631 scratch1 = gen_reg_rtx (tmode1);
30633 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
30636 if (! pat)
30637 return 0;
30639 emit_insn (pat);
30641 if (d->flag)
30643 target = gen_reg_rtx (SImode);
30644 emit_move_insn (target, const0_rtx);
30645 target = gen_rtx_SUBREG (QImode, target, 0);
30647 emit_insn
30648 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30649 gen_rtx_fmt_ee (EQ, QImode,
30650 gen_rtx_REG ((enum machine_mode) d->flag,
30651 FLAGS_REG),
30652 const0_rtx)));
30653 return SUBREG_REG (target);
30655 else
30656 return target;
30660 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
30662 static rtx
30663 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
30664 tree exp, rtx target)
30666 rtx pat;
30667 tree arg0 = CALL_EXPR_ARG (exp, 0);
30668 tree arg1 = CALL_EXPR_ARG (exp, 1);
30669 tree arg2 = CALL_EXPR_ARG (exp, 2);
30670 rtx scratch0, scratch1;
30671 rtx op0 = expand_normal (arg0);
30672 rtx op1 = expand_normal (arg1);
30673 rtx op2 = expand_normal (arg2);
30674 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
30676 tmode0 = insn_data[d->icode].operand[0].mode;
30677 tmode1 = insn_data[d->icode].operand[1].mode;
30678 modev2 = insn_data[d->icode].operand[2].mode;
30679 modev3 = insn_data[d->icode].operand[3].mode;
30680 modeimm = insn_data[d->icode].operand[4].mode;
30682 if (VECTOR_MODE_P (modev2))
30683 op0 = safe_vector_operand (op0, modev2);
30684 if (VECTOR_MODE_P (modev3))
30685 op1 = safe_vector_operand (op1, modev3);
30687 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
30688 op0 = copy_to_mode_reg (modev2, op0);
30689 if ((optimize && !register_operand (op1, modev3))
30690 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
30691 op1 = copy_to_mode_reg (modev3, op1);
30693 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
30695 error ("the third argument must be an 8-bit immediate");
30696 return const0_rtx;
30699 if (d->code == IX86_BUILTIN_PCMPISTRI128)
30701 if (optimize || !target
30702 || GET_MODE (target) != tmode0
30703 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
30704 target = gen_reg_rtx (tmode0);
30706 scratch1 = gen_reg_rtx (tmode1);
30708 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
30710 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
30712 if (optimize || !target
30713 || GET_MODE (target) != tmode1
30714 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
30715 target = gen_reg_rtx (tmode1);
30717 scratch0 = gen_reg_rtx (tmode0);
30719 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
30721 else
30723 gcc_assert (d->flag);
30725 scratch0 = gen_reg_rtx (tmode0);
30726 scratch1 = gen_reg_rtx (tmode1);
30728 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
30731 if (! pat)
30732 return 0;
30734 emit_insn (pat);
30736 if (d->flag)
30738 target = gen_reg_rtx (SImode);
30739 emit_move_insn (target, const0_rtx);
30740 target = gen_rtx_SUBREG (QImode, target, 0);
30742 emit_insn
30743 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
30744 gen_rtx_fmt_ee (EQ, QImode,
30745 gen_rtx_REG ((enum machine_mode) d->flag,
30746 FLAGS_REG),
30747 const0_rtx)));
30748 return SUBREG_REG (target);
30750 else
30751 return target;
30754 /* Subroutine of ix86_expand_builtin to take care of insns with
30755 variable number of operands. */
30757 static rtx
30758 ix86_expand_args_builtin (const struct builtin_description *d,
30759 tree exp, rtx target)
30761 rtx pat, real_target;
30762 unsigned int i, nargs;
30763 unsigned int nargs_constant = 0;
30764 int num_memory = 0;
30765 struct
30767 rtx op;
30768 enum machine_mode mode;
30769 } args[4];
30770 bool last_arg_count = false;
30771 enum insn_code icode = d->icode;
30772 const struct insn_data_d *insn_p = &insn_data[icode];
30773 enum machine_mode tmode = insn_p->operand[0].mode;
30774 enum machine_mode rmode = VOIDmode;
30775 bool swap = false;
30776 enum rtx_code comparison = d->comparison;
30778 switch ((enum ix86_builtin_func_type) d->flag)
30780 case V2DF_FTYPE_V2DF_ROUND:
30781 case V4DF_FTYPE_V4DF_ROUND:
30782 case V4SF_FTYPE_V4SF_ROUND:
30783 case V8SF_FTYPE_V8SF_ROUND:
30784 case V4SI_FTYPE_V4SF_ROUND:
30785 case V8SI_FTYPE_V8SF_ROUND:
30786 return ix86_expand_sse_round (d, exp, target);
30787 case V4SI_FTYPE_V2DF_V2DF_ROUND:
30788 case V8SI_FTYPE_V4DF_V4DF_ROUND:
30789 return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
30790 case INT_FTYPE_V8SF_V8SF_PTEST:
30791 case INT_FTYPE_V4DI_V4DI_PTEST:
30792 case INT_FTYPE_V4DF_V4DF_PTEST:
30793 case INT_FTYPE_V4SF_V4SF_PTEST:
30794 case INT_FTYPE_V2DI_V2DI_PTEST:
30795 case INT_FTYPE_V2DF_V2DF_PTEST:
30796 return ix86_expand_sse_ptest (d, exp, target);
30797 case FLOAT128_FTYPE_FLOAT128:
30798 case FLOAT_FTYPE_FLOAT:
30799 case INT_FTYPE_INT:
30800 case UINT64_FTYPE_INT:
30801 case UINT16_FTYPE_UINT16:
30802 case INT64_FTYPE_INT64:
30803 case INT64_FTYPE_V4SF:
30804 case INT64_FTYPE_V2DF:
30805 case INT_FTYPE_V16QI:
30806 case INT_FTYPE_V8QI:
30807 case INT_FTYPE_V8SF:
30808 case INT_FTYPE_V4DF:
30809 case INT_FTYPE_V4SF:
30810 case INT_FTYPE_V2DF:
30811 case INT_FTYPE_V32QI:
30812 case V16QI_FTYPE_V16QI:
30813 case V8SI_FTYPE_V8SF:
30814 case V8SI_FTYPE_V4SI:
30815 case V8HI_FTYPE_V8HI:
30816 case V8HI_FTYPE_V16QI:
30817 case V8QI_FTYPE_V8QI:
30818 case V8SF_FTYPE_V8SF:
30819 case V8SF_FTYPE_V8SI:
30820 case V8SF_FTYPE_V4SF:
30821 case V8SF_FTYPE_V8HI:
30822 case V4SI_FTYPE_V4SI:
30823 case V4SI_FTYPE_V16QI:
30824 case V4SI_FTYPE_V4SF:
30825 case V4SI_FTYPE_V8SI:
30826 case V4SI_FTYPE_V8HI:
30827 case V4SI_FTYPE_V4DF:
30828 case V4SI_FTYPE_V2DF:
30829 case V4HI_FTYPE_V4HI:
30830 case V4DF_FTYPE_V4DF:
30831 case V4DF_FTYPE_V4SI:
30832 case V4DF_FTYPE_V4SF:
30833 case V4DF_FTYPE_V2DF:
30834 case V4SF_FTYPE_V4SF:
30835 case V4SF_FTYPE_V4SI:
30836 case V4SF_FTYPE_V8SF:
30837 case V4SF_FTYPE_V4DF:
30838 case V4SF_FTYPE_V8HI:
30839 case V4SF_FTYPE_V2DF:
30840 case V2DI_FTYPE_V2DI:
30841 case V2DI_FTYPE_V16QI:
30842 case V2DI_FTYPE_V8HI:
30843 case V2DI_FTYPE_V4SI:
30844 case V2DF_FTYPE_V2DF:
30845 case V2DF_FTYPE_V4SI:
30846 case V2DF_FTYPE_V4DF:
30847 case V2DF_FTYPE_V4SF:
30848 case V2DF_FTYPE_V2SI:
30849 case V2SI_FTYPE_V2SI:
30850 case V2SI_FTYPE_V4SF:
30851 case V2SI_FTYPE_V2SF:
30852 case V2SI_FTYPE_V2DF:
30853 case V2SF_FTYPE_V2SF:
30854 case V2SF_FTYPE_V2SI:
30855 case V32QI_FTYPE_V32QI:
30856 case V32QI_FTYPE_V16QI:
30857 case V16HI_FTYPE_V16HI:
30858 case V16HI_FTYPE_V8HI:
30859 case V8SI_FTYPE_V8SI:
30860 case V16HI_FTYPE_V16QI:
30861 case V8SI_FTYPE_V16QI:
30862 case V4DI_FTYPE_V16QI:
30863 case V8SI_FTYPE_V8HI:
30864 case V4DI_FTYPE_V8HI:
30865 case V4DI_FTYPE_V4SI:
30866 case V4DI_FTYPE_V2DI:
30867 nargs = 1;
30868 break;
30869 case V4SF_FTYPE_V4SF_VEC_MERGE:
30870 case V2DF_FTYPE_V2DF_VEC_MERGE:
30871 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
30872 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
30873 case V16QI_FTYPE_V16QI_V16QI:
30874 case V16QI_FTYPE_V8HI_V8HI:
30875 case V8QI_FTYPE_V8QI_V8QI:
30876 case V8QI_FTYPE_V4HI_V4HI:
30877 case V8HI_FTYPE_V8HI_V8HI:
30878 case V8HI_FTYPE_V16QI_V16QI:
30879 case V8HI_FTYPE_V4SI_V4SI:
30880 case V8SF_FTYPE_V8SF_V8SF:
30881 case V8SF_FTYPE_V8SF_V8SI:
30882 case V4SI_FTYPE_V4SI_V4SI:
30883 case V4SI_FTYPE_V8HI_V8HI:
30884 case V4SI_FTYPE_V4SF_V4SF:
30885 case V4SI_FTYPE_V2DF_V2DF:
30886 case V4HI_FTYPE_V4HI_V4HI:
30887 case V4HI_FTYPE_V8QI_V8QI:
30888 case V4HI_FTYPE_V2SI_V2SI:
30889 case V4DF_FTYPE_V4DF_V4DF:
30890 case V4DF_FTYPE_V4DF_V4DI:
30891 case V4SF_FTYPE_V4SF_V4SF:
30892 case V4SF_FTYPE_V4SF_V4SI:
30893 case V4SF_FTYPE_V4SF_V2SI:
30894 case V4SF_FTYPE_V4SF_V2DF:
30895 case V4SF_FTYPE_V4SF_DI:
30896 case V4SF_FTYPE_V4SF_SI:
30897 case V2DI_FTYPE_V2DI_V2DI:
30898 case V2DI_FTYPE_V16QI_V16QI:
30899 case V2DI_FTYPE_V4SI_V4SI:
30900 case V2UDI_FTYPE_V4USI_V4USI:
30901 case V2DI_FTYPE_V2DI_V16QI:
30902 case V2DI_FTYPE_V2DF_V2DF:
30903 case V2SI_FTYPE_V2SI_V2SI:
30904 case V2SI_FTYPE_V4HI_V4HI:
30905 case V2SI_FTYPE_V2SF_V2SF:
30906 case V2DF_FTYPE_V2DF_V2DF:
30907 case V2DF_FTYPE_V2DF_V4SF:
30908 case V2DF_FTYPE_V2DF_V2DI:
30909 case V2DF_FTYPE_V2DF_DI:
30910 case V2DF_FTYPE_V2DF_SI:
30911 case V2SF_FTYPE_V2SF_V2SF:
30912 case V1DI_FTYPE_V1DI_V1DI:
30913 case V1DI_FTYPE_V8QI_V8QI:
30914 case V1DI_FTYPE_V2SI_V2SI:
30915 case V32QI_FTYPE_V16HI_V16HI:
30916 case V16HI_FTYPE_V8SI_V8SI:
30917 case V32QI_FTYPE_V32QI_V32QI:
30918 case V16HI_FTYPE_V32QI_V32QI:
30919 case V16HI_FTYPE_V16HI_V16HI:
30920 case V8SI_FTYPE_V4DF_V4DF:
30921 case V8SI_FTYPE_V8SI_V8SI:
30922 case V8SI_FTYPE_V16HI_V16HI:
30923 case V4DI_FTYPE_V4DI_V4DI:
30924 case V4DI_FTYPE_V8SI_V8SI:
30925 case V4UDI_FTYPE_V8USI_V8USI:
30926 if (comparison == UNKNOWN)
30927 return ix86_expand_binop_builtin (icode, exp, target);
30928 nargs = 2;
30929 break;
30930 case V4SF_FTYPE_V4SF_V4SF_SWAP:
30931 case V2DF_FTYPE_V2DF_V2DF_SWAP:
30932 gcc_assert (comparison != UNKNOWN);
30933 nargs = 2;
30934 swap = true;
30935 break;
30936 case V16HI_FTYPE_V16HI_V8HI_COUNT:
30937 case V16HI_FTYPE_V16HI_SI_COUNT:
30938 case V8SI_FTYPE_V8SI_V4SI_COUNT:
30939 case V8SI_FTYPE_V8SI_SI_COUNT:
30940 case V4DI_FTYPE_V4DI_V2DI_COUNT:
30941 case V4DI_FTYPE_V4DI_INT_COUNT:
30942 case V8HI_FTYPE_V8HI_V8HI_COUNT:
30943 case V8HI_FTYPE_V8HI_SI_COUNT:
30944 case V4SI_FTYPE_V4SI_V4SI_COUNT:
30945 case V4SI_FTYPE_V4SI_SI_COUNT:
30946 case V4HI_FTYPE_V4HI_V4HI_COUNT:
30947 case V4HI_FTYPE_V4HI_SI_COUNT:
30948 case V2DI_FTYPE_V2DI_V2DI_COUNT:
30949 case V2DI_FTYPE_V2DI_SI_COUNT:
30950 case V2SI_FTYPE_V2SI_V2SI_COUNT:
30951 case V2SI_FTYPE_V2SI_SI_COUNT:
30952 case V1DI_FTYPE_V1DI_V1DI_COUNT:
30953 case V1DI_FTYPE_V1DI_SI_COUNT:
30954 nargs = 2;
30955 last_arg_count = true;
30956 break;
30957 case UINT64_FTYPE_UINT64_UINT64:
30958 case UINT_FTYPE_UINT_UINT:
30959 case UINT_FTYPE_UINT_USHORT:
30960 case UINT_FTYPE_UINT_UCHAR:
30961 case UINT16_FTYPE_UINT16_INT:
30962 case UINT8_FTYPE_UINT8_INT:
30963 nargs = 2;
30964 break;
30965 case V2DI_FTYPE_V2DI_INT_CONVERT:
30966 nargs = 2;
30967 rmode = V1TImode;
30968 nargs_constant = 1;
30969 break;
30970 case V4DI_FTYPE_V4DI_INT_CONVERT:
30971 nargs = 2;
30972 rmode = V2TImode;
30973 nargs_constant = 1;
30974 break;
30975 case V8HI_FTYPE_V8HI_INT:
30976 case V8HI_FTYPE_V8SF_INT:
30977 case V8HI_FTYPE_V4SF_INT:
30978 case V8SF_FTYPE_V8SF_INT:
30979 case V4SI_FTYPE_V4SI_INT:
30980 case V4SI_FTYPE_V8SI_INT:
30981 case V4HI_FTYPE_V4HI_INT:
30982 case V4DF_FTYPE_V4DF_INT:
30983 case V4SF_FTYPE_V4SF_INT:
30984 case V4SF_FTYPE_V8SF_INT:
30985 case V2DI_FTYPE_V2DI_INT:
30986 case V2DF_FTYPE_V2DF_INT:
30987 case V2DF_FTYPE_V4DF_INT:
30988 case V16HI_FTYPE_V16HI_INT:
30989 case V8SI_FTYPE_V8SI_INT:
30990 case V4DI_FTYPE_V4DI_INT:
30991 case V2DI_FTYPE_V4DI_INT:
30992 nargs = 2;
30993 nargs_constant = 1;
30994 break;
30995 case V16QI_FTYPE_V16QI_V16QI_V16QI:
30996 case V8SF_FTYPE_V8SF_V8SF_V8SF:
30997 case V4DF_FTYPE_V4DF_V4DF_V4DF:
30998 case V4SF_FTYPE_V4SF_V4SF_V4SF:
30999 case V2DF_FTYPE_V2DF_V2DF_V2DF:
31000 case V32QI_FTYPE_V32QI_V32QI_V32QI:
31001 nargs = 3;
31002 break;
31003 case V32QI_FTYPE_V32QI_V32QI_INT:
31004 case V16HI_FTYPE_V16HI_V16HI_INT:
31005 case V16QI_FTYPE_V16QI_V16QI_INT:
31006 case V4DI_FTYPE_V4DI_V4DI_INT:
31007 case V8HI_FTYPE_V8HI_V8HI_INT:
31008 case V8SI_FTYPE_V8SI_V8SI_INT:
31009 case V8SI_FTYPE_V8SI_V4SI_INT:
31010 case V8SF_FTYPE_V8SF_V8SF_INT:
31011 case V8SF_FTYPE_V8SF_V4SF_INT:
31012 case V4SI_FTYPE_V4SI_V4SI_INT:
31013 case V4DF_FTYPE_V4DF_V4DF_INT:
31014 case V4DF_FTYPE_V4DF_V2DF_INT:
31015 case V4SF_FTYPE_V4SF_V4SF_INT:
31016 case V2DI_FTYPE_V2DI_V2DI_INT:
31017 case V4DI_FTYPE_V4DI_V2DI_INT:
31018 case V2DF_FTYPE_V2DF_V2DF_INT:
31019 nargs = 3;
31020 nargs_constant = 1;
31021 break;
31022 case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
31023 nargs = 3;
31024 rmode = V4DImode;
31025 nargs_constant = 1;
31026 break;
31027 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
31028 nargs = 3;
31029 rmode = V2DImode;
31030 nargs_constant = 1;
31031 break;
31032 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
31033 nargs = 3;
31034 rmode = DImode;
31035 nargs_constant = 1;
31036 break;
31037 case V2DI_FTYPE_V2DI_UINT_UINT:
31038 nargs = 3;
31039 nargs_constant = 2;
31040 break;
31041 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
31042 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
31043 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
31044 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
31045 nargs = 4;
31046 nargs_constant = 1;
31047 break;
31048 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
31049 nargs = 4;
31050 nargs_constant = 2;
31051 break;
31052 case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
31053 case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
31054 nargs = 4;
31055 break;
31056 default:
31057 gcc_unreachable ();
31060 gcc_assert (nargs <= ARRAY_SIZE (args));
31062 if (comparison != UNKNOWN)
31064 gcc_assert (nargs == 2);
31065 return ix86_expand_sse_compare (d, exp, target, swap);
31068 if (rmode == VOIDmode || rmode == tmode)
31070 if (optimize
31071 || target == 0
31072 || GET_MODE (target) != tmode
31073 || !insn_p->operand[0].predicate (target, tmode))
31074 target = gen_reg_rtx (tmode);
31075 real_target = target;
31077 else
31079 target = gen_reg_rtx (rmode);
31080 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
31083 for (i = 0; i < nargs; i++)
31085 tree arg = CALL_EXPR_ARG (exp, i);
31086 rtx op = expand_normal (arg);
31087 enum machine_mode mode = insn_p->operand[i + 1].mode;
31088 bool match = insn_p->operand[i + 1].predicate (op, mode);
31090 if (last_arg_count && (i + 1) == nargs)
31092 /* SIMD shift insns take either an 8-bit immediate or a
31093 register as the count. But the builtin functions take an int as
31094 the count. If the count doesn't match, we put it in a register. */
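/* For example (illustrative): _mm_slli_epi16 from emmintrin.h passes
   its int count straight to __builtin_ia32_psllwi128; when that count
   is not a constant that the shift pattern can accept directly, the
   code below reduces it to its SImode lowpart and copies it into a
   register so the operand predicate is satisfied.  */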
31095 if (!match)
31097 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
31098 if (!insn_p->operand[i + 1].predicate (op, mode))
31099 op = copy_to_reg (op);
31102 else if ((nargs - i) <= nargs_constant)
31104 if (!match)
31105 switch (icode)
31107 case CODE_FOR_avx2_inserti128:
31108 case CODE_FOR_avx2_extracti128:
31109 error ("the last argument must be an 1-bit immediate");
31110 return const0_rtx;
31112 case CODE_FOR_sse4_1_roundsd:
31113 case CODE_FOR_sse4_1_roundss:
31115 case CODE_FOR_sse4_1_roundpd:
31116 case CODE_FOR_sse4_1_roundps:
31117 case CODE_FOR_avx_roundpd256:
31118 case CODE_FOR_avx_roundps256:
31120 case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
31121 case CODE_FOR_sse4_1_roundps_sfix:
31122 case CODE_FOR_avx_roundpd_vec_pack_sfix256:
31123 case CODE_FOR_avx_roundps_sfix256:
31125 case CODE_FOR_sse4_1_blendps:
31126 case CODE_FOR_avx_blendpd256:
31127 case CODE_FOR_avx_vpermilv4df:
31128 error ("the last argument must be a 4-bit immediate");
31129 return const0_rtx;
31131 case CODE_FOR_sse4_1_blendpd:
31132 case CODE_FOR_avx_vpermilv2df:
31133 case CODE_FOR_xop_vpermil2v2df3:
31134 case CODE_FOR_xop_vpermil2v4sf3:
31135 case CODE_FOR_xop_vpermil2v4df3:
31136 case CODE_FOR_xop_vpermil2v8sf3:
31137 error ("the last argument must be a 2-bit immediate");
31138 return const0_rtx;
31140 case CODE_FOR_avx_vextractf128v4df:
31141 case CODE_FOR_avx_vextractf128v8sf:
31142 case CODE_FOR_avx_vextractf128v8si:
31143 case CODE_FOR_avx_vinsertf128v4df:
31144 case CODE_FOR_avx_vinsertf128v8sf:
31145 case CODE_FOR_avx_vinsertf128v8si:
31146 error ("the last argument must be a 1-bit immediate");
31147 return const0_rtx;
31149 case CODE_FOR_avx_vmcmpv2df3:
31150 case CODE_FOR_avx_vmcmpv4sf3:
31151 case CODE_FOR_avx_cmpv2df3:
31152 case CODE_FOR_avx_cmpv4sf3:
31153 case CODE_FOR_avx_cmpv4df3:
31154 case CODE_FOR_avx_cmpv8sf3:
31155 error ("the last argument must be a 5-bit immediate");
31156 return const0_rtx;
31158 default:
31159 switch (nargs_constant)
31161 case 2:
31162 if ((nargs - i) == nargs_constant)
31164 error ("the next to last argument must be an 8-bit immediate");
31165 break;
31167 case 1:
31168 error ("the last argument must be an 8-bit immediate");
31169 break;
31170 default:
31171 gcc_unreachable ();
31173 return const0_rtx;
31176 else
31178 if (VECTOR_MODE_P (mode))
31179 op = safe_vector_operand (op, mode);
31181 /* If we aren't optimizing, only allow one memory operand to
31182 be generated. */
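/* x86 insns allow at most one memory operand, so if several source
   arguments are MEMs (possible at -O0, where no later pass will clean
   this up), all but the first get copied into registers below.  */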
31183 if (memory_operand (op, mode))
31184 num_memory++;
31186 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
31188 if (optimize || !match || num_memory > 1)
31189 op = copy_to_mode_reg (mode, op);
31191 else
31193 op = copy_to_reg (op);
31194 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
31198 args[i].op = op;
31199 args[i].mode = mode;
31202 switch (nargs)
31204 case 1:
31205 pat = GEN_FCN (icode) (real_target, args[0].op);
31206 break;
31207 case 2:
31208 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
31209 break;
31210 case 3:
31211 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31212 args[2].op);
31213 break;
31214 case 4:
31215 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
31216 args[2].op, args[3].op);
31217 break;
31218 default:
31219 gcc_unreachable ();
31222 if (! pat)
31223 return 0;
31225 emit_insn (pat);
31226 return target;
31229 /* Subroutine of ix86_expand_builtin to take care of special insns
31230 with variable number of operands. */
31232 static rtx
31233 ix86_expand_special_args_builtin (const struct builtin_description *d,
31234 tree exp, rtx target)
31236 tree arg;
31237 rtx pat, op;
31238 unsigned int i, nargs, arg_adjust, memory;
31239 bool aligned_mem = false;
31240 struct
31242 rtx op;
31243 enum machine_mode mode;
31244 } args[3];
31245 enum insn_code icode = d->icode;
31246 bool last_arg_constant = false;
31247 const struct insn_data_d *insn_p = &insn_data[icode];
31248 enum machine_mode tmode = insn_p->operand[0].mode;
31249 enum { load, store } klass;
31251 switch ((enum ix86_builtin_func_type) d->flag)
31253 case VOID_FTYPE_VOID:
31254 emit_insn (GEN_FCN (icode) (target));
31255 return 0;
31256 case VOID_FTYPE_UINT64:
31257 case VOID_FTYPE_UNSIGNED:
31258 nargs = 0;
31259 klass = store;
31260 memory = 0;
31261 break;
31263 case INT_FTYPE_VOID:
31264 case UINT64_FTYPE_VOID:
31265 case UNSIGNED_FTYPE_VOID:
31266 nargs = 0;
31267 klass = load;
31268 memory = 0;
31269 break;
31270 case UINT64_FTYPE_PUNSIGNED:
31271 case V2DI_FTYPE_PV2DI:
31272 case V4DI_FTYPE_PV4DI:
31273 case V32QI_FTYPE_PCCHAR:
31274 case V16QI_FTYPE_PCCHAR:
31275 case V8SF_FTYPE_PCV4SF:
31276 case V8SF_FTYPE_PCFLOAT:
31277 case V4SF_FTYPE_PCFLOAT:
31278 case V4DF_FTYPE_PCV2DF:
31279 case V4DF_FTYPE_PCDOUBLE:
31280 case V2DF_FTYPE_PCDOUBLE:
31281 case VOID_FTYPE_PVOID:
31282 nargs = 1;
31283 klass = load;
31284 memory = 0;
31285 switch (icode)
31287 case CODE_FOR_sse4_1_movntdqa:
31288 case CODE_FOR_avx2_movntdqa:
31289 aligned_mem = true;
31290 break;
31291 default:
31292 break;
31294 break;
31295 case VOID_FTYPE_PV2SF_V4SF:
31296 case VOID_FTYPE_PV4DI_V4DI:
31297 case VOID_FTYPE_PV2DI_V2DI:
31298 case VOID_FTYPE_PCHAR_V32QI:
31299 case VOID_FTYPE_PCHAR_V16QI:
31300 case VOID_FTYPE_PFLOAT_V8SF:
31301 case VOID_FTYPE_PFLOAT_V4SF:
31302 case VOID_FTYPE_PDOUBLE_V4DF:
31303 case VOID_FTYPE_PDOUBLE_V2DF:
31304 case VOID_FTYPE_PLONGLONG_LONGLONG:
31305 case VOID_FTYPE_PULONGLONG_ULONGLONG:
31306 case VOID_FTYPE_PINT_INT:
31307 nargs = 1;
31308 klass = store;
31309 /* Reserve memory operand for target. */
31310 memory = ARRAY_SIZE (args);
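/* I.e. point MEMORY past the last argument slot so that none of the
   source arguments is treated as the memory operand in the loop below;
   for stores the destination MEM is instead built from the first call
   argument in the store handling further down.  */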
31311 switch (icode)
31313 /* These builtins and instructions require the memory
31314 to be properly aligned. */
31315 case CODE_FOR_avx_movntv4di:
31316 case CODE_FOR_sse2_movntv2di:
31317 case CODE_FOR_avx_movntv8sf:
31318 case CODE_FOR_sse_movntv4sf:
31319 case CODE_FOR_sse4a_vmmovntv4sf:
31320 case CODE_FOR_avx_movntv4df:
31321 case CODE_FOR_sse2_movntv2df:
31322 case CODE_FOR_sse4a_vmmovntv2df:
31323 case CODE_FOR_sse2_movntidi:
31324 case CODE_FOR_sse_movntq:
31325 case CODE_FOR_sse2_movntisi:
31326 aligned_mem = true;
31327 break;
31328 default:
31329 break;
31331 break;
31332 case V4SF_FTYPE_V4SF_PCV2SF:
31333 case V2DF_FTYPE_V2DF_PCDOUBLE:
31334 nargs = 2;
31335 klass = load;
31336 memory = 1;
31337 break;
31338 case V8SF_FTYPE_PCV8SF_V8SI:
31339 case V4DF_FTYPE_PCV4DF_V4DI:
31340 case V4SF_FTYPE_PCV4SF_V4SI:
31341 case V2DF_FTYPE_PCV2DF_V2DI:
31342 case V8SI_FTYPE_PCV8SI_V8SI:
31343 case V4DI_FTYPE_PCV4DI_V4DI:
31344 case V4SI_FTYPE_PCV4SI_V4SI:
31345 case V2DI_FTYPE_PCV2DI_V2DI:
31346 nargs = 2;
31347 klass = load;
31348 memory = 0;
31349 break;
31350 case VOID_FTYPE_PV8SF_V8SI_V8SF:
31351 case VOID_FTYPE_PV4DF_V4DI_V4DF:
31352 case VOID_FTYPE_PV4SF_V4SI_V4SF:
31353 case VOID_FTYPE_PV2DF_V2DI_V2DF:
31354 case VOID_FTYPE_PV8SI_V8SI_V8SI:
31355 case VOID_FTYPE_PV4DI_V4DI_V4DI:
31356 case VOID_FTYPE_PV4SI_V4SI_V4SI:
31357 case VOID_FTYPE_PV2DI_V2DI_V2DI:
31358 nargs = 2;
31359 klass = store;
31360 /* Reserve memory operand for target. */
31361 memory = ARRAY_SIZE (args);
31362 break;
31363 case VOID_FTYPE_UINT_UINT_UINT:
31364 case VOID_FTYPE_UINT64_UINT_UINT:
31365 case UCHAR_FTYPE_UINT_UINT_UINT:
31366 case UCHAR_FTYPE_UINT64_UINT_UINT:
31367 nargs = 3;
31368 klass = load;
31369 memory = ARRAY_SIZE (args);
31370 last_arg_constant = true;
31371 break;
31372 default:
31373 gcc_unreachable ();
31376 gcc_assert (nargs <= ARRAY_SIZE (args));
31378 if (klass == store)
31380 arg = CALL_EXPR_ARG (exp, 0);
31381 op = expand_normal (arg);
31382 gcc_assert (target == 0);
31383 if (memory)
31385 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31386 target = gen_rtx_MEM (tmode, op);
31387 /* target at this point has just BITS_PER_UNIT MEM_ALIGN
31388 on it. Try to improve it using get_pointer_alignment,
31389 and if the special builtin is one that requires strict
31390 mode alignment, also from its GET_MODE_ALIGNMENT.
31391 Failure to do so could lead to ix86_legitimate_combined_insn
31392 rejecting all changes to such insns. */
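/* E.g. for the movntps store (CODE_FOR_sse_movntv4sf above) the
   destination is a V4SFmode MEM that the hardware requires to be
   16-byte aligned, so MEM_ALIGN is raised to GET_MODE_ALIGNMENT here
   even when nothing is known about the pointer argument.  */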
31393 unsigned int align = get_pointer_alignment (arg);
31394 if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
31395 align = GET_MODE_ALIGNMENT (tmode);
31396 if (MEM_ALIGN (target) < align)
31397 set_mem_align (target, align);
31399 else
31400 target = force_reg (tmode, op);
31401 arg_adjust = 1;
31403 else
31405 arg_adjust = 0;
31406 if (optimize
31407 || target == 0
31408 || !register_operand (target, tmode)
31409 || GET_MODE (target) != tmode)
31410 target = gen_reg_rtx (tmode);
31413 for (i = 0; i < nargs; i++)
31415 enum machine_mode mode = insn_p->operand[i + 1].mode;
31416 bool match;
31418 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
31419 op = expand_normal (arg);
31420 match = insn_p->operand[i + 1].predicate (op, mode);
31422 if (last_arg_constant && (i + 1) == nargs)
31424 if (!match)
31426 if (icode == CODE_FOR_lwp_lwpvalsi3
31427 || icode == CODE_FOR_lwp_lwpinssi3
31428 || icode == CODE_FOR_lwp_lwpvaldi3
31429 || icode == CODE_FOR_lwp_lwpinsdi3)
31430 error ("the last argument must be a 32-bit immediate");
31431 else
31432 error ("the last argument must be an 8-bit immediate");
31433 return const0_rtx;
31436 else
31438 if (i == memory)
31440 /* This must be the memory operand. */
31441 op = force_reg (Pmode, convert_to_mode (Pmode, op, 1));
31442 op = gen_rtx_MEM (mode, op);
31443 /* op at this point has just BITS_PER_UNIT MEM_ALIGN
31444 on it. Try to improve it using get_pointer_alignment,
31445 and if the special builtin is one that requires strict
31446 mode alignment, also from its GET_MODE_ALIGNMENT.
31447 Failure to do so could lead to ix86_legitimate_combined_insn
31448 rejecting all changes to such insns. */
31449 unsigned int align = get_pointer_alignment (arg);
31450 if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
31451 align = GET_MODE_ALIGNMENT (mode);
31452 if (MEM_ALIGN (op) < align)
31453 set_mem_align (op, align);
31455 else
31457 /* This must be a register. */
31458 if (VECTOR_MODE_P (mode))
31459 op = safe_vector_operand (op, mode);
31461 gcc_assert (GET_MODE (op) == mode
31462 || GET_MODE (op) == VOIDmode);
31463 op = copy_to_mode_reg (mode, op);
31467 args[i].op = op;
31468 args[i].mode = mode;
31471 switch (nargs)
31473 case 0:
31474 pat = GEN_FCN (icode) (target);
31475 break;
31476 case 1:
31477 pat = GEN_FCN (icode) (target, args[0].op);
31478 break;
31479 case 2:
31480 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
31481 break;
31482 case 3:
31483 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
31484 break;
31485 default:
31486 gcc_unreachable ();
31489 if (! pat)
31490 return 0;
31491 emit_insn (pat);
31492 return klass == store ? 0 : target;
31495 /* Return the integer constant in ARG. Constrain it to be in the range
31496 of the subparts of VEC_TYPE; issue an error if not. */
31498 static int
31499 get_element_number (tree vec_type, tree arg)
31501 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
31503 if (!host_integerp (arg, 1)
31504 || (elt = tree_low_cst (arg, 1), elt > max))
31506 error ("selector must be an integer constant in the range 0..%wi", max);
31507 return 0;
31510 return elt;
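/* For instance, a 4-element vector type accepts selectors 0..3, so an
   (illustrative) call such as __builtin_ia32_vec_ext_v4sf (x, 4) is
   diagnosed with the range error above and element 0 is used instead.  */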
31513 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31514 ix86_expand_vector_init. We DO have language-level syntax for this, in
31515 the form of (type){ init-list }. Except that since we can't place emms
31516 instructions from inside the compiler, we can't allow the use of MMX
31517 registers unless the user explicitly asks for it. So we do *not* define
31518 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
31519 we have builtins invoked by mmintrin.h that give us license to emit
31520 these sorts of instructions. */
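/* For example, mmintrin.h's _mm_set_pi32 is (roughly) a call to
   __builtin_ia32_vec_init_v2si, which is expanded by this function.  */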
31522 static rtx
31523 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
31525 enum machine_mode tmode = TYPE_MODE (type);
31526 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
31527 int i, n_elt = GET_MODE_NUNITS (tmode);
31528 rtvec v = rtvec_alloc (n_elt);
31530 gcc_assert (VECTOR_MODE_P (tmode));
31531 gcc_assert (call_expr_nargs (exp) == n_elt);
31533 for (i = 0; i < n_elt; ++i)
31535 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
31536 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
31539 if (!target || !register_operand (target, tmode))
31540 target = gen_reg_rtx (tmode);
31542 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
31543 return target;
31546 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31547 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
31548 had a language-level syntax for referencing vector elements. */
31550 static rtx
31551 ix86_expand_vec_ext_builtin (tree exp, rtx target)
31553 enum machine_mode tmode, mode0;
31554 tree arg0, arg1;
31555 int elt;
31556 rtx op0;
31558 arg0 = CALL_EXPR_ARG (exp, 0);
31559 arg1 = CALL_EXPR_ARG (exp, 1);
31561 op0 = expand_normal (arg0);
31562 elt = get_element_number (TREE_TYPE (arg0), arg1);
31564 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31565 mode0 = TYPE_MODE (TREE_TYPE (arg0));
31566 gcc_assert (VECTOR_MODE_P (mode0));
31568 op0 = force_reg (mode0, op0);
31570 if (optimize || !target || !register_operand (target, tmode))
31571 target = gen_reg_rtx (tmode);
31573 ix86_expand_vector_extract (true, target, op0, elt);
31575 return target;
31578 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
31579 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
31580 a language-level syntax for referencing vector elements. */
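/* For example, _mm_insert_epi16 from emmintrin.h is (roughly) a call
   to __builtin_ia32_vec_set_v8hi, which ends up being expanded here.  */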
31582 static rtx
31583 ix86_expand_vec_set_builtin (tree exp)
31585 enum machine_mode tmode, mode1;
31586 tree arg0, arg1, arg2;
31587 int elt;
31588 rtx op0, op1, target;
31590 arg0 = CALL_EXPR_ARG (exp, 0);
31591 arg1 = CALL_EXPR_ARG (exp, 1);
31592 arg2 = CALL_EXPR_ARG (exp, 2);
31594 tmode = TYPE_MODE (TREE_TYPE (arg0));
31595 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
31596 gcc_assert (VECTOR_MODE_P (tmode));
31598 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
31599 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
31600 elt = get_element_number (TREE_TYPE (arg0), arg2);
31602 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
31603 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
31605 op0 = force_reg (tmode, op0);
31606 op1 = force_reg (mode1, op1);
31608 /* OP0 is the source of these builtin functions and shouldn't be
31609 modified. Create a copy, use it, and return it as the target. */
31610 target = gen_reg_rtx (tmode);
31611 emit_move_insn (target, op0);
31612 ix86_expand_vector_set (true, target, op1, elt);
31614 return target;
31617 /* Expand an expression EXP that calls a built-in function,
31618 with result going to TARGET if that's convenient
31619 (and in mode MODE if that's convenient).
31620 SUBTARGET may be used as the target for computing one of EXP's operands.
31621 IGNORE is nonzero if the value is to be ignored. */
31623 static rtx
31624 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
31625 enum machine_mode mode ATTRIBUTE_UNUSED,
31626 int ignore ATTRIBUTE_UNUSED)
31628 const struct builtin_description *d;
31629 size_t i;
31630 enum insn_code icode;
31631 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
31632 tree arg0, arg1, arg2, arg3, arg4;
31633 rtx op0, op1, op2, op3, op4, pat, insn;
31634 enum machine_mode mode0, mode1, mode2, mode3, mode4;
31635 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
31637 /* For CPU builtins that can be folded, fold first and expand the fold. */
31638 switch (fcode)
31640 case IX86_BUILTIN_CPU_INIT:
31642 /* Make it call __cpu_indicator_init in libgcc. */
31643 tree call_expr, fndecl, type;
31644 type = build_function_type_list (integer_type_node, NULL_TREE);
31645 fndecl = build_fn_decl ("__cpu_indicator_init", type);
31646 call_expr = build_call_expr (fndecl, 0);
31647 return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
31649 case IX86_BUILTIN_CPU_IS:
31650 case IX86_BUILTIN_CPU_SUPPORTS:
31652 tree arg0 = CALL_EXPR_ARG (exp, 0);
31653 tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
31654 gcc_assert (fold_expr != NULL_TREE);
31655 return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
31659 /* Determine whether the builtin function is available under the current ISA.
31660 Originally the builtin was not created if it wasn't applicable to the
31661 current ISA based on the command line switches. With function specific
31662 options, we need to check in the context of the function making the call
31663 whether it is supported. */
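/* For example, a builtin such as __builtin_ia32_sqrtpd256 can be used
   from a function compiled with __attribute__ ((target ("avx"))) even
   if -mavx was not given on the command line, while calling it from a
   function without AVX enabled hits the "needs isa option" error
   below.  */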
31664 if (ix86_builtins_isa[fcode].isa
31665 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
31667 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
31668 NULL, (enum fpmath_unit) 0, false);
31670 if (!opts)
31671 error ("%qE needs unknown isa option", fndecl);
31672 else
31674 gcc_assert (opts != NULL);
31675 error ("%qE needs isa option %s", fndecl, opts);
31676 free (opts);
31678 return const0_rtx;
31681 switch (fcode)
31683 case IX86_BUILTIN_MASKMOVQ:
31684 case IX86_BUILTIN_MASKMOVDQU:
31685 icode = (fcode == IX86_BUILTIN_MASKMOVQ
31686 ? CODE_FOR_mmx_maskmovq
31687 : CODE_FOR_sse2_maskmovdqu);
31688 /* Note the arg order is different from the operand order. */
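/* The builtins take the pointer as their last argument, while the
   insn patterns want the destination MEM as operand 0, hence the
   reshuffling of arg0/arg1/arg2 below.  */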
31689 arg1 = CALL_EXPR_ARG (exp, 0);
31690 arg2 = CALL_EXPR_ARG (exp, 1);
31691 arg0 = CALL_EXPR_ARG (exp, 2);
31692 op0 = expand_normal (arg0);
31693 op1 = expand_normal (arg1);
31694 op2 = expand_normal (arg2);
31695 mode0 = insn_data[icode].operand[0].mode;
31696 mode1 = insn_data[icode].operand[1].mode;
31697 mode2 = insn_data[icode].operand[2].mode;
31699 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31700 op0 = gen_rtx_MEM (mode1, op0);
31702 if (!insn_data[icode].operand[0].predicate (op0, mode0))
31703 op0 = copy_to_mode_reg (mode0, op0);
31704 if (!insn_data[icode].operand[1].predicate (op1, mode1))
31705 op1 = copy_to_mode_reg (mode1, op1);
31706 if (!insn_data[icode].operand[2].predicate (op2, mode2))
31707 op2 = copy_to_mode_reg (mode2, op2);
31708 pat = GEN_FCN (icode) (op0, op1, op2);
31709 if (! pat)
31710 return 0;
31711 emit_insn (pat);
31712 return 0;
31714 case IX86_BUILTIN_LDMXCSR:
31715 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
31716 target = assign_386_stack_local (SImode, SLOT_TEMP);
31717 emit_move_insn (target, op0);
31718 emit_insn (gen_sse_ldmxcsr (target));
31719 return 0;
31721 case IX86_BUILTIN_STMXCSR:
31722 target = assign_386_stack_local (SImode, SLOT_TEMP);
31723 emit_insn (gen_sse_stmxcsr (target));
31724 return copy_to_mode_reg (SImode, target);
31726 case IX86_BUILTIN_CLFLUSH:
31727 arg0 = CALL_EXPR_ARG (exp, 0);
31728 op0 = expand_normal (arg0);
31729 icode = CODE_FOR_sse2_clflush;
31730 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
31731 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31733 emit_insn (gen_sse2_clflush (op0));
31734 return 0;
31736 case IX86_BUILTIN_MONITOR:
31737 arg0 = CALL_EXPR_ARG (exp, 0);
31738 arg1 = CALL_EXPR_ARG (exp, 1);
31739 arg2 = CALL_EXPR_ARG (exp, 2);
31740 op0 = expand_normal (arg0);
31741 op1 = expand_normal (arg1);
31742 op2 = expand_normal (arg2);
31743 if (!REG_P (op0))
31744 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31745 if (!REG_P (op1))
31746 op1 = copy_to_mode_reg (SImode, op1);
31747 if (!REG_P (op2))
31748 op2 = copy_to_mode_reg (SImode, op2);
31749 emit_insn (ix86_gen_monitor (op0, op1, op2));
31750 return 0;
31752 case IX86_BUILTIN_MWAIT:
31753 arg0 = CALL_EXPR_ARG (exp, 0);
31754 arg1 = CALL_EXPR_ARG (exp, 1);
31755 op0 = expand_normal (arg0);
31756 op1 = expand_normal (arg1);
31757 if (!REG_P (op0))
31758 op0 = copy_to_mode_reg (SImode, op0);
31759 if (!REG_P (op1))
31760 op1 = copy_to_mode_reg (SImode, op1);
31761 emit_insn (gen_sse3_mwait (op0, op1));
31762 return 0;
31764 case IX86_BUILTIN_VEC_INIT_V2SI:
31765 case IX86_BUILTIN_VEC_INIT_V4HI:
31766 case IX86_BUILTIN_VEC_INIT_V8QI:
31767 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
31769 case IX86_BUILTIN_VEC_EXT_V2DF:
31770 case IX86_BUILTIN_VEC_EXT_V2DI:
31771 case IX86_BUILTIN_VEC_EXT_V4SF:
31772 case IX86_BUILTIN_VEC_EXT_V4SI:
31773 case IX86_BUILTIN_VEC_EXT_V8HI:
31774 case IX86_BUILTIN_VEC_EXT_V2SI:
31775 case IX86_BUILTIN_VEC_EXT_V4HI:
31776 case IX86_BUILTIN_VEC_EXT_V16QI:
31777 return ix86_expand_vec_ext_builtin (exp, target);
31779 case IX86_BUILTIN_VEC_SET_V2DI:
31780 case IX86_BUILTIN_VEC_SET_V4SF:
31781 case IX86_BUILTIN_VEC_SET_V4SI:
31782 case IX86_BUILTIN_VEC_SET_V8HI:
31783 case IX86_BUILTIN_VEC_SET_V4HI:
31784 case IX86_BUILTIN_VEC_SET_V16QI:
31785 return ix86_expand_vec_set_builtin (exp);
31787 case IX86_BUILTIN_INFQ:
31788 case IX86_BUILTIN_HUGE_VALQ:
31790 REAL_VALUE_TYPE inf;
31791 rtx tmp;
31793 real_inf (&inf);
31794 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
31796 tmp = validize_mem (force_const_mem (mode, tmp));
31798 if (target == 0)
31799 target = gen_reg_rtx (mode);
31801 emit_move_insn (target, tmp);
31802 return target;
31805 case IX86_BUILTIN_RDPMC:
31806 case IX86_BUILTIN_RDTSC:
31807 case IX86_BUILTIN_RDTSCP:
31809 op0 = gen_reg_rtx (DImode);
31810 op1 = gen_reg_rtx (DImode);
31812 if (fcode == IX86_BUILTIN_RDPMC)
31814 arg0 = CALL_EXPR_ARG (exp, 0);
31815 op2 = expand_normal (arg0);
31816 if (!register_operand (op2, SImode))
31817 op2 = copy_to_mode_reg (SImode, op2);
31819 insn = (TARGET_64BIT
31820 ? gen_rdpmc_rex64 (op0, op1, op2)
31821 : gen_rdpmc (op0, op2));
31822 emit_insn (insn);
31824 else if (fcode == IX86_BUILTIN_RDTSC)
31826 insn = (TARGET_64BIT
31827 ? gen_rdtsc_rex64 (op0, op1)
31828 : gen_rdtsc (op0));
31829 emit_insn (insn);
31831 else
31833 op2 = gen_reg_rtx (SImode);
31835 insn = (TARGET_64BIT
31836 ? gen_rdtscp_rex64 (op0, op1, op2)
31837 : gen_rdtscp (op0, op2));
31838 emit_insn (insn);
31840 arg0 = CALL_EXPR_ARG (exp, 0);
31841 op4 = expand_normal (arg0);
31842 if (!address_operand (op4, VOIDmode))
31844 op4 = convert_memory_address (Pmode, op4);
31845 op4 = copy_addr_to_reg (op4);
31847 emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
31850 if (target == 0)
31852 /* mode is VOIDmode if __builtin_rd* has been called
31853 without lhs. */
31854 if (mode == VOIDmode)
31855 return target;
31856 target = gen_reg_rtx (mode);
31859 if (TARGET_64BIT)
31861 op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
31862 op1, 1, OPTAB_DIRECT);
31863 op0 = expand_simple_binop (DImode, IOR, op0, op1,
31864 op0, 1, OPTAB_DIRECT);
31867 emit_move_insn (target, op0);
31868 return target;
31870 case IX86_BUILTIN_FXSAVE:
31871 case IX86_BUILTIN_FXRSTOR:
31872 case IX86_BUILTIN_FXSAVE64:
31873 case IX86_BUILTIN_FXRSTOR64:
31874 switch (fcode)
31876 case IX86_BUILTIN_FXSAVE:
31877 icode = CODE_FOR_fxsave;
31878 break;
31879 case IX86_BUILTIN_FXRSTOR:
31880 icode = CODE_FOR_fxrstor;
31881 break;
31882 case IX86_BUILTIN_FXSAVE64:
31883 icode = CODE_FOR_fxsave64;
31884 break;
31885 case IX86_BUILTIN_FXRSTOR64:
31886 icode = CODE_FOR_fxrstor64;
31887 break;
31888 default:
31889 gcc_unreachable ();
31892 arg0 = CALL_EXPR_ARG (exp, 0);
31893 op0 = expand_normal (arg0);
31895 if (!address_operand (op0, VOIDmode))
31897 op0 = convert_memory_address (Pmode, op0);
31898 op0 = copy_addr_to_reg (op0);
31900 op0 = gen_rtx_MEM (BLKmode, op0);
31902 pat = GEN_FCN (icode) (op0);
31903 if (pat)
31904 emit_insn (pat);
31905 return 0;
31907 case IX86_BUILTIN_XSAVE:
31908 case IX86_BUILTIN_XRSTOR:
31909 case IX86_BUILTIN_XSAVE64:
31910 case IX86_BUILTIN_XRSTOR64:
31911 case IX86_BUILTIN_XSAVEOPT:
31912 case IX86_BUILTIN_XSAVEOPT64:
31913 arg0 = CALL_EXPR_ARG (exp, 0);
31914 arg1 = CALL_EXPR_ARG (exp, 1);
31915 op0 = expand_normal (arg0);
31916 op1 = expand_normal (arg1);
31918 if (!address_operand (op0, VOIDmode))
31920 op0 = convert_memory_address (Pmode, op0);
31921 op0 = copy_addr_to_reg (op0);
31923 op0 = gen_rtx_MEM (BLKmode, op0);
31925 op1 = force_reg (DImode, op1);
31927 if (TARGET_64BIT)
31929 op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
31930 NULL, 1, OPTAB_DIRECT);
31931 switch (fcode)
31933 case IX86_BUILTIN_XSAVE:
31934 icode = CODE_FOR_xsave_rex64;
31935 break;
31936 case IX86_BUILTIN_XRSTOR:
31937 icode = CODE_FOR_xrstor_rex64;
31938 break;
31939 case IX86_BUILTIN_XSAVE64:
31940 icode = CODE_FOR_xsave64;
31941 break;
31942 case IX86_BUILTIN_XRSTOR64:
31943 icode = CODE_FOR_xrstor64;
31944 break;
31945 case IX86_BUILTIN_XSAVEOPT:
31946 icode = CODE_FOR_xsaveopt_rex64;
31947 break;
31948 case IX86_BUILTIN_XSAVEOPT64:
31949 icode = CODE_FOR_xsaveopt64;
31950 break;
31951 default:
31952 gcc_unreachable ();
31955 op2 = gen_lowpart (SImode, op2);
31956 op1 = gen_lowpart (SImode, op1);
31957 pat = GEN_FCN (icode) (op0, op1, op2);
31959 else
31961 switch (fcode)
31963 case IX86_BUILTIN_XSAVE:
31964 icode = CODE_FOR_xsave;
31965 break;
31966 case IX86_BUILTIN_XRSTOR:
31967 icode = CODE_FOR_xrstor;
31968 break;
31969 case IX86_BUILTIN_XSAVEOPT:
31970 icode = CODE_FOR_xsaveopt;
31971 break;
31972 default:
31973 gcc_unreachable ();
31975 pat = GEN_FCN (icode) (op0, op1);
31978 if (pat)
31979 emit_insn (pat);
31980 return 0;
31982 case IX86_BUILTIN_LLWPCB:
31983 arg0 = CALL_EXPR_ARG (exp, 0);
31984 op0 = expand_normal (arg0);
31985 icode = CODE_FOR_lwp_llwpcb;
31986 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
31987 op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
31988 emit_insn (gen_lwp_llwpcb (op0));
31989 return 0;
31991 case IX86_BUILTIN_SLWPCB:
31992 icode = CODE_FOR_lwp_slwpcb;
31993 if (!target
31994 || !insn_data[icode].operand[0].predicate (target, Pmode))
31995 target = gen_reg_rtx (Pmode);
31996 emit_insn (gen_lwp_slwpcb (target));
31997 return target;
31999 case IX86_BUILTIN_BEXTRI32:
32000 case IX86_BUILTIN_BEXTRI64:
32001 arg0 = CALL_EXPR_ARG (exp, 0);
32002 arg1 = CALL_EXPR_ARG (exp, 1);
32003 op0 = expand_normal (arg0);
32004 op1 = expand_normal (arg1);
32005 icode = (fcode == IX86_BUILTIN_BEXTRI32
32006 ? CODE_FOR_tbm_bextri_si
32007 : CODE_FOR_tbm_bextri_di);
32008 if (!CONST_INT_P (op1))
32010 error ("last argument must be an immediate");
32011 return const0_rtx;
32013 else
32015 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
32016 unsigned char lsb_index = INTVAL (op1) & 0xFF;
32017 op1 = GEN_INT (length);
32018 op2 = GEN_INT (lsb_index);
32019 pat = GEN_FCN (icode) (target, op0, op1, op2);
32020 if (pat)
32021 emit_insn (pat);
32022 return target;
32025 case IX86_BUILTIN_RDRAND16_STEP:
32026 icode = CODE_FOR_rdrandhi_1;
32027 mode0 = HImode;
32028 goto rdrand_step;
32030 case IX86_BUILTIN_RDRAND32_STEP:
32031 icode = CODE_FOR_rdrandsi_1;
32032 mode0 = SImode;
32033 goto rdrand_step;
32035 case IX86_BUILTIN_RDRAND64_STEP:
32036 icode = CODE_FOR_rdranddi_1;
32037 mode0 = DImode;
32039 rdrand_step:
32040 op0 = gen_reg_rtx (mode0);
32041 emit_insn (GEN_FCN (icode) (op0));
32043 arg0 = CALL_EXPR_ARG (exp, 0);
32044 op1 = expand_normal (arg0);
32045 if (!address_operand (op1, VOIDmode))
32047 op1 = convert_memory_address (Pmode, op1);
32048 op1 = copy_addr_to_reg (op1);
32050 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
32052 op1 = gen_reg_rtx (SImode);
32053 emit_move_insn (op1, CONST1_RTX (SImode));
32055 /* Emit SImode conditional move. */
32056 if (mode0 == HImode)
32058 op2 = gen_reg_rtx (SImode);
32059 emit_insn (gen_zero_extendhisi2 (op2, op0));
32061 else if (mode0 == SImode)
32062 op2 = op0;
32063 else
32064 op2 = gen_rtx_SUBREG (SImode, op0, 0);
32066 if (target == 0
32067 || !register_operand (target, SImode))
32068 target = gen_reg_rtx (SImode);
32070 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
32071 const0_rtx);
32072 emit_insn (gen_rtx_SET (VOIDmode, target,
32073 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
32074 return target;
32076 case IX86_BUILTIN_RDSEED16_STEP:
32077 icode = CODE_FOR_rdseedhi_1;
32078 mode0 = HImode;
32079 goto rdseed_step;
32081 case IX86_BUILTIN_RDSEED32_STEP:
32082 icode = CODE_FOR_rdseedsi_1;
32083 mode0 = SImode;
32084 goto rdseed_step;
32086 case IX86_BUILTIN_RDSEED64_STEP:
32087 icode = CODE_FOR_rdseeddi_1;
32088 mode0 = DImode;
32090 rdseed_step:
32091 op0 = gen_reg_rtx (mode0);
32092 emit_insn (GEN_FCN (icode) (op0));
32094 arg0 = CALL_EXPR_ARG (exp, 0);
32095 op1 = expand_normal (arg0);
32096 if (!address_operand (op1, VOIDmode))
32098 op1 = convert_memory_address (Pmode, op1);
32099 op1 = copy_addr_to_reg (op1);
32101 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
32103 op2 = gen_reg_rtx (QImode);
32105 pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
32106 const0_rtx);
32107 emit_insn (gen_rtx_SET (VOIDmode, op2, pat));
32109 if (target == 0
32110 || !register_operand (target, SImode))
32111 target = gen_reg_rtx (SImode);
32113 emit_insn (gen_zero_extendqisi2 (target, op2));
32114 return target;
32116 case IX86_BUILTIN_ADDCARRYX32:
32117 icode = TARGET_ADX ? CODE_FOR_adcxsi3 : CODE_FOR_addsi3_carry;
32118 mode0 = SImode;
32119 goto addcarryx;
32121 case IX86_BUILTIN_ADDCARRYX64:
32122 icode = TARGET_ADX ? CODE_FOR_adcxdi3 : CODE_FOR_adddi3_carry;
32123 mode0 = DImode;
32125 addcarryx:
32126 arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
32127 arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
32128 arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
32129 arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
32131 op0 = gen_reg_rtx (QImode);
32133 /* Generate CF from input operand. */
32134 op1 = expand_normal (arg0);
32135 op1 = copy_to_mode_reg (QImode, convert_to_mode (QImode, op1, 1));
32136 emit_insn (gen_addqi3_cc (op0, op1, constm1_rtx));
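/* The constant -1 (0xff in QImode) overflows the addition exactly when
   the carry-in is nonzero, so CF now holds the logical carry-in.  */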
32138 /* Generate an ADCX (or plain ADC) instruction to compute X+Y+CF. */
32139 op2 = expand_normal (arg1);
32140 op3 = expand_normal (arg2);
32142 if (!REG_P (op2))
32143 op2 = copy_to_mode_reg (mode0, op2);
32144 if (!REG_P (op3))
32145 op3 = copy_to_mode_reg (mode0, op3);
32147 op0 = gen_reg_rtx (mode0);
32149 op4 = gen_rtx_REG (CCCmode, FLAGS_REG);
32150 pat = gen_rtx_LTU (VOIDmode, op4, const0_rtx);
32151 emit_insn (GEN_FCN (icode) (op0, op2, op3, op4, pat));
32153 /* Store the result. */
32154 op4 = expand_normal (arg3);
32155 if (!address_operand (op4, VOIDmode))
32157 op4 = convert_memory_address (Pmode, op4);
32158 op4 = copy_addr_to_reg (op4);
32160 emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
32162 /* Return current CF value. */
32163 if (target == 0)
32164 target = gen_reg_rtx (QImode);
32166 PUT_MODE (pat, QImode);
32167 emit_insn (gen_rtx_SET (VOIDmode, target, pat));
32168 return target;
32170 case IX86_BUILTIN_GATHERSIV2DF:
32171 icode = CODE_FOR_avx2_gathersiv2df;
32172 goto gather_gen;
32173 case IX86_BUILTIN_GATHERSIV4DF:
32174 icode = CODE_FOR_avx2_gathersiv4df;
32175 goto gather_gen;
32176 case IX86_BUILTIN_GATHERDIV2DF:
32177 icode = CODE_FOR_avx2_gatherdiv2df;
32178 goto gather_gen;
32179 case IX86_BUILTIN_GATHERDIV4DF:
32180 icode = CODE_FOR_avx2_gatherdiv4df;
32181 goto gather_gen;
32182 case IX86_BUILTIN_GATHERSIV4SF:
32183 icode = CODE_FOR_avx2_gathersiv4sf;
32184 goto gather_gen;
32185 case IX86_BUILTIN_GATHERSIV8SF:
32186 icode = CODE_FOR_avx2_gathersiv8sf;
32187 goto gather_gen;
32188 case IX86_BUILTIN_GATHERDIV4SF:
32189 icode = CODE_FOR_avx2_gatherdiv4sf;
32190 goto gather_gen;
32191 case IX86_BUILTIN_GATHERDIV8SF:
32192 icode = CODE_FOR_avx2_gatherdiv8sf;
32193 goto gather_gen;
32194 case IX86_BUILTIN_GATHERSIV2DI:
32195 icode = CODE_FOR_avx2_gathersiv2di;
32196 goto gather_gen;
32197 case IX86_BUILTIN_GATHERSIV4DI:
32198 icode = CODE_FOR_avx2_gathersiv4di;
32199 goto gather_gen;
32200 case IX86_BUILTIN_GATHERDIV2DI:
32201 icode = CODE_FOR_avx2_gatherdiv2di;
32202 goto gather_gen;
32203 case IX86_BUILTIN_GATHERDIV4DI:
32204 icode = CODE_FOR_avx2_gatherdiv4di;
32205 goto gather_gen;
32206 case IX86_BUILTIN_GATHERSIV4SI:
32207 icode = CODE_FOR_avx2_gathersiv4si;
32208 goto gather_gen;
32209 case IX86_BUILTIN_GATHERSIV8SI:
32210 icode = CODE_FOR_avx2_gathersiv8si;
32211 goto gather_gen;
32212 case IX86_BUILTIN_GATHERDIV4SI:
32213 icode = CODE_FOR_avx2_gatherdiv4si;
32214 goto gather_gen;
32215 case IX86_BUILTIN_GATHERDIV8SI:
32216 icode = CODE_FOR_avx2_gatherdiv8si;
32217 goto gather_gen;
32218 case IX86_BUILTIN_GATHERALTSIV4DF:
32219 icode = CODE_FOR_avx2_gathersiv4df;
32220 goto gather_gen;
32221 case IX86_BUILTIN_GATHERALTDIV8SF:
32222 icode = CODE_FOR_avx2_gatherdiv8sf;
32223 goto gather_gen;
32224 case IX86_BUILTIN_GATHERALTSIV4DI:
32225 icode = CODE_FOR_avx2_gathersiv4di;
32226 goto gather_gen;
32227 case IX86_BUILTIN_GATHERALTDIV8SI:
32228 icode = CODE_FOR_avx2_gatherdiv8si;
32229 goto gather_gen;
32231 gather_gen:
32232 arg0 = CALL_EXPR_ARG (exp, 0);
32233 arg1 = CALL_EXPR_ARG (exp, 1);
32234 arg2 = CALL_EXPR_ARG (exp, 2);
32235 arg3 = CALL_EXPR_ARG (exp, 3);
32236 arg4 = CALL_EXPR_ARG (exp, 4);
32237 op0 = expand_normal (arg0);
32238 op1 = expand_normal (arg1);
32239 op2 = expand_normal (arg2);
32240 op3 = expand_normal (arg3);
32241 op4 = expand_normal (arg4);
32242 /* Note the arg order is different from the operand order. */
32243 mode0 = insn_data[icode].operand[1].mode;
32244 mode2 = insn_data[icode].operand[3].mode;
32245 mode3 = insn_data[icode].operand[4].mode;
32246 mode4 = insn_data[icode].operand[5].mode;
32248 if (target == NULL_RTX
32249 || GET_MODE (target) != insn_data[icode].operand[0].mode
32250 || !insn_data[icode].operand[0].predicate (target,
32251 GET_MODE (target)))
32252 subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
32253 else
32254 subtarget = target;
32256 if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
32257 || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
32259 rtx half = gen_reg_rtx (V4SImode);
32260 if (!nonimmediate_operand (op2, V8SImode))
32261 op2 = copy_to_mode_reg (V8SImode, op2);
32262 emit_insn (gen_vec_extract_lo_v8si (half, op2));
32263 op2 = half;
32265 else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
32266 || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
32268 rtx (*gen) (rtx, rtx);
32269 rtx half = gen_reg_rtx (mode0);
32270 if (mode0 == V4SFmode)
32271 gen = gen_vec_extract_lo_v8sf;
32272 else
32273 gen = gen_vec_extract_lo_v8si;
32274 if (!nonimmediate_operand (op0, GET_MODE (op0)))
32275 op0 = copy_to_mode_reg (GET_MODE (op0), op0);
32276 emit_insn (gen (half, op0));
32277 op0 = half;
32278 if (!nonimmediate_operand (op3, GET_MODE (op3)))
32279 op3 = copy_to_mode_reg (GET_MODE (op3), op3);
32280 emit_insn (gen (half, op3));
32281 op3 = half;
32284 /* Force the memory operand to use only a base register here. But we
32285 don't want to do that to the memory operands of other builtin
32286 functions. */
32287 op1 = force_reg (Pmode, convert_to_mode (Pmode, op1, 1));
32289 if (!insn_data[icode].operand[1].predicate (op0, mode0))
32290 op0 = copy_to_mode_reg (mode0, op0);
32291 if (!insn_data[icode].operand[2].predicate (op1, Pmode))
32292 op1 = copy_to_mode_reg (Pmode, op1);
32293 if (!insn_data[icode].operand[3].predicate (op2, mode2))
32294 op2 = copy_to_mode_reg (mode2, op2);
32295 if (!insn_data[icode].operand[4].predicate (op3, mode3))
32296 op3 = copy_to_mode_reg (mode3, op3);
32297 if (!insn_data[icode].operand[5].predicate (op4, mode4))
32299 error ("last argument must be scale 1, 2, 4, 8");
32300 return const0_rtx;
32303 /* Optimize. If mask is known to have all high bits set,
32304 replace op0 with pc_rtx to signal that the instruction
32305 overwrites the whole destination and doesn't use its
32306 previous contents. */
32307 if (optimize)
32309 if (TREE_CODE (arg3) == VECTOR_CST)
32311 unsigned int negative = 0;
32312 for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
32314 tree cst = VECTOR_CST_ELT (arg3, i);
32315 if (TREE_CODE (cst) == INTEGER_CST
32316 && tree_int_cst_sign_bit (cst))
32317 negative++;
32318 else if (TREE_CODE (cst) == REAL_CST
32319 && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
32320 negative++;
32322 if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
32323 op0 = pc_rtx;
32325 else if (TREE_CODE (arg3) == SSA_NAME)
32327 /* Recognize also when mask is like:
32328 __v2df src = _mm_setzero_pd ();
32329 __v2df mask = _mm_cmpeq_pd (src, src);
32331 __v8sf src = _mm256_setzero_ps ();
32332 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
32333 as that is a cheaper way to load all ones into
32334 a register than having to load a constant from
32335 memory. */
32336 gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
32337 if (is_gimple_call (def_stmt))
32339 tree fndecl = gimple_call_fndecl (def_stmt);
32340 if (fndecl
32341 && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
32342 switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
32344 case IX86_BUILTIN_CMPPD:
32345 case IX86_BUILTIN_CMPPS:
32346 case IX86_BUILTIN_CMPPD256:
32347 case IX86_BUILTIN_CMPPS256:
32348 if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
32349 break;
32350 /* FALLTHRU */
32351 case IX86_BUILTIN_CMPEQPD:
32352 case IX86_BUILTIN_CMPEQPS:
32353 if (initializer_zerop (gimple_call_arg (def_stmt, 0))
32354 && initializer_zerop (gimple_call_arg (def_stmt,
32355 1)))
32356 op0 = pc_rtx;
32357 break;
32358 default:
32359 break;
32365 pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
32366 if (! pat)
32367 return const0_rtx;
32368 emit_insn (pat);
32370 if (fcode == IX86_BUILTIN_GATHERDIV8SF
32371 || fcode == IX86_BUILTIN_GATHERDIV8SI)
32373 enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
32374 ? V4SFmode : V4SImode;
32375 if (target == NULL_RTX)
32376 target = gen_reg_rtx (tmode);
32377 if (tmode == V4SFmode)
32378 emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
32379 else
32380 emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
32382 else
32383 target = subtarget;
32385 return target;
32387 case IX86_BUILTIN_XABORT:
32388 icode = CODE_FOR_xabort;
32389 arg0 = CALL_EXPR_ARG (exp, 0);
32390 op0 = expand_normal (arg0);
32391 mode0 = insn_data[icode].operand[0].mode;
32392 if (!insn_data[icode].operand[0].predicate (op0, mode0))
32394 error ("the xabort's argument must be an 8-bit immediate");
32395 return const0_rtx;
32397 emit_insn (gen_xabort (op0));
32398 return 0;
32400 default:
32401 break;
32404 for (i = 0, d = bdesc_special_args;
32405 i < ARRAY_SIZE (bdesc_special_args);
32406 i++, d++)
32407 if (d->code == fcode)
32408 return ix86_expand_special_args_builtin (d, exp, target);
32410 for (i = 0, d = bdesc_args;
32411 i < ARRAY_SIZE (bdesc_args);
32412 i++, d++)
32413 if (d->code == fcode)
32414 switch (fcode)
32416 case IX86_BUILTIN_FABSQ:
32417 case IX86_BUILTIN_COPYSIGNQ:
32418 if (!TARGET_SSE)
32419 /* Emit a normal call if SSE isn't available. */
32420 return expand_call (exp, target, ignore);
32421 default:
32422 return ix86_expand_args_builtin (d, exp, target);
32425 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
32426 if (d->code == fcode)
32427 return ix86_expand_sse_comi (d, exp, target);
32429 for (i = 0, d = bdesc_pcmpestr;
32430 i < ARRAY_SIZE (bdesc_pcmpestr);
32431 i++, d++)
32432 if (d->code == fcode)
32433 return ix86_expand_sse_pcmpestr (d, exp, target);
32435 for (i = 0, d = bdesc_pcmpistr;
32436 i < ARRAY_SIZE (bdesc_pcmpistr);
32437 i++, d++)
32438 if (d->code == fcode)
32439 return ix86_expand_sse_pcmpistr (d, exp, target);
32441 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
32442 if (d->code == fcode)
32443 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
32444 (enum ix86_builtin_func_type)
32445 d->flag, d->comparison);
32447 gcc_unreachable ();
32450 /* Returns a function decl for a vectorized version of the builtin function
32451 with builtin function code FN and the result vector type TYPE, or NULL_TREE
32452 if it is not available. */
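/* For illustration: a request to vectorize BUILT_IN_SQRT with both
type_out and type_in being two-element DFmode vectors yields
ix86_builtins[IX86_BUILTIN_SQRTPD], while the four-element DFmode case
yields the 256-bit IX86_BUILTIN_SQRTPD256.  This is only a sample of the
switch below, not an exhaustive list.  */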
32454 static tree
32455 ix86_builtin_vectorized_function (tree fndecl, tree type_out,
32456 tree type_in)
32458 enum machine_mode in_mode, out_mode;
32459 int in_n, out_n;
32460 enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
32462 if (TREE_CODE (type_out) != VECTOR_TYPE
32463 || TREE_CODE (type_in) != VECTOR_TYPE
32464 || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
32465 return NULL_TREE;
32467 out_mode = TYPE_MODE (TREE_TYPE (type_out));
32468 out_n = TYPE_VECTOR_SUBPARTS (type_out);
32469 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32470 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32472 switch (fn)
32474 case BUILT_IN_SQRT:
32475 if (out_mode == DFmode && in_mode == DFmode)
32477 if (out_n == 2 && in_n == 2)
32478 return ix86_builtins[IX86_BUILTIN_SQRTPD];
32479 else if (out_n == 4 && in_n == 4)
32480 return ix86_builtins[IX86_BUILTIN_SQRTPD256];
32482 break;
32484 case BUILT_IN_SQRTF:
32485 if (out_mode == SFmode && in_mode == SFmode)
32487 if (out_n == 4 && in_n == 4)
32488 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
32489 else if (out_n == 8 && in_n == 8)
32490 return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
32492 break;
32494 case BUILT_IN_IFLOOR:
32495 case BUILT_IN_LFLOOR:
32496 case BUILT_IN_LLFLOOR:
32497 /* The round insn does not trap on denormals. */
32498 if (flag_trapping_math || !TARGET_ROUND)
32499 break;
32501 if (out_mode == SImode && in_mode == DFmode)
32503 if (out_n == 4 && in_n == 2)
32504 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
32505 else if (out_n == 8 && in_n == 4)
32506 return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
32508 break;
32510 case BUILT_IN_IFLOORF:
32511 case BUILT_IN_LFLOORF:
32512 case BUILT_IN_LLFLOORF:
32513 /* The round insn does not trap on denormals. */
32514 if (flag_trapping_math || !TARGET_ROUND)
32515 break;
32517 if (out_mode == SImode && in_mode == SFmode)
32519 if (out_n == 4 && in_n == 4)
32520 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
32521 else if (out_n == 8 && in_n == 8)
32522 return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
32524 break;
32526 case BUILT_IN_ICEIL:
32527 case BUILT_IN_LCEIL:
32528 case BUILT_IN_LLCEIL:
32529 /* The round insn does not trap on denormals. */
32530 if (flag_trapping_math || !TARGET_ROUND)
32531 break;
32533 if (out_mode == SImode && in_mode == DFmode)
32535 if (out_n == 4 && in_n == 2)
32536 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
32537 else if (out_n == 8 && in_n == 4)
32538 return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
32540 break;
32542 case BUILT_IN_ICEILF:
32543 case BUILT_IN_LCEILF:
32544 case BUILT_IN_LLCEILF:
32545 /* The round insn does not trap on denormals. */
32546 if (flag_trapping_math || !TARGET_ROUND)
32547 break;
32549 if (out_mode == SImode && in_mode == SFmode)
32551 if (out_n == 4 && in_n == 4)
32552 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
32553 else if (out_n == 8 && in_n == 8)
32554 return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
32556 break;
32558 case BUILT_IN_IRINT:
32559 case BUILT_IN_LRINT:
32560 case BUILT_IN_LLRINT:
32561 if (out_mode == SImode && in_mode == DFmode)
32563 if (out_n == 4 && in_n == 2)
32564 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
32565 else if (out_n == 8 && in_n == 4)
32566 return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
32568 break;
32570 case BUILT_IN_IRINTF:
32571 case BUILT_IN_LRINTF:
32572 case BUILT_IN_LLRINTF:
32573 if (out_mode == SImode && in_mode == SFmode)
32575 if (out_n == 4 && in_n == 4)
32576 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
32577 else if (out_n == 8 && in_n == 8)
32578 return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
32580 break;
32582 case BUILT_IN_IROUND:
32583 case BUILT_IN_LROUND:
32584 case BUILT_IN_LLROUND:
32585 /* The round insn does not trap on denormals. */
32586 if (flag_trapping_math || !TARGET_ROUND)
32587 break;
32589 if (out_mode == SImode && in_mode == DFmode)
32591 if (out_n == 4 && in_n == 2)
32592 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
32593 else if (out_n == 8 && in_n == 4)
32594 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
32596 break;
32598 case BUILT_IN_IROUNDF:
32599 case BUILT_IN_LROUNDF:
32600 case BUILT_IN_LLROUNDF:
32601 /* The round insn does not trap on denormals. */
32602 if (flag_trapping_math || !TARGET_ROUND)
32603 break;
32605 if (out_mode == SImode && in_mode == SFmode)
32607 if (out_n == 4 && in_n == 4)
32608 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
32609 else if (out_n == 8 && in_n == 8)
32610 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
32612 break;
32614 case BUILT_IN_COPYSIGN:
32615 if (out_mode == DFmode && in_mode == DFmode)
32617 if (out_n == 2 && in_n == 2)
32618 return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
32619 else if (out_n == 4 && in_n == 4)
32620 return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
32622 break;
32624 case BUILT_IN_COPYSIGNF:
32625 if (out_mode == SFmode && in_mode == SFmode)
32627 if (out_n == 4 && in_n == 4)
32628 return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
32629 else if (out_n == 8 && in_n == 8)
32630 return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
32632 break;
32634 case BUILT_IN_FLOOR:
32635 /* The round insn does not trap on denormals. */
32636 if (flag_trapping_math || !TARGET_ROUND)
32637 break;
32639 if (out_mode == DFmode && in_mode == DFmode)
32641 if (out_n == 2 && in_n == 2)
32642 return ix86_builtins[IX86_BUILTIN_FLOORPD];
32643 else if (out_n == 4 && in_n == 4)
32644 return ix86_builtins[IX86_BUILTIN_FLOORPD256];
32646 break;
32648 case BUILT_IN_FLOORF:
32649 /* The round insn does not trap on denormals. */
32650 if (flag_trapping_math || !TARGET_ROUND)
32651 break;
32653 if (out_mode == SFmode && in_mode == SFmode)
32655 if (out_n == 4 && in_n == 4)
32656 return ix86_builtins[IX86_BUILTIN_FLOORPS];
32657 else if (out_n == 8 && in_n == 8)
32658 return ix86_builtins[IX86_BUILTIN_FLOORPS256];
32660 break;
32662 case BUILT_IN_CEIL:
32663 /* The round insn does not trap on denormals. */
32664 if (flag_trapping_math || !TARGET_ROUND)
32665 break;
32667 if (out_mode == DFmode && in_mode == DFmode)
32669 if (out_n == 2 && in_n == 2)
32670 return ix86_builtins[IX86_BUILTIN_CEILPD];
32671 else if (out_n == 4 && in_n == 4)
32672 return ix86_builtins[IX86_BUILTIN_CEILPD256];
32674 break;
32676 case BUILT_IN_CEILF:
32677 /* The round insn does not trap on denormals. */
32678 if (flag_trapping_math || !TARGET_ROUND)
32679 break;
32681 if (out_mode == SFmode && in_mode == SFmode)
32683 if (out_n == 4 && in_n == 4)
32684 return ix86_builtins[IX86_BUILTIN_CEILPS];
32685 else if (out_n == 8 && in_n == 8)
32686 return ix86_builtins[IX86_BUILTIN_CEILPS256];
32688 break;
32690 case BUILT_IN_TRUNC:
32691 /* The round insn does not trap on denormals. */
32692 if (flag_trapping_math || !TARGET_ROUND)
32693 break;
32695 if (out_mode == DFmode && in_mode == DFmode)
32697 if (out_n == 2 && in_n == 2)
32698 return ix86_builtins[IX86_BUILTIN_TRUNCPD];
32699 else if (out_n == 4 && in_n == 4)
32700 return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
32702 break;
32704 case BUILT_IN_TRUNCF:
32705 /* The round insn does not trap on denormals. */
32706 if (flag_trapping_math || !TARGET_ROUND)
32707 break;
32709 if (out_mode == SFmode && in_mode == SFmode)
32711 if (out_n == 4 && in_n == 4)
32712 return ix86_builtins[IX86_BUILTIN_TRUNCPS];
32713 else if (out_n == 8 && in_n == 8)
32714 return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
32716 break;
32718 case BUILT_IN_RINT:
32719 /* The round insn does not trap on denormals. */
32720 if (flag_trapping_math || !TARGET_ROUND)
32721 break;
32723 if (out_mode == DFmode && in_mode == DFmode)
32725 if (out_n == 2 && in_n == 2)
32726 return ix86_builtins[IX86_BUILTIN_RINTPD];
32727 else if (out_n == 4 && in_n == 4)
32728 return ix86_builtins[IX86_BUILTIN_RINTPD256];
32730 break;
32732 case BUILT_IN_RINTF:
32733 /* The round insn does not trap on denormals. */
32734 if (flag_trapping_math || !TARGET_ROUND)
32735 break;
32737 if (out_mode == SFmode && in_mode == SFmode)
32739 if (out_n == 4 && in_n == 4)
32740 return ix86_builtins[IX86_BUILTIN_RINTPS];
32741 else if (out_n == 8 && in_n == 8)
32742 return ix86_builtins[IX86_BUILTIN_RINTPS256];
32744 break;
32746 case BUILT_IN_ROUND:
32747 /* The round insn does not trap on denormals. */
32748 if (flag_trapping_math || !TARGET_ROUND)
32749 break;
32751 if (out_mode == DFmode && in_mode == DFmode)
32753 if (out_n == 2 && in_n == 2)
32754 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ];
32755 else if (out_n == 4 && in_n == 4)
32756 return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ256];
32758 break;
32760 case BUILT_IN_ROUNDF:
32761 /* The round insn does not trap on denormals. */
32762 if (flag_trapping_math || !TARGET_ROUND)
32763 break;
32765 if (out_mode == SFmode && in_mode == SFmode)
32767 if (out_n == 4 && in_n == 4)
32768 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ];
32769 else if (out_n == 8 && in_n == 8)
32770 return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ256];
32772 break;
32774 case BUILT_IN_FMA:
32775 if (out_mode == DFmode && in_mode == DFmode)
32777 if (out_n == 2 && in_n == 2)
32778 return ix86_builtins[IX86_BUILTIN_VFMADDPD];
32779 if (out_n == 4 && in_n == 4)
32780 return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
32782 break;
32784 case BUILT_IN_FMAF:
32785 if (out_mode == SFmode && in_mode == SFmode)
32787 if (out_n == 4 && in_n == 4)
32788 return ix86_builtins[IX86_BUILTIN_VFMADDPS];
32789 if (out_n == 8 && in_n == 8)
32790 return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
32792 break;
32794 default:
32795 break;
32798 /* Dispatch to a handler for a vectorization library. */
32799 if (ix86_veclib_handler)
32800 return ix86_veclib_handler ((enum built_in_function) fn, type_out,
32801 type_in);
32803 return NULL_TREE;
32806 /* Handler for an SVML-style interface to
32807 a library with vectorized intrinsics. */
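/* Name mangling example (the "__builtin_" prefix is 10 characters, so
bname+10 skips it): for BUILT_IN_SINF with a 4-lane SFmode vector the
code builds "vmlssinf", overwrites the last character with '4' and
clears the 0x20 bit of name[4] to upper-case it, giving "vmlsSin4".
BUILT_IN_LOG and BUILT_IN_LOGF are special-cased to "vmldLn2" and
"vmlsLn4".  */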
32809 static tree
32810 ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
32812 char name[20];
32813 tree fntype, new_fndecl, args;
32814 unsigned arity;
32815 const char *bname;
32816 enum machine_mode el_mode, in_mode;
32817 int n, in_n;
32819 /* The SVML is suitable for unsafe math only. */
32820 if (!flag_unsafe_math_optimizations)
32821 return NULL_TREE;
32823 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32824 n = TYPE_VECTOR_SUBPARTS (type_out);
32825 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32826 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32827 if (el_mode != in_mode
32828 || n != in_n)
32829 return NULL_TREE;
32831 switch (fn)
32833 case BUILT_IN_EXP:
32834 case BUILT_IN_LOG:
32835 case BUILT_IN_LOG10:
32836 case BUILT_IN_POW:
32837 case BUILT_IN_TANH:
32838 case BUILT_IN_TAN:
32839 case BUILT_IN_ATAN:
32840 case BUILT_IN_ATAN2:
32841 case BUILT_IN_ATANH:
32842 case BUILT_IN_CBRT:
32843 case BUILT_IN_SINH:
32844 case BUILT_IN_SIN:
32845 case BUILT_IN_ASINH:
32846 case BUILT_IN_ASIN:
32847 case BUILT_IN_COSH:
32848 case BUILT_IN_COS:
32849 case BUILT_IN_ACOSH:
32850 case BUILT_IN_ACOS:
32851 if (el_mode != DFmode || n != 2)
32852 return NULL_TREE;
32853 break;
32855 case BUILT_IN_EXPF:
32856 case BUILT_IN_LOGF:
32857 case BUILT_IN_LOG10F:
32858 case BUILT_IN_POWF:
32859 case BUILT_IN_TANHF:
32860 case BUILT_IN_TANF:
32861 case BUILT_IN_ATANF:
32862 case BUILT_IN_ATAN2F:
32863 case BUILT_IN_ATANHF:
32864 case BUILT_IN_CBRTF:
32865 case BUILT_IN_SINHF:
32866 case BUILT_IN_SINF:
32867 case BUILT_IN_ASINHF:
32868 case BUILT_IN_ASINF:
32869 case BUILT_IN_COSHF:
32870 case BUILT_IN_COSF:
32871 case BUILT_IN_ACOSHF:
32872 case BUILT_IN_ACOSF:
32873 if (el_mode != SFmode || n != 4)
32874 return NULL_TREE;
32875 break;
32877 default:
32878 return NULL_TREE;
32881 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
32883 if (fn == BUILT_IN_LOGF)
32884 strcpy (name, "vmlsLn4");
32885 else if (fn == BUILT_IN_LOG)
32886 strcpy (name, "vmldLn2");
32887 else if (n == 4)
32889 sprintf (name, "vmls%s", bname+10);
32890 name[strlen (name)-1] = '4';
32892 else
32893 sprintf (name, "vmld%s2", bname+10);
32895 /* Convert to uppercase. */
32896 name[4] &= ~0x20;
32898 arity = 0;
32899 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
32900 args;
32901 args = TREE_CHAIN (args))
32902 arity++;
32904 if (arity == 1)
32905 fntype = build_function_type_list (type_out, type_in, NULL);
32906 else
32907 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
32909 /* Build a function declaration for the vectorized function. */
32910 new_fndecl = build_decl (BUILTINS_LOCATION,
32911 FUNCTION_DECL, get_identifier (name), fntype);
32912 TREE_PUBLIC (new_fndecl) = 1;
32913 DECL_EXTERNAL (new_fndecl) = 1;
32914 DECL_IS_NOVOPS (new_fndecl) = 1;
32915 TREE_READONLY (new_fndecl) = 1;
32917 return new_fndecl;
32920 /* Handler for an ACML-style interface to
32921 a library with vectorized intrinsics. */
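/* Name mangling example: for BUILT_IN_SINF with 4 SFmode lanes the
template "__vr.._" becomes "__vrs4_" and bname+10 ("sinf") is appended,
giving "__vrs4_sinf"; the 2-lane DFmode BUILT_IN_SIN case gives
"__vrd2_sin".  */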
32923 static tree
32924 ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
32926 char name[20] = "__vr.._";
32927 tree fntype, new_fndecl, args;
32928 unsigned arity;
32929 const char *bname;
32930 enum machine_mode el_mode, in_mode;
32931 int n, in_n;
32933 /* The ACML is 64-bit only and suitable for unsafe math only, as
32934 it does not correctly support parts of IEEE with the required
32935 precision, such as denormals. */
32936 if (!TARGET_64BIT
32937 || !flag_unsafe_math_optimizations)
32938 return NULL_TREE;
32940 el_mode = TYPE_MODE (TREE_TYPE (type_out));
32941 n = TYPE_VECTOR_SUBPARTS (type_out);
32942 in_mode = TYPE_MODE (TREE_TYPE (type_in));
32943 in_n = TYPE_VECTOR_SUBPARTS (type_in);
32944 if (el_mode != in_mode
32945 || n != in_n)
32946 return NULL_TREE;
32948 switch (fn)
32950 case BUILT_IN_SIN:
32951 case BUILT_IN_COS:
32952 case BUILT_IN_EXP:
32953 case BUILT_IN_LOG:
32954 case BUILT_IN_LOG2:
32955 case BUILT_IN_LOG10:
32956 name[4] = 'd';
32957 name[5] = '2';
32958 if (el_mode != DFmode
32959 || n != 2)
32960 return NULL_TREE;
32961 break;
32963 case BUILT_IN_SINF:
32964 case BUILT_IN_COSF:
32965 case BUILT_IN_EXPF:
32966 case BUILT_IN_POWF:
32967 case BUILT_IN_LOGF:
32968 case BUILT_IN_LOG2F:
32969 case BUILT_IN_LOG10F:
32970 name[4] = 's';
32971 name[5] = '4';
32972 if (el_mode != SFmode
32973 || n != 4)
32974 return NULL_TREE;
32975 break;
32977 default:
32978 return NULL_TREE;
32981 bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
32982 sprintf (name + 7, "%s", bname+10);
32984 arity = 0;
32985 for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
32986 args;
32987 args = TREE_CHAIN (args))
32988 arity++;
32990 if (arity == 1)
32991 fntype = build_function_type_list (type_out, type_in, NULL);
32992 else
32993 fntype = build_function_type_list (type_out, type_in, type_in, NULL);
32995 /* Build a function declaration for the vectorized function. */
32996 new_fndecl = build_decl (BUILTINS_LOCATION,
32997 FUNCTION_DECL, get_identifier (name), fntype);
32998 TREE_PUBLIC (new_fndecl) = 1;
32999 DECL_EXTERNAL (new_fndecl) = 1;
33000 DECL_IS_NOVOPS (new_fndecl) = 1;
33001 TREE_READONLY (new_fndecl) = 1;
33003 return new_fndecl;
33006 /* Returns a decl of a function that implements gather load with
33007 memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
33008 Return NULL_TREE if it is not available. */
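/* For example, gathering a V8SF value through a 32-bit (SImode) index
type with scale 4 maps to IX86_BUILTIN_GATHERSIV8SF below, while a
DImode index for the same memory type selects
IX86_BUILTIN_GATHERALTDIV8SF.  Illustrative only; the full mapping is in
the switch below.  */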
33010 static tree
33011 ix86_vectorize_builtin_gather (const_tree mem_vectype,
33012 const_tree index_type, int scale)
33014 bool si;
33015 enum ix86_builtins code;
33017 if (! TARGET_AVX2)
33018 return NULL_TREE;
33020 if ((TREE_CODE (index_type) != INTEGER_TYPE
33021 && !POINTER_TYPE_P (index_type))
33022 || (TYPE_MODE (index_type) != SImode
33023 && TYPE_MODE (index_type) != DImode))
33024 return NULL_TREE;
33026 if (TYPE_PRECISION (index_type) > POINTER_SIZE)
33027 return NULL_TREE;
33029 /* v*gather* insn sign extends index to pointer mode. */
33030 if (TYPE_PRECISION (index_type) < POINTER_SIZE
33031 && TYPE_UNSIGNED (index_type))
33032 return NULL_TREE;
33034 if (scale <= 0
33035 || scale > 8
33036 || (scale & (scale - 1)) != 0)
33037 return NULL_TREE;
33039 si = TYPE_MODE (index_type) == SImode;
33040 switch (TYPE_MODE (mem_vectype))
33042 case V2DFmode:
33043 code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
33044 break;
33045 case V4DFmode:
33046 code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
33047 break;
33048 case V2DImode:
33049 code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
33050 break;
33051 case V4DImode:
33052 code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
33053 break;
33054 case V4SFmode:
33055 code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
33056 break;
33057 case V8SFmode:
33058 code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
33059 break;
33060 case V4SImode:
33061 code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
33062 break;
33063 case V8SImode:
33064 code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
33065 break;
33066 default:
33067 return NULL_TREE;
33070 return ix86_builtins[code];
33073 /* Returns a decl for a target-specific builtin that implements the
33074 reciprocal of the function, or NULL_TREE if not available. */
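/* For example, under unsafe/finite-math flags the vectorized
IX86_BUILTIN_SQRTPS_NR is replaced by IX86_BUILTIN_RSQRTPS_NR (the _NR
variants carry the Newton-Raphson refinement step), and the scalar
BUILT_IN_SQRTF maps to IX86_BUILTIN_RSQRTF.  */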
33076 static tree
33077 ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
33078 bool sqrt ATTRIBUTE_UNUSED)
33080 if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
33081 && flag_finite_math_only && !flag_trapping_math
33082 && flag_unsafe_math_optimizations))
33083 return NULL_TREE;
33085 if (md_fn)
33086 /* Machine dependent builtins. */
33087 switch (fn)
33089 /* Vectorized version of sqrt to rsqrt conversion. */
33090 case IX86_BUILTIN_SQRTPS_NR:
33091 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];
33093 case IX86_BUILTIN_SQRTPS_NR256:
33094 return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];
33096 default:
33097 return NULL_TREE;
33099 else
33100 /* Normal builtins. */
33101 switch (fn)
33103 /* Sqrt to rsqrt conversion. */
33104 case BUILT_IN_SQRTF:
33105 return ix86_builtins[IX86_BUILTIN_RSQRTF];
33107 default:
33108 return NULL_TREE;
33112 /* Helper for avx_vpermilps256_operand et al. This is also used by
33113 the expansion functions to turn the parallel back into a mask.
33114 The return value is 0 for no match and the imm8+1 for a match. */
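/* Worked example for V4SFmode: the selector (2 1 0 3) packs as
2 | 1<<2 | 0<<4 | 3<<6 = 0xc6, so the function returns 0xc7, i.e. the
imm8 plus one.  */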
33116 int
33117 avx_vpermilp_parallel (rtx par, enum machine_mode mode)
33119 unsigned i, nelt = GET_MODE_NUNITS (mode);
33120 unsigned mask = 0;
33121 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33123 if (XVECLEN (par, 0) != (int) nelt)
33124 return 0;
33126 /* Validate that all of the elements are constants, and not totally
33127 out of range. Copy the data into an integral array to make the
33128 subsequent checks easier. */
33129 for (i = 0; i < nelt; ++i)
33131 rtx er = XVECEXP (par, 0, i);
33132 unsigned HOST_WIDE_INT ei;
33134 if (!CONST_INT_P (er))
33135 return 0;
33136 ei = INTVAL (er);
33137 if (ei >= nelt)
33138 return 0;
33139 ipar[i] = ei;
33142 switch (mode)
33144 case V4DFmode:
33145 /* In the 256-bit DFmode case, we can only move elements within
33146 a 128-bit lane. */
33147 for (i = 0; i < 2; ++i)
33149 if (ipar[i] >= 2)
33150 return 0;
33151 mask |= ipar[i] << i;
33153 for (i = 2; i < 4; ++i)
33155 if (ipar[i] < 2)
33156 return 0;
33157 mask |= (ipar[i] - 2) << i;
33159 break;
33161 case V8SFmode:
33162 /* In the 256-bit SFmode case, we have full freedom of movement
33163 within the low 128-bit lane, but the high 128-bit lane must
33164 mirror the exact same pattern. */
33165 for (i = 0; i < 4; ++i)
33166 if (ipar[i] + 4 != ipar[i + 4])
33167 return 0;
33168 nelt = 4;
33169 /* FALLTHRU */
33171 case V2DFmode:
33172 case V4SFmode:
33173 /* In the 128-bit case, we've full freedom in the placement of
33174 the elements from the source operand. */
33175 for (i = 0; i < nelt; ++i)
33176 mask |= ipar[i] << (i * (nelt / 2));
33177 break;
33179 default:
33180 gcc_unreachable ();
33183 /* Make sure success has a non-zero value by adding one. */
33184 return mask + 1;
33187 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
33188 the expansion functions to turn the parallel back into a mask.
33189 The return value is 0 for no match and the imm8+1 for a match. */
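/* Worked example for V4DFmode: the selector (0 1 6 7) takes the low
128-bit lane of the first operand and the high lane of the second, so
the reconstructed imm8 is 0 | 3<<4 = 0x30 and the function returns
0x31.  */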
33191 int
33192 avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
33194 unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
33195 unsigned mask = 0;
33196 unsigned char ipar[8] = {}; /* Silence -Wuninitialized warning. */
33198 if (XVECLEN (par, 0) != (int) nelt)
33199 return 0;
33201 /* Validate that all of the elements are constants, and not totally
33202 out of range. Copy the data into an integral array to make the
33203 subsequent checks easier. */
33204 for (i = 0; i < nelt; ++i)
33206 rtx er = XVECEXP (par, 0, i);
33207 unsigned HOST_WIDE_INT ei;
33209 if (!CONST_INT_P (er))
33210 return 0;
33211 ei = INTVAL (er);
33212 if (ei >= 2 * nelt)
33213 return 0;
33214 ipar[i] = ei;
33217 /* Validate that each half of the permute selects consecutive elements. */
33218 for (i = 0; i < nelt2 - 1; ++i)
33219 if (ipar[i] + 1 != ipar[i + 1])
33220 return 0;
33221 for (i = nelt2; i < nelt - 1; ++i)
33222 if (ipar[i] + 1 != ipar[i + 1])
33223 return 0;
33225 /* Reconstruct the mask. */
33226 for (i = 0; i < 2; ++i)
33228 unsigned e = ipar[i * nelt2];
33229 if (e % nelt2)
33230 return 0;
33231 e /= nelt2;
33232 mask |= e << (i * 4);
33235 /* Make sure success has a non-zero value by adding one. */
33236 return mask + 1;
33239 /* Store OPERAND to the memory after reload is completed. This means
33240 that we can't easily use assign_stack_local. */
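/* Roughly: when the red zone is usable we store OPERAND into the red
zone at sp - RED_ZONE_SIZE and return that MEM; otherwise we push the
operand (one PRE_DEC push on 64-bit, two SImode pushes for DImode on
32-bit) and return a MEM at the new stack pointer.
ix86_free_from_memory below undoes the stack adjustment.  */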
33241 rtx
33242 ix86_force_to_memory (enum machine_mode mode, rtx operand)
33244 rtx result;
33246 gcc_assert (reload_completed);
33247 if (ix86_using_red_zone ())
33249 result = gen_rtx_MEM (mode,
33250 gen_rtx_PLUS (Pmode,
33251 stack_pointer_rtx,
33252 GEN_INT (-RED_ZONE_SIZE)));
33253 emit_move_insn (result, operand);
33255 else if (TARGET_64BIT)
33257 switch (mode)
33259 case HImode:
33260 case SImode:
33261 operand = gen_lowpart (DImode, operand);
33262 /* FALLTHRU */
33263 case DImode:
33264 emit_insn (
33265 gen_rtx_SET (VOIDmode,
33266 gen_rtx_MEM (DImode,
33267 gen_rtx_PRE_DEC (DImode,
33268 stack_pointer_rtx)),
33269 operand));
33270 break;
33271 default:
33272 gcc_unreachable ();
33274 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33276 else
33278 switch (mode)
33280 case DImode:
33282 rtx operands[2];
33283 split_double_mode (mode, &operand, 1, operands, operands + 1);
33284 emit_insn (
33285 gen_rtx_SET (VOIDmode,
33286 gen_rtx_MEM (SImode,
33287 gen_rtx_PRE_DEC (Pmode,
33288 stack_pointer_rtx)),
33289 operands[1]));
33290 emit_insn (
33291 gen_rtx_SET (VOIDmode,
33292 gen_rtx_MEM (SImode,
33293 gen_rtx_PRE_DEC (Pmode,
33294 stack_pointer_rtx)),
33295 operands[0]));
33297 break;
33298 case HImode:
33299 /* Store HImodes as SImodes. */
33300 operand = gen_lowpart (SImode, operand);
33301 /* FALLTHRU */
33302 case SImode:
33303 emit_insn (
33304 gen_rtx_SET (VOIDmode,
33305 gen_rtx_MEM (GET_MODE (operand),
33306 gen_rtx_PRE_DEC (SImode,
33307 stack_pointer_rtx)),
33308 operand));
33309 break;
33310 default:
33311 gcc_unreachable ();
33313 result = gen_rtx_MEM (mode, stack_pointer_rtx);
33315 return result;
33318 /* Free operand from the memory. */
33319 void
33320 ix86_free_from_memory (enum machine_mode mode)
33322 if (!ix86_using_red_zone ())
33324 int size;
33326 if (mode == DImode || TARGET_64BIT)
33327 size = 8;
33328 else
33329 size = 4;
33330 /* Use LEA to deallocate stack space. In peephole2 it will be converted
33331 to a pop or add instruction if registers are available. */
33332 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
33333 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
33334 GEN_INT (size))));
33338 /* Return a register priority for hard reg REGNO. */
33339 static int
33340 ix86_register_priority (int hard_regno)
33342 /* ebp and r13 as the base always want a displacement, and r12 as the
33343 base always wants an index. So discourage their usage in an
33344 address. */
33345 if (hard_regno == R12_REG || hard_regno == R13_REG)
33346 return 0;
33347 if (hard_regno == BP_REG)
33348 return 1;
33349 /* New x86-64 int registers result in bigger code size. Discourage
33350 them. */
33351 if (FIRST_REX_INT_REG <= hard_regno && hard_regno <= LAST_REX_INT_REG)
33352 return 2;
33353 /* New x86-64 SSE registers result in bigger code size. Discourage
33354 them. */
33355 if (FIRST_REX_SSE_REG <= hard_regno && hard_regno <= LAST_REX_SSE_REG)
33356 return 2;
33357 /* Usage of AX register results in smaller code. Prefer it. */
33358 if (hard_regno == 0)
33359 return 4;
33360 return 3;
33363 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
33365 Put float CONST_DOUBLE in the constant pool instead of fp regs.
33366 QImode must go into class Q_REGS.
33367 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
33368 movdf to do mem-to-mem moves through integer regs. */
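/* For example, loading a nonzero floating-point constant into an SSE
class returns NO_REGS, which forces the constant into the constant pool
and reloads it from memory, since SSE has no load-immediate
instruction.  */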
33370 static reg_class_t
33371 ix86_preferred_reload_class (rtx x, reg_class_t regclass)
33373 enum machine_mode mode = GET_MODE (x);
33375 /* We're only allowed to return a subclass of CLASS. Many of the
33376 following checks fail for NO_REGS, so eliminate that early. */
33377 if (regclass == NO_REGS)
33378 return NO_REGS;
33380 /* All classes can load zeros. */
33381 if (x == CONST0_RTX (mode))
33382 return regclass;
33384 /* Force constants into memory if we are loading a (nonzero) constant into
33385 an MMX or SSE register. This is because there are no MMX/SSE instructions
33386 to load from a constant. */
33387 if (CONSTANT_P (x)
33388 && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
33389 return NO_REGS;
33391 /* Prefer SSE regs only, if we can use them for math. */
33392 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
33393 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
33395 /* Floating-point constants need more complex checks. */
33396 if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
33398 /* General regs can load everything. */
33399 if (reg_class_subset_p (regclass, GENERAL_REGS))
33400 return regclass;
33402 /* Floats can load 0 and 1 plus some others. Note that we eliminated
33403 zero above. We only want to wind up preferring 80387 registers if
33404 we plan on doing computation with them. */
33405 if (TARGET_80387
33406 && standard_80387_constant_p (x) > 0)
33408 /* Limit class to non-sse. */
33409 if (regclass == FLOAT_SSE_REGS)
33410 return FLOAT_REGS;
33411 if (regclass == FP_TOP_SSE_REGS)
33412 return FP_TOP_REG;
33413 if (regclass == FP_SECOND_SSE_REGS)
33414 return FP_SECOND_REG;
33415 if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
33416 return regclass;
33419 return NO_REGS;
33422 /* Generally when we see PLUS here, it's the function invariant
33423 (plus soft-fp const_int), which can only be computed into general
33424 regs. */
33425 if (GET_CODE (x) == PLUS)
33426 return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
33428 /* QImode constants are easy to load, but non-constant QImode data
33429 must go into Q_REGS. */
33430 if (GET_MODE (x) == QImode && !CONSTANT_P (x))
33432 if (reg_class_subset_p (regclass, Q_REGS))
33433 return regclass;
33434 if (reg_class_subset_p (Q_REGS, regclass))
33435 return Q_REGS;
33436 return NO_REGS;
33439 return regclass;
33442 /* Discourage putting floating-point values in SSE registers unless
33443 SSE math is being used, and likewise for the 387 registers. */
33444 static reg_class_t
33445 ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
33447 enum machine_mode mode = GET_MODE (x);
33449 /* Restrict the output reload class to the register bank that we are doing
33450 math on. If we would like not to return a subset of CLASS, reject this
33451 alternative: if reload cannot do this, it will still use its choice. */
33452 mode = GET_MODE (x);
33453 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
33454 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
33456 if (X87_FLOAT_MODE_P (mode))
33458 if (regclass == FP_TOP_SSE_REGS)
33459 return FP_TOP_REG;
33460 else if (regclass == FP_SECOND_SSE_REGS)
33461 return FP_SECOND_REG;
33462 else
33463 return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
33466 return regclass;
33469 static reg_class_t
33470 ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
33471 enum machine_mode mode, secondary_reload_info *sri)
33473 /* Double-word spills from general registers to non-offsettable memory
33474 references (zero-extended addresses) require special handling. */
33475 if (TARGET_64BIT
33476 && MEM_P (x)
33477 && GET_MODE_SIZE (mode) > UNITS_PER_WORD
33478 && rclass == GENERAL_REGS
33479 && !offsettable_memref_p (x))
33481 sri->icode = (in_p
33482 ? CODE_FOR_reload_noff_load
33483 : CODE_FOR_reload_noff_store);
33484 /* Add the cost of moving address to a temporary. */
33485 sri->extra_cost = 1;
33487 return NO_REGS;
33490 /* QImode spills from non-QI registers require
33491 an intermediate register on 32-bit targets. */
33492 if (!TARGET_64BIT
33493 && !in_p && mode == QImode
33494 && (rclass == GENERAL_REGS
33495 || rclass == LEGACY_REGS
33496 || rclass == NON_Q_REGS
33497 || rclass == SIREG
33498 || rclass == DIREG
33499 || rclass == INDEX_REGS))
33501 int regno;
33503 if (REG_P (x))
33504 regno = REGNO (x);
33505 else
33506 regno = -1;
33508 if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
33509 regno = true_regnum (x);
33511 /* Return Q_REGS if the operand is in memory. */
33512 if (regno == -1)
33513 return Q_REGS;
33516 /* This condition handles the corner case where an expression involving
33517 pointers gets vectorized. We're trying to use the address of a
33518 stack slot as a vector initializer.
33520 (set (reg:V2DI 74 [ vect_cst_.2 ])
33521 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
33523 Eventually frame gets turned into sp+offset like this:
33525 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33526 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33527 (const_int 392 [0x188]))))
33529 That later gets turned into:
33531 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33532 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
33533 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
33535 We'll have the following reload recorded:
33537 Reload 0: reload_in (DI) =
33538 (plus:DI (reg/f:DI 7 sp)
33539 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
33540 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33541 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
33542 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
33543 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
33544 reload_reg_rtx: (reg:V2DI 22 xmm1)
33546 Which isn't going to work since SSE instructions can't handle scalar
33547 additions. Returning GENERAL_REGS forces the addition into integer
33548 register and reload can handle subsequent reloads without problems. */
33550 if (in_p && GET_CODE (x) == PLUS
33551 && SSE_CLASS_P (rclass)
33552 && SCALAR_INT_MODE_P (mode))
33553 return GENERAL_REGS;
33555 return NO_REGS;
33558 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
33560 static bool
33561 ix86_class_likely_spilled_p (reg_class_t rclass)
33563 switch (rclass)
33565 case AREG:
33566 case DREG:
33567 case CREG:
33568 case BREG:
33569 case AD_REGS:
33570 case SIREG:
33571 case DIREG:
33572 case SSE_FIRST_REG:
33573 case FP_TOP_REG:
33574 case FP_SECOND_REG:
33575 return true;
33577 default:
33578 break;
33581 return false;
33584 /* If we are copying between general and FP registers, we need a memory
33585 location. The same is true for SSE and MMX registers.
33587 To optimize register_move_cost performance, allow inline variant.
33589 The macro can't work reliably when one of the CLASSES is a class containing
33590 registers from multiple units (SSE, MMX, integer). We avoid this by never
33591 combining those units in a single alternative in the machine description.
33592 Ensure that this constraint holds to avoid unexpected surprises.
33594 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
33595 enforce these sanity checks. */
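/* For example, a DFmode move between SSE_REGS and GENERAL_REGS needs a
memory intermediate on a 32-bit target (the mode is wider than a word),
and on any target when inter-unit moves are disabled or SSE2 is not
available.  */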
33597 static inline bool
33598 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33599 enum machine_mode mode, int strict)
33601 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
33602 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
33603 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
33604 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
33605 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
33606 || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
33608 gcc_assert (!strict || lra_in_progress);
33609 return true;
33612 if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
33613 return true;
33615 /* ??? This is a lie. We do have moves between mmx/general, and for
33616 mmx/sse2. But by saying we need secondary memory we discourage the
33617 register allocator from using the mmx registers unless needed. */
33618 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
33619 return true;
33621 if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
33623 /* SSE1 doesn't have any direct moves from other classes. */
33624 if (!TARGET_SSE2)
33625 return true;
33627 /* If the target says that inter-unit moves are more expensive
33628 than moving through memory, then don't generate them. */
33629 if (!TARGET_INTER_UNIT_MOVES)
33630 return true;
33632 /* Between SSE and general, we have moves no larger than word size. */
33633 if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
33634 return true;
33637 return false;
33640 bool
33641 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
33642 enum machine_mode mode, int strict)
33644 return inline_secondary_memory_needed (class1, class2, mode, strict);
33647 /* Implement the TARGET_CLASS_MAX_NREGS hook.
33649 On the 80386, this is the size of MODE in words,
33650 except in the FP regs, where a single reg is always enough. */
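/* For example, XFmode in an integer class needs 3 registers on a
32-bit target but only 2 in 64-bit mode, while any scalar mode in an FP,
SSE or MMX class needs a single register (two for complex modes).  */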
33652 static unsigned char
33653 ix86_class_max_nregs (reg_class_t rclass, enum machine_mode mode)
33655 if (MAYBE_INTEGER_CLASS_P (rclass))
33657 if (mode == XFmode)
33658 return (TARGET_64BIT ? 2 : 3);
33659 else if (mode == XCmode)
33660 return (TARGET_64BIT ? 4 : 6);
33661 else
33662 return ((GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD);
33664 else
33666 if (COMPLEX_MODE_P (mode))
33667 return 2;
33668 else
33669 return 1;
33673 /* Return true if the registers in CLASS cannot represent the change from
33674 modes FROM to TO. */
33676 bool
33677 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
33678 enum reg_class regclass)
33680 if (from == to)
33681 return false;
33683 /* x87 registers can't do subreg at all, as all values are reformatted
33684 to extended precision. */
33685 if (MAYBE_FLOAT_CLASS_P (regclass))
33686 return true;
33688 if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
33690 /* Vector registers do not support QI or HImode loads. If we don't
33691 disallow a change to these modes, reload will assume it's ok to
33692 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
33693 the vec_dupv4hi pattern. */
33694 if (GET_MODE_SIZE (from) < 4)
33695 return true;
33697 /* Vector registers do not support subreg with nonzero offsets, which
33698 are otherwise valid for integer registers. Since we can't see
33699 whether we have a nonzero offset from here, prohibit all
33700 nonparadoxical subregs changing size. */
33701 if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
33702 return true;
33705 return false;
33708 /* Return the cost of moving data of mode M between a
33709 register and memory. A value of 2 is the default; this cost is
33710 relative to those in `REGISTER_MOVE_COST'.
33712 This function is used extensively by register_move_cost, which is used to
33713 build tables at startup. Make it inline in this case.
33714 When IN is 2, return maximum of in and out move cost.
33716 If moving between registers and memory is more expensive than
33717 between two registers, you should define this macro to express the
33718 relative cost.
33720 Also model the increased cost of moving QImode registers in non-
33721 Q_REGS classes.
33723 static inline int
33724 inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
33725 int in)
33727 int cost;
33728 if (FLOAT_CLASS_P (regclass))
33730 int index;
33731 switch (mode)
33733 case SFmode:
33734 index = 0;
33735 break;
33736 case DFmode:
33737 index = 1;
33738 break;
33739 case XFmode:
33740 index = 2;
33741 break;
33742 default:
33743 return 100;
33745 if (in == 2)
33746 return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
33747 return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
33749 if (SSE_CLASS_P (regclass))
33751 int index;
33752 switch (GET_MODE_SIZE (mode))
33754 case 4:
33755 index = 0;
33756 break;
33757 case 8:
33758 index = 1;
33759 break;
33760 case 16:
33761 index = 2;
33762 break;
33763 default:
33764 return 100;
33766 if (in == 2)
33767 return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
33768 return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
33770 if (MMX_CLASS_P (regclass))
33772 int index;
33773 switch (GET_MODE_SIZE (mode))
33775 case 4:
33776 index = 0;
33777 break;
33778 case 8:
33779 index = 1;
33780 break;
33781 default:
33782 return 100;
33784 if (in == 2)
33785 return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
33786 return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
33788 switch (GET_MODE_SIZE (mode))
33790 case 1:
33791 if (Q_CLASS_P (regclass) || TARGET_64BIT)
33793 if (!in)
33794 return ix86_cost->int_store[0];
33795 if (TARGET_PARTIAL_REG_DEPENDENCY
33796 && optimize_function_for_speed_p (cfun))
33797 cost = ix86_cost->movzbl_load;
33798 else
33799 cost = ix86_cost->int_load[0];
33800 if (in == 2)
33801 return MAX (cost, ix86_cost->int_store[0]);
33802 return cost;
33804 else
33806 if (in == 2)
33807 return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
33808 if (in)
33809 return ix86_cost->movzbl_load;
33810 else
33811 return ix86_cost->int_store[0] + 4;
33813 break;
33814 case 2:
33815 if (in == 2)
33816 return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
33817 return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
33818 default:
33819 /* Compute number of 32bit moves needed. TFmode is moved as XFmode. */
33820 if (mode == TFmode)
33821 mode = XFmode;
33822 if (in == 2)
33823 cost = MAX (ix86_cost->int_load[2] , ix86_cost->int_store[2]);
33824 else if (in)
33825 cost = ix86_cost->int_load[2];
33826 else
33827 cost = ix86_cost->int_store[2];
33828 return (cost * (((int) GET_MODE_SIZE (mode)
33829 + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
33833 static int
33834 ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
33835 bool in)
33837 return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
33841 /* Return the cost of moving data from a register in class CLASS1 to
33842 one in class CLASS2.
33844 It is not required that the cost always equal 2 when FROM is the same as TO;
33845 on some machines it is expensive to move between registers if they are not
33846 general registers. */
33848 static int
33849 ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
33850 reg_class_t class2_i)
33852 enum reg_class class1 = (enum reg_class) class1_i;
33853 enum reg_class class2 = (enum reg_class) class2_i;
33855 /* In case we require secondary memory, compute the cost of the store followed
33856 by the load. In order to avoid bad register allocation choices, we need
33857 this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
33859 if (inline_secondary_memory_needed (class1, class2, mode, 0))
33861 int cost = 1;
33863 cost += inline_memory_move_cost (mode, class1, 2);
33864 cost += inline_memory_move_cost (mode, class2, 2);
33866 /* In the case of copying from a general purpose register we may emit multiple
33867 stores followed by a single load, causing a memory size mismatch stall.
33868 Count this as an arbitrarily high cost of 20. */
33869 if (targetm.class_max_nregs (class1, mode)
33870 > targetm.class_max_nregs (class2, mode))
33871 cost += 20;
33873 /* In the case of FP/MMX moves, the registers actually overlap, and we
33874 have to switch modes in order to treat them differently. */
33875 if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
33876 || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
33877 cost += 20;
33879 return cost;
33882 /* Moves between SSE/MMX and integer unit are expensive. */
33883 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
33884 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
33886 /* ??? By keeping returned value relatively high, we limit the number
33887 of moves between integer and MMX/SSE registers for all targets.
33888 Additionally, a high value prevents problems with x86_modes_tieable_p(),
33889 where integer modes in MMX/SSE registers are not tieable
33890 because of missing QImode and HImode moves to, from or between
33891 MMX/SSE registers. */
33892 return MAX (8, ix86_cost->mmxsse_to_integer);
33894 if (MAYBE_FLOAT_CLASS_P (class1))
33895 return ix86_cost->fp_move;
33896 if (MAYBE_SSE_CLASS_P (class1))
33897 return ix86_cost->sse_move;
33898 if (MAYBE_MMX_CLASS_P (class1))
33899 return ix86_cost->mmx_move;
33900 return 2;
33903 /* Return TRUE if hard register REGNO can hold a value of machine-mode
33904 MODE. */
33906 bool
33907 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
33909 /* Flags and only flags can only hold CCmode values. */
33910 if (CC_REGNO_P (regno))
33911 return GET_MODE_CLASS (mode) == MODE_CC;
33912 if (GET_MODE_CLASS (mode) == MODE_CC
33913 || GET_MODE_CLASS (mode) == MODE_RANDOM
33914 || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
33915 return false;
33916 if (STACK_REGNO_P (regno))
33917 return VALID_FP_MODE_P (mode);
33918 if (SSE_REGNO_P (regno))
33920 /* We implement the move patterns for all vector modes into and
33921 out of SSE registers, even when no operation instructions
33922 are available. OImode and AVX modes are available only when
33923 AVX is enabled. */
33924 return ((TARGET_AVX
33925 && VALID_AVX256_REG_OR_OI_MODE (mode))
33926 || VALID_SSE_REG_MODE (mode)
33927 || VALID_SSE2_REG_MODE (mode)
33928 || VALID_MMX_REG_MODE (mode)
33929 || VALID_MMX_REG_MODE_3DNOW (mode));
33931 if (MMX_REGNO_P (regno))
33933 /* We implement the move patterns for 3DNOW modes even in MMX mode,
33934 so if the register is available at all, then we can move data of
33935 the given mode into or out of it. */
33936 return (VALID_MMX_REG_MODE (mode)
33937 || VALID_MMX_REG_MODE_3DNOW (mode));
33940 if (mode == QImode)
33942 /* Take care for QImode values - they can be in non-QI regs,
33943 but then they do cause partial register stalls. */
33944 if (TARGET_64BIT || QI_REGNO_P (regno))
33945 return true;
33946 if (!TARGET_PARTIAL_REG_STALL)
33947 return true;
33948 /* LRA checks if the hard register is OK for the given mode.
33949 QImode values can live in non-QI regs, so we allow all
33950 registers here. */
33951 if (lra_in_progress)
33952 return true;
33953 return !can_create_pseudo_p ();
33955 /* We handle both integer and floats in the general purpose registers. */
33956 else if (VALID_INT_MODE_P (mode))
33957 return true;
33958 else if (VALID_FP_MODE_P (mode))
33959 return true;
33960 else if (VALID_DFP_MODE_P (mode))
33961 return true;
33962 /* Lots of MMX code casts 8 byte vector modes to DImode. If we then go
33963 on to use that value in smaller contexts, this can easily force a
33964 pseudo to be allocated to GENERAL_REGS. Since this is no worse than
33965 supporting DImode, allow it. */
33966 else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
33967 return true;
33969 return false;
33972 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
33973 tieable integer mode. */
33975 static bool
33976 ix86_tieable_integer_mode_p (enum machine_mode mode)
33978 switch (mode)
33980 case HImode:
33981 case SImode:
33982 return true;
33984 case QImode:
33985 return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
33987 case DImode:
33988 return TARGET_64BIT;
33990 default:
33991 return false;
33995 /* Return true if MODE1 is accessible in a register that can hold MODE2
33996 without copying. That is, all register classes that can hold MODE2
33997 can also hold MODE1. */
33999 bool
34000 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
34002 if (mode1 == mode2)
34003 return true;
34005 if (ix86_tieable_integer_mode_p (mode1)
34006 && ix86_tieable_integer_mode_p (mode2))
34007 return true;
34009 /* MODE2 being XFmode implies fp stack or general regs, which means we
34010 can tie any smaller floating point modes to it. Note that we do not
34011 tie this with TFmode. */
34012 if (mode2 == XFmode)
34013 return mode1 == SFmode || mode1 == DFmode;
34015 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
34016 that we can tie it with SFmode. */
34017 if (mode2 == DFmode)
34018 return mode1 == SFmode;
34020 /* If MODE2 is only appropriate for an SSE register, then tie with
34021 any other mode acceptable to SSE registers. */
34022 if (GET_MODE_SIZE (mode2) == 32
34023 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
34024 return (GET_MODE_SIZE (mode1) == 32
34025 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
34026 if (GET_MODE_SIZE (mode2) == 16
34027 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
34028 return (GET_MODE_SIZE (mode1) == 16
34029 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
34031 /* If MODE2 is appropriate for an MMX register, then tie
34032 with any other mode acceptable to MMX registers. */
34033 if (GET_MODE_SIZE (mode2) == 8
34034 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
34035 return (GET_MODE_SIZE (mode1) == 8
34036 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
34038 return false;
34041 /* Return the cost of moving between two registers of mode MODE. */
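/* For instance, on a 32-bit target a DImode register copy is costed as
two word-sized pieces, COSTS_N_INSNS (2), whereas with AVX enabled a
V8SF copy moves as one 32-byte unit, COSTS_N_INSNS (1).  */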
34043 static int
34044 ix86_set_reg_reg_cost (enum machine_mode mode)
34046 unsigned int units = UNITS_PER_WORD;
34048 switch (GET_MODE_CLASS (mode))
34050 default:
34051 break;
34053 case MODE_CC:
34054 units = GET_MODE_SIZE (CCmode);
34055 break;
34057 case MODE_FLOAT:
34058 if ((TARGET_SSE && mode == TFmode)
34059 || (TARGET_80387 && mode == XFmode)
34060 || ((TARGET_80387 || TARGET_SSE2) && mode == DFmode)
34061 || ((TARGET_80387 || TARGET_SSE) && mode == SFmode))
34062 units = GET_MODE_SIZE (mode);
34063 break;
34065 case MODE_COMPLEX_FLOAT:
34066 if ((TARGET_SSE && mode == TCmode)
34067 || (TARGET_80387 && mode == XCmode)
34068 || ((TARGET_80387 || TARGET_SSE2) && mode == DCmode)
34069 || ((TARGET_80387 || TARGET_SSE) && mode == SCmode))
34070 units = GET_MODE_SIZE (mode);
34071 break;
34073 case MODE_VECTOR_INT:
34074 case MODE_VECTOR_FLOAT:
34075 if ((TARGET_AVX && VALID_AVX256_REG_MODE (mode))
34076 || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
34077 || (TARGET_SSE && VALID_SSE_REG_MODE (mode))
34078 || (TARGET_MMX && VALID_MMX_REG_MODE (mode)))
34079 units = GET_MODE_SIZE (mode);
34082 /* Return the cost of moving between two registers of mode MODE,
34083 assuming that the move will be in pieces of at most UNITS bytes. */
34084 return COSTS_N_INSNS ((GET_MODE_SIZE (mode) + units - 1) / units);
34087 /* Compute a (partial) cost for rtx X. Return true if the complete
34088 cost has been computed, and false if subexpressions should be
34089 scanned. In either case, *TOTAL contains the cost result. */
34091 static bool
34092 ix86_rtx_costs (rtx x, int code_i, int outer_code_i, int opno, int *total,
34093 bool speed)
34095 enum rtx_code code = (enum rtx_code) code_i;
34096 enum rtx_code outer_code = (enum rtx_code) outer_code_i;
34097 enum machine_mode mode = GET_MODE (x);
34098 const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;
34100 switch (code)
34102 case SET:
34103 if (register_operand (SET_DEST (x), VOIDmode)
34104 && reg_or_0_operand (SET_SRC (x), VOIDmode))
34106 *total = ix86_set_reg_reg_cost (GET_MODE (SET_DEST (x)));
34107 return true;
34109 return false;
34111 case CONST_INT:
34112 case CONST:
34113 case LABEL_REF:
34114 case SYMBOL_REF:
34115 if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
34116 *total = 3;
34117 else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
34118 *total = 2;
34119 else if (flag_pic && SYMBOLIC_CONST (x)
34120 && (!TARGET_64BIT
34121 || (GET_CODE (x) != LABEL_REF
34122 && (GET_CODE (x) != SYMBOL_REF
34123 || !SYMBOL_REF_LOCAL_P (x)))))
34124 *total = 1;
34125 else
34126 *total = 0;
34127 return true;
34129 case CONST_DOUBLE:
34130 if (mode == VOIDmode)
34132 *total = 0;
34133 return true;
34135 switch (standard_80387_constant_p (x))
34137 case 1: /* 0.0 */
34138 *total = 1;
34139 return true;
34140 default: /* Other constants */
34141 *total = 2;
34142 return true;
34143 case 0:
34144 case -1:
34145 break;
34147 if (SSE_FLOAT_MODE_P (mode))
34149 case CONST_VECTOR:
34150 switch (standard_sse_constant_p (x))
34152 case 0:
34153 break;
34154 case 1: /* 0: xor eliminates false dependency */
34155 *total = 0;
34156 return true;
34157 default: /* -1: cmp contains false dependency */
34158 *total = 1;
34159 return true;
34162 /* Fall back to (MEM (SYMBOL_REF)), since that's where
34163 it'll probably end up. Add a penalty for size. */
34164 *total = (COSTS_N_INSNS (1)
34165 + (flag_pic != 0 && !TARGET_64BIT)
34166 + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
34167 return true;
34169 case ZERO_EXTEND:
34170 /* The zero extension is often completely free on x86_64, so make
34171 it as cheap as possible. */
34172 if (TARGET_64BIT && mode == DImode
34173 && GET_MODE (XEXP (x, 0)) == SImode)
34174 *total = 1;
34175 else if (TARGET_ZERO_EXTEND_WITH_AND)
34176 *total = cost->add;
34177 else
34178 *total = cost->movzx;
34179 return false;
34181 case SIGN_EXTEND:
34182 *total = cost->movsx;
34183 return false;
34185 case ASHIFT:
34186 if (SCALAR_INT_MODE_P (mode)
34187 && GET_MODE_SIZE (mode) < UNITS_PER_WORD
34188 && CONST_INT_P (XEXP (x, 1)))
34190 HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34191 if (value == 1)
34193 *total = cost->add;
34194 return false;
34196 if ((value == 2 || value == 3)
34197 && cost->lea <= cost->shift_const)
34199 *total = cost->lea;
34200 return false;
34203 /* FALLTHRU */
34205 case ROTATE:
34206 case ASHIFTRT:
34207 case LSHIFTRT:
34208 case ROTATERT:
34209 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34211 /* ??? Should be SSE vector operation cost. */
34212 /* At least for published AMD latencies, this really is the same
34213 as the latency for a simple fpu operation like fabs. */
34214 /* V*QImode is emulated with 1-11 insns. */
34215 if (mode == V16QImode || mode == V32QImode)
34217 int count = 11;
34218 if (TARGET_XOP && mode == V16QImode)
34220 /* For XOP we use vpshab, which requires a broadcast of the
34221 value to the variable shift insn. For constants this
34222 means a V16Q const in mem; even when we can perform the
34223 shift with one insn set the cost to prefer paddb. */
34224 if (CONSTANT_P (XEXP (x, 1)))
34226 *total = (cost->fabs
34227 + rtx_cost (XEXP (x, 0), code, 0, speed)
34228 + (speed ? 2 : COSTS_N_BYTES (16)));
34229 return true;
34231 count = 3;
34233 else if (TARGET_SSSE3)
34234 count = 7;
34235 *total = cost->fabs * count;
34237 else
34238 *total = cost->fabs;
34240 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34242 if (CONST_INT_P (XEXP (x, 1)))
34244 if (INTVAL (XEXP (x, 1)) > 32)
34245 *total = cost->shift_const + COSTS_N_INSNS (2);
34246 else
34247 *total = cost->shift_const * 2;
34249 else
34251 if (GET_CODE (XEXP (x, 1)) == AND)
34252 *total = cost->shift_var * 2;
34253 else
34254 *total = cost->shift_var * 6 + COSTS_N_INSNS (2);
34257 else
34259 if (CONST_INT_P (XEXP (x, 1)))
34260 *total = cost->shift_const;
34261 else if (GET_CODE (XEXP (x, 1)) == SUBREG
34262 && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND)
34264 /* Return the cost after shift-and truncation. */
34265 *total = cost->shift_var;
34266 return true;
34268 else
34269 *total = cost->shift_var;
34271 return false;
34273 case FMA:
34275 rtx sub;
34277 gcc_assert (FLOAT_MODE_P (mode));
34278 gcc_assert (TARGET_FMA || TARGET_FMA4);
34280 /* ??? SSE scalar/vector cost should be used here. */
34281 /* ??? Bald assumption that fma has the same cost as fmul. */
34282 *total = cost->fmul;
34283 *total += rtx_cost (XEXP (x, 1), FMA, 1, speed);
34285 /* Negate in op0 or op2 is free: FMS, FNMA, FNMS. */
34286 sub = XEXP (x, 0);
34287 if (GET_CODE (sub) == NEG)
34288 sub = XEXP (sub, 0);
34289 *total += rtx_cost (sub, FMA, 0, speed);
34291 sub = XEXP (x, 2);
34292 if (GET_CODE (sub) == NEG)
34293 sub = XEXP (sub, 0);
34294 *total += rtx_cost (sub, FMA, 2, speed);
34295 return true;
34298 case MULT:
34299 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34301 /* ??? SSE scalar cost should be used here. */
34302 *total = cost->fmul;
34303 return false;
34305 else if (X87_FLOAT_MODE_P (mode))
34307 *total = cost->fmul;
34308 return false;
34310 else if (FLOAT_MODE_P (mode))
34312 /* ??? SSE vector cost should be used here. */
34313 *total = cost->fmul;
34314 return false;
34316 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34318 /* V*QImode is emulated with 7-13 insns. */
34319 if (mode == V16QImode || mode == V32QImode)
34321 int extra = 11;
34322 if (TARGET_XOP && mode == V16QImode)
34323 extra = 5;
34324 else if (TARGET_SSSE3)
34325 extra = 6;
34326 *total = cost->fmul * 2 + cost->fabs * extra;
34328 /* V*DImode is emulated with 5-8 insns. */
34329 else if (mode == V2DImode || mode == V4DImode)
34331 if (TARGET_XOP && mode == V2DImode)
34332 *total = cost->fmul * 2 + cost->fabs * 3;
34333 else
34334 *total = cost->fmul * 3 + cost->fabs * 5;
34336 /* Without sse4.1, we don't have PMULLD; it's emulated with 7
34337 insns, including two PMULUDQ. */
34338 else if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
34339 *total = cost->fmul * 2 + cost->fabs * 5;
34340 else
34341 *total = cost->fmul;
34342 return false;
34344 else
34346 rtx op0 = XEXP (x, 0);
34347 rtx op1 = XEXP (x, 1);
34348 int nbits;
34349 if (CONST_INT_P (XEXP (x, 1)))
34351 unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
34352 for (nbits = 0; value != 0; value &= value - 1)
34353 nbits++;
34355 else
34356 /* This is arbitrary. */
34357 nbits = 7;
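/* nbits is effectively a population count: each `value &= value - 1'
iteration clears the lowest set bit, so e.g. a multiplier of 0x90 loops
twice and adds two mult_bit increments on top of mult_init.  */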
34359 /* Compute costs correctly for widening multiplication. */
34360 if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
34361 && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
34362 == GET_MODE_SIZE (mode))
34364 int is_mulwiden = 0;
34365 enum machine_mode inner_mode = GET_MODE (op0);
34367 if (GET_CODE (op0) == GET_CODE (op1))
34368 is_mulwiden = 1, op1 = XEXP (op1, 0);
34369 else if (CONST_INT_P (op1))
34371 if (GET_CODE (op0) == SIGN_EXTEND)
34372 is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
34373 == INTVAL (op1);
34374 else
34375 is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
34378 if (is_mulwiden)
34379 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
34382 *total = (cost->mult_init[MODE_INDEX (mode)]
34383 + nbits * cost->mult_bit
34384 + rtx_cost (op0, outer_code, opno, speed)
34385 + rtx_cost (op1, outer_code, opno, speed));
34387 return true;
34390 case DIV:
34391 case UDIV:
34392 case MOD:
34393 case UMOD:
34394 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34395 /* ??? SSE cost should be used here. */
34396 *total = cost->fdiv;
34397 else if (X87_FLOAT_MODE_P (mode))
34398 *total = cost->fdiv;
34399 else if (FLOAT_MODE_P (mode))
34400 /* ??? SSE vector cost should be used here. */
34401 *total = cost->fdiv;
34402 else
34403 *total = cost->divide[MODE_INDEX (mode)];
34404 return false;
34406 case PLUS:
34407 if (GET_MODE_CLASS (mode) == MODE_INT
34408 && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
34410 if (GET_CODE (XEXP (x, 0)) == PLUS
34411 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
34412 && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
34413 && CONSTANT_P (XEXP (x, 1)))
34415 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
34416 if (val == 2 || val == 4 || val == 8)
34418 *total = cost->lea;
34419 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34420 outer_code, opno, speed);
34421 *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
34422 outer_code, opno, speed);
34423 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34424 return true;
34427 else if (GET_CODE (XEXP (x, 0)) == MULT
34428 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
34430 HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
34431 if (val == 2 || val == 4 || val == 8)
34433 *total = cost->lea;
34434 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34435 outer_code, opno, speed);
34436 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34437 return true;
34440 else if (GET_CODE (XEXP (x, 0)) == PLUS)
34442 *total = cost->lea;
34443 *total += rtx_cost (XEXP (XEXP (x, 0), 0),
34444 outer_code, opno, speed);
34445 *total += rtx_cost (XEXP (XEXP (x, 0), 1),
34446 outer_code, opno, speed);
34447 *total += rtx_cost (XEXP (x, 1), outer_code, opno, speed);
34448 return true;
34451 /* FALLTHRU */
34453 case MINUS:
34454 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34456 /* ??? SSE cost should be used here. */
34457 *total = cost->fadd;
34458 return false;
34460 else if (X87_FLOAT_MODE_P (mode))
34462 *total = cost->fadd;
34463 return false;
34465 else if (FLOAT_MODE_P (mode))
34467 /* ??? SSE vector cost should be used here. */
34468 *total = cost->fadd;
34469 return false;
34471 /* FALLTHRU */
34473 case AND:
34474 case IOR:
34475 case XOR:
34476 if (GET_MODE_CLASS (mode) == MODE_INT
34477 && GET_MODE_SIZE (mode) > UNITS_PER_WORD)
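/* Logical operations wider than a word are split into word-sized
   pieces: two adds, plus each operand's cost, doubled unless that
   operand is already DImode. */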
34479 *total = (cost->add * 2
34480 + (rtx_cost (XEXP (x, 0), outer_code, opno, speed)
34481 << (GET_MODE (XEXP (x, 0)) != DImode))
34482 + (rtx_cost (XEXP (x, 1), outer_code, opno, speed)
34483 << (GET_MODE (XEXP (x, 1)) != DImode)));
34484 return true;
34486 /* FALLTHRU */
34488 case NEG:
34489 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34491 /* ??? SSE cost should be used here. */
34492 *total = cost->fchs;
34493 return false;
34495 else if (X87_FLOAT_MODE_P (mode))
34497 *total = cost->fchs;
34498 return false;
34500 else if (FLOAT_MODE_P (mode))
34502 /* ??? SSE vector cost should be used here. */
34503 *total = cost->fchs;
34504 return false;
34506 /* FALLTHRU */
34508 case NOT:
34509 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
34511 /* ??? Should be SSE vector operation cost. */
34512 /* At least for published AMD latencies, this really is the same
34513 as the latency for a simple fpu operation like fabs. */
34514 *total = cost->fabs;
34516 else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
34517 *total = cost->add * 2;
34518 else
34519 *total = cost->add;
34520 return false;
34522 case COMPARE:
34523 if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
34524 && XEXP (XEXP (x, 0), 1) == const1_rtx
34525 && CONST_INT_P (XEXP (XEXP (x, 0), 2))
34526 && XEXP (x, 1) == const0_rtx)
34528 /* This kind of construct is implemented using test[bwl].
34529 Treat it as if we had an AND. */
34530 *total = (cost->add
34531 + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, opno, speed)
34532 + rtx_cost (const1_rtx, outer_code, opno, speed));
34533 return true;
34535 return false;
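/* On the x87 path a float extension is free (the loads widen anyway);
   for SSE math leave *total at its default. */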
34537 case FLOAT_EXTEND:
34538 if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
34539 *total = 0;
34540 return false;
34542 case ABS:
34543 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34544 /* ??? SSE cost should be used here. */
34545 *total = cost->fabs;
34546 else if (X87_FLOAT_MODE_P (mode))
34547 *total = cost->fabs;
34548 else if (FLOAT_MODE_P (mode))
34549 /* ??? SSE vector cost should be used here. */
34550 *total = cost->fabs;
34551 return false;
34553 case SQRT:
34554 if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
34555 /* ??? SSE cost should be used here. */
34556 *total = cost->fsqrt;
34557 else if (X87_FLOAT_MODE_P (mode))
34558 *total = cost->fsqrt;
34559 else if (FLOAT_MODE_P (mode))
34560 /* ??? SSE vector cost should be used here. */
34561 *total = cost->fsqrt;
34562 return false;
34564 case UNSPEC:
34565 if (XINT (x, 1) == UNSPEC_TP)
34566 *total = 0;
34567 return false;
34569 case VEC_SELECT:
34570 case VEC_CONCAT:
34571 case VEC_MERGE:
34572 case VEC_DUPLICATE:
34573 /* ??? Assume all of these vector manipulation patterns are
34574 recognizable, in which case they all pretty much have the
34575 same cost. */
34576 *total = cost->fabs;
34577 return true;
34579 default:
34580 return false;
34584 #if TARGET_MACHO
34586 static int current_machopic_label_num;
34588 /* Given a symbol name and its associated stub, write out the
34589 definition of the stub. */
34591 void
34592 machopic_output_stub (FILE *file, const char *symb, const char *stub)
34594 unsigned int length;
34595 char *binder_name, *symbol_name, lazy_ptr_name[32];
34596 int label = ++current_machopic_label_num;
34598 /* For 64-bit we shouldn't get here. */
34599 gcc_assert (!TARGET_64BIT);
34601 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
34602 symb = targetm.strip_name_encoding (symb);
34604 length = strlen (stub);
34605 binder_name = XALLOCAVEC (char, length + 32);
34606 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
34608 length = strlen (symb);
34609 symbol_name = XALLOCAVEC (char, length + 32);
34610 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
34612 sprintf (lazy_ptr_name, "L%d$lz", label);
34614 if (MACHOPIC_ATT_STUB)
34615 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
34616 else if (MACHOPIC_PURE)
34617 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
34618 else
34619 switch_to_section (darwin_sections[machopic_symbol_stub_section]);
34621 fprintf (file, "%s:\n", stub);
34622 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34624 if (MACHOPIC_ATT_STUB)
34626 fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
34628 else if (MACHOPIC_PURE)
34630 /* PIC stub. */
34631 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34632 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
34633 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
34634 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
34635 label, lazy_ptr_name, label);
34636 fprintf (file, "\tjmp\t*%%ecx\n");
34638 else
34639 fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
34641 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
34642 it needs no stub-binding-helper. */
34643 if (MACHOPIC_ATT_STUB)
34644 return;
34646 fprintf (file, "%s:\n", binder_name);
34648 if (MACHOPIC_PURE)
34650 fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
34651 fprintf (file, "\tpushl\t%%ecx\n");
34653 else
34654 fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
34656 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
34658 /* N.B. Keep the correspondence of these
34659 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
34660 old-pic/new-pic/non-pic stubs; altering this will break
34661 compatibility with existing dylibs. */
34662 if (MACHOPIC_PURE)
34664 /* 25-byte PIC stub using "CALL get_pc_thunk". */
34665 switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
34667 else
34668 /* 16-byte -mdynamic-no-pic stub. */
34669 switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
34671 fprintf (file, "%s:\n", lazy_ptr_name);
34672 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
34673 fprintf (file, ASM_LONG "%s\n", binder_name);
34675 #endif /* TARGET_MACHO */
34677 /* Order the registers for register allocator. */
34679 void
34680 x86_order_regs_for_local_alloc (void)
34682 int pos = 0;
34683 int i;
34685 /* First allocate the local general purpose registers. */
34686 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34687 if (GENERAL_REGNO_P (i) && call_used_regs[i])
34688 reg_alloc_order [pos++] = i;
34690 /* Global general purpose registers. */
34691 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
34692 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
34693 reg_alloc_order [pos++] = i;
34695 /* x87 registers come first in case we are doing FP math
34696 using them. */
34697 if (!TARGET_SSE_MATH)
34698 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34699 reg_alloc_order [pos++] = i;
34701 /* SSE registers. */
34702 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
34703 reg_alloc_order [pos++] = i;
34704 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
34705 reg_alloc_order [pos++] = i;
34707 /* x87 registers. */
34708 if (TARGET_SSE_MATH)
34709 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
34710 reg_alloc_order [pos++] = i;
34712 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
34713 reg_alloc_order [pos++] = i;
34715 /* Initialize the rest of the array, as we do not allocate some registers
34716 at all. */
34717 while (pos < FIRST_PSEUDO_REGISTER)
34718 reg_alloc_order [pos++] = 0;
34721 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
34722 in struct attribute_spec.handler. */
34723 static tree
34724 ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
34725 tree args,
34726 int flags ATTRIBUTE_UNUSED,
34727 bool *no_add_attrs)
34729 if (TREE_CODE (*node) != FUNCTION_TYPE
34730 && TREE_CODE (*node) != METHOD_TYPE
34731 && TREE_CODE (*node) != FIELD_DECL
34732 && TREE_CODE (*node) != TYPE_DECL)
34734 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34735 name);
34736 *no_add_attrs = true;
34737 return NULL_TREE;
34739 if (TARGET_64BIT)
34741 warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
34742 name);
34743 *no_add_attrs = true;
34744 return NULL_TREE;
34746 if (is_attribute_p ("callee_pop_aggregate_return", name))
34748 tree cst;
34750 cst = TREE_VALUE (args);
34751 if (TREE_CODE (cst) != INTEGER_CST)
34753 warning (OPT_Wattributes,
34754 "%qE attribute requires an integer constant argument",
34755 name);
34756 *no_add_attrs = true;
34758 else if (compare_tree_int (cst, 0) != 0
34759 && compare_tree_int (cst, 1) != 0)
34761 warning (OPT_Wattributes,
34762 "argument to %qE attribute is neither zero, nor one",
34763 name);
34764 *no_add_attrs = true;
34767 return NULL_TREE;
34770 return NULL_TREE;
34773 /* Handle a "ms_abi" or "sysv" attribute; arguments as in
34774 struct attribute_spec.handler. */
34775 static tree
34776 ix86_handle_abi_attribute (tree *node, tree name,
34777 tree args ATTRIBUTE_UNUSED,
34778 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34780 if (TREE_CODE (*node) != FUNCTION_TYPE
34781 && TREE_CODE (*node) != METHOD_TYPE
34782 && TREE_CODE (*node) != FIELD_DECL
34783 && TREE_CODE (*node) != TYPE_DECL)
34785 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34786 name);
34787 *no_add_attrs = true;
34788 return NULL_TREE;
34791 /* Can combine regparm with all attributes but fastcall. */
34792 if (is_attribute_p ("ms_abi", name))
34794 if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
34796 error ("ms_abi and sysv_abi attributes are not compatible");
34799 return NULL_TREE;
34801 else if (is_attribute_p ("sysv_abi", name))
34803 if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
34805 error ("ms_abi and sysv_abi attributes are not compatible");
34808 return NULL_TREE;
34811 return NULL_TREE;
34814 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
34815 struct attribute_spec.handler. */
34816 static tree
34817 ix86_handle_struct_attribute (tree *node, tree name,
34818 tree args ATTRIBUTE_UNUSED,
34819 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34821 tree *type = NULL;
34822 if (DECL_P (*node))
34824 if (TREE_CODE (*node) == TYPE_DECL)
34825 type = &TREE_TYPE (*node);
34827 else
34828 type = node;
34830 if (!(type && RECORD_OR_UNION_TYPE_P (*type)))
34832 warning (OPT_Wattributes, "%qE attribute ignored",
34833 name);
34834 *no_add_attrs = true;
34837 else if ((is_attribute_p ("ms_struct", name)
34838 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
34839 || ((is_attribute_p ("gcc_struct", name)
34840 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
34842 warning (OPT_Wattributes, "%qE incompatible attribute ignored",
34843 name);
34844 *no_add_attrs = true;
34847 return NULL_TREE;
34850 static tree
34851 ix86_handle_fndecl_attribute (tree *node, tree name,
34852 tree args ATTRIBUTE_UNUSED,
34853 int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
34855 if (TREE_CODE (*node) != FUNCTION_DECL)
34857 warning (OPT_Wattributes, "%qE attribute only applies to functions",
34858 name);
34859 *no_add_attrs = true;
34861 return NULL_TREE;
34864 static bool
34865 ix86_ms_bitfield_layout_p (const_tree record_type)
34867 return ((TARGET_MS_BITFIELD_LAYOUT
34868 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
34869 || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
34872 /* Returns an expression indicating where the this parameter is
34873 located on entry to the FUNCTION. */
34875 static rtx
34876 x86_this_parameter (tree function)
34878 tree type = TREE_TYPE (function);
34879 bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
34880 int nregs;
34882 if (TARGET_64BIT)
34884 const int *parm_regs;
34886 if (ix86_function_type_abi (type) == MS_ABI)
34887 parm_regs = x86_64_ms_abi_int_parameter_registers;
34888 else
34889 parm_regs = x86_64_int_parameter_registers;
34890 return gen_rtx_REG (Pmode, parm_regs[aggr]);
34893 nregs = ix86_function_regparm (type, function);
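/* 32-bit with register arguments: fastcall passes THIS in %ecx, or in
   %edx when a hidden aggregate-return pointer takes the first register;
   thiscall uses %ecx, spilling THIS to the stack for aggregate returns;
   plain regparm starts at %eax, moving to %edx for an aggregate return,
   or to the stack when only one register is available. */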
34895 if (nregs > 0 && !stdarg_p (type))
34897 int regno;
34898 unsigned int ccvt = ix86_get_callcvt (type);
34900 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
34901 regno = aggr ? DX_REG : CX_REG;
34902 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
34904 regno = CX_REG;
34905 if (aggr)
34906 return gen_rtx_MEM (SImode,
34907 plus_constant (Pmode, stack_pointer_rtx, 4));
34909 else
34911 regno = AX_REG;
34912 if (aggr)
34914 regno = DX_REG;
34915 if (nregs == 1)
34916 return gen_rtx_MEM (SImode,
34917 plus_constant (Pmode,
34918 stack_pointer_rtx, 4));
34921 return gen_rtx_REG (SImode, regno);
34924 return gen_rtx_MEM (SImode, plus_constant (Pmode, stack_pointer_rtx,
34925 aggr ? 8 : 4));
34928 /* Determine whether x86_output_mi_thunk can succeed. */
34930 static bool
34931 x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
34932 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
34933 HOST_WIDE_INT vcall_offset, const_tree function)
34935 /* 64-bit can handle anything. */
34936 if (TARGET_64BIT)
34937 return true;
34939 /* For 32-bit, everything's fine if we have one free register. */
34940 if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
34941 return true;
34943 /* Need a free register for vcall_offset. */
34944 if (vcall_offset)
34945 return false;
34947 /* Need a free register for GOT references. */
34948 if (flag_pic && !targetm.binds_local_p (function))
34949 return false;
34951 /* Otherwise ok. */
34952 return true;
34955 /* Output the assembler code for a thunk function. THUNK_DECL is the
34956 declaration for the thunk function itself, FUNCTION is the decl for
34957 the target function. DELTA is an immediate constant offset to be
34958 added to THIS. If VCALL_OFFSET is nonzero, the word at
34959 *(*this + vcall_offset) should be added to THIS. */
34961 static void
34962 x86_output_mi_thunk (FILE *file,
34963 tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
34964 HOST_WIDE_INT vcall_offset, tree function)
34966 rtx this_param = x86_this_parameter (function);
34967 rtx this_reg, tmp, fnaddr;
34968 unsigned int tmp_regno;
34970 if (TARGET_64BIT)
34971 tmp_regno = R10_REG;
34972 else
34974 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
34975 if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
34976 tmp_regno = AX_REG;
34977 else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
34978 tmp_regno = DX_REG;
34979 else
34980 tmp_regno = CX_REG;
34983 emit_note (NOTE_INSN_PROLOGUE_END);
34985 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
34986 pull it in now and let DELTA benefit. */
34987 if (REG_P (this_param))
34988 this_reg = this_param;
34989 else if (vcall_offset)
34991 /* Put the this parameter into %eax. */
34992 this_reg = gen_rtx_REG (Pmode, AX_REG);
34993 emit_move_insn (this_reg, this_param);
34995 else
34996 this_reg = NULL_RTX;
34998 /* Adjust the this parameter by a fixed constant. */
34999 if (delta)
35001 rtx delta_rtx = GEN_INT (delta);
35002 rtx delta_dst = this_reg ? this_reg : this_param;
35004 if (TARGET_64BIT)
35006 if (!x86_64_general_operand (delta_rtx, Pmode))
35008 tmp = gen_rtx_REG (Pmode, tmp_regno);
35009 emit_move_insn (tmp, delta_rtx);
35010 delta_rtx = tmp;
35014 ix86_emit_binop (PLUS, Pmode, delta_dst, delta_rtx);
35017 /* Adjust the this parameter by a value stored in the vtable. */
35018 if (vcall_offset)
35020 rtx vcall_addr, vcall_mem, this_mem;
35022 tmp = gen_rtx_REG (Pmode, tmp_regno);
35024 this_mem = gen_rtx_MEM (ptr_mode, this_reg);
35025 if (Pmode != ptr_mode)
35026 this_mem = gen_rtx_ZERO_EXTEND (Pmode, this_mem);
35027 emit_move_insn (tmp, this_mem);
35029 /* Adjust the this parameter. */
35030 vcall_addr = plus_constant (Pmode, tmp, vcall_offset);
35031 if (TARGET_64BIT
35032 && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
35034 rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
35035 emit_move_insn (tmp2, GEN_INT (vcall_offset));
35036 vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
35039 vcall_mem = gen_rtx_MEM (ptr_mode, vcall_addr);
35040 if (Pmode != ptr_mode)
35041 emit_insn (gen_addsi_1_zext (this_reg,
35042 gen_rtx_REG (ptr_mode,
35043 REGNO (this_reg)),
35044 vcall_mem));
35045 else
35046 ix86_emit_binop (PLUS, Pmode, this_reg, vcall_mem);
35049 /* If necessary, drop THIS back to its stack slot. */
35050 if (this_reg && this_reg != this_param)
35051 emit_move_insn (this_param, this_reg);
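/* Compute the address to jump to. Locally bound or non-PIC targets are
   reached directly. Otherwise go through the GOT: a GOTPCREL memory
   reference on 64-bit, the Mach-O indirect target on Darwin, or, on
   32-bit ELF, a GOT entry loaded relative to a GOT pointer materialized
   in %ecx. */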
35053 fnaddr = XEXP (DECL_RTL (function), 0);
35054 if (TARGET_64BIT)
35056 if (!flag_pic || targetm.binds_local_p (function)
35057 || cfun->machine->call_abi == MS_ABI)
35059 else
35061 tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
35062 tmp = gen_rtx_CONST (Pmode, tmp);
35063 fnaddr = gen_const_mem (Pmode, tmp);
35066 else
35068 if (!flag_pic || targetm.binds_local_p (function))
35070 #if TARGET_MACHO
35071 else if (TARGET_MACHO)
35073 fnaddr = machopic_indirect_call_target (DECL_RTL (function));
35074 fnaddr = XEXP (fnaddr, 0);
35076 #endif /* TARGET_MACHO */
35077 else
35079 tmp = gen_rtx_REG (Pmode, CX_REG);
35080 output_set_got (tmp, NULL_RTX);
35082 fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
35083 fnaddr = gen_rtx_CONST (Pmode, fnaddr);
35084 fnaddr = gen_rtx_PLUS (Pmode, tmp, fnaddr);
35085 fnaddr = gen_const_mem (Pmode, fnaddr);
35089 /* Our sibling call patterns do not allow memories, because we have no
35090 predicate that can distinguish between frame and non-frame memory.
35091 For our purposes here, we can get away with (ab)using a jump pattern,
35092 because we're going to do no optimization. */
35093 if (MEM_P (fnaddr))
35094 emit_jump_insn (gen_indirect_jump (fnaddr));
35095 else
35097 if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr))
35098 fnaddr = legitimize_pic_address (fnaddr,
35099 gen_rtx_REG (Pmode, tmp_regno));
35101 if (!sibcall_insn_operand (fnaddr, word_mode))
35103 tmp = gen_rtx_REG (word_mode, tmp_regno);
35104 if (GET_MODE (fnaddr) != word_mode)
35105 fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
35106 emit_move_insn (tmp, fnaddr);
35107 fnaddr = tmp;
35110 tmp = gen_rtx_MEM (QImode, fnaddr);
35111 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
35112 tmp = emit_call_insn (tmp);
35113 SIBLING_CALL_P (tmp) = 1;
35115 emit_barrier ();
35117 /* Emit just enough of rest_of_compilation to get the insns emitted.
35118 Note that use_thunk calls assemble_start_function et al. */
35119 tmp = get_insns ();
35120 shorten_branches (tmp);
35121 final_start_function (tmp, file, 1);
35122 final (tmp, file, 1);
35123 final_end_function ();
35126 static void
35127 x86_file_start (void)
35129 default_file_start ();
35130 #if TARGET_MACHO
35131 darwin_file_start ();
35132 #endif
35133 if (X86_FILE_START_VERSION_DIRECTIVE)
35134 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
35135 if (X86_FILE_START_FLTUSED)
35136 fputs ("\t.global\t__fltused\n", asm_out_file);
35137 if (ix86_asm_dialect == ASM_INTEL)
35138 fputs ("\t.intel_syntax noprefix\n", asm_out_file);
35142 x86_field_alignment (tree field, int computed)
35144 enum machine_mode mode;
35145 tree type = TREE_TYPE (field);
35147 if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
35148 return computed;
35149 mode = TYPE_MODE (strip_array_types (type));
35150 if (mode == DFmode || mode == DCmode
35151 || GET_MODE_CLASS (mode) == MODE_INT
35152 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
35153 return MIN (32, computed);
35154 return computed;
35157 /* Output assembler code to FILE to increment profiler label # LABELNO
35158 for profiling a function entry. */
35159 void
35160 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
35162 const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
35163 : MCOUNT_NAME);
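/* Three cases: 64-bit (counter address in %r11; call mcount through the
   GOT when generating PIC for the SysV ABI), 32-bit PIC (counter and
   call resolved via %ebx-relative GOT entries), and 32-bit non-PIC
   (absolute addresses). */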
35165 if (TARGET_64BIT)
35167 #ifndef NO_PROFILE_COUNTERS
35168 fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
35169 #endif
35171 if (DEFAULT_ABI == SYSV_ABI && flag_pic)
35172 fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
35173 else
35174 fprintf (file, "\tcall\t%s\n", mcount_name);
35176 else if (flag_pic)
35178 #ifndef NO_PROFILE_COUNTERS
35179 fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
35180 LPREFIX, labelno);
35181 #endif
35182 fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
35184 else
35186 #ifndef NO_PROFILE_COUNTERS
35187 fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
35188 LPREFIX, labelno);
35189 #endif
35190 fprintf (file, "\tcall\t%s\n", mcount_name);
35194 /* We don't have exact information about the insn sizes, but we may assume
35195 quite safely that we are informed about all 1-byte insns and memory
35196 address sizes. This is enough to eliminate unnecessary padding in
35197 99% of cases. */
35199 static int
35200 min_insn_size (rtx insn)
35202 int l = 0, len;
35204 if (!INSN_P (insn) || !active_insn_p (insn))
35205 return 0;
35207 /* Discard alignments we've emitted and jump table data. */
35208 if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
35209 && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
35210 return 0;
35211 if (JUMP_TABLE_DATA_P (insn))
35212 return 0;
35214 /* Important case - calls are always 5 bytes.
35215 It is common to have many calls in a row. */
35216 if (CALL_P (insn)
35217 && symbolic_reference_mentioned_p (PATTERN (insn))
35218 && !SIBLING_CALL_P (insn))
35219 return 5;
35220 len = get_attr_length (insn);
35221 if (len <= 1)
35222 return 1;
35224 /* For normal instructions we rely on get_attr_length being exact,
35225 with a few exceptions. */
35226 if (!JUMP_P (insn))
35228 enum attr_type type = get_attr_type (insn);
35230 switch (type)
35232 case TYPE_MULTI:
35233 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
35234 || asm_noperands (PATTERN (insn)) >= 0)
35235 return 0;
35236 break;
35237 case TYPE_OTHER:
35238 case TYPE_FCMP:
35239 break;
35240 default:
35241 /* Otherwise trust get_attr_length. */
35242 return len;
35245 l = get_attr_length_address (insn);
35246 if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
35247 l = 4;
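/* Lower bound: one opcode byte plus the address bytes; when no address
   bytes were counted, assume the insn is at least 2 bytes. */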
35249 if (l)
35250 return 1+l;
35251 else
35252 return 2;
35255 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35257 /* The AMD K8 core mispredicts jumps when there are more than 3 jumps in a
35258 16-byte window. */
35260 static void
35261 ix86_avoid_jump_mispredicts (void)
35263 rtx insn, start = get_insns ();
35264 int nbytes = 0, njumps = 0;
35265 int isjump = 0;
35267 /* Look for all minimal intervals of instructions containing 4 jumps.
35268 The intervals are bounded by START and INSN. NBYTES is the total
35269 size of instructions in the interval including INSN and not including
35270 START. When NBYTES is smaller than 16 bytes, it is possible
35271 that the end of START and INSN end up in the same 16-byte page.
35273 The smallest offset in the page at which INSN can start is the case where
35274 START ends at offset 0. The offset of INSN is then NBYTES - sizeof (INSN).
35275 We add a p2align to the 16-byte window with max skip 15 - NBYTES + sizeof (INSN).
35277 Don't consider an asm goto a jump: while it can contain a jump, it doesn't
35278 have to, since control transfer to its label(s) can be performed through
35279 other means; also, we estimate the minimum length of all asm stmts as 0. */
35280 for (insn = start; insn; insn = NEXT_INSN (insn))
35282 int min_size;
35284 if (LABEL_P (insn))
35286 int align = label_to_alignment (insn);
35287 int max_skip = label_to_max_skip (insn);
35289 if (max_skip > 15)
35290 max_skip = 15;
35291 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
35292 already in the current 16 byte page, because otherwise
35293 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
35294 bytes to reach 16 byte boundary. */
35295 if (align <= 0
35296 || (align <= 3 && max_skip != (1 << align) - 1))
35297 max_skip = 0;
35298 if (dump_file)
35299 fprintf (dump_file, "Label %i with max_skip %i\n",
35300 INSN_UID (insn), max_skip);
35301 if (max_skip)
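/* The label may emit up to MAX_SKIP padding bytes, which could push the
   window across a 16-byte boundary; advance START, dropping its
   instructions (and their jump counts) from the window, until the
   remaining bytes plus the possible skip fall below 16. */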
35303 while (nbytes + max_skip >= 16)
35305 start = NEXT_INSN (start);
35306 if ((JUMP_P (start)
35307 && asm_noperands (PATTERN (start)) < 0
35308 && GET_CODE (PATTERN (start)) != ADDR_VEC
35309 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
35310 || CALL_P (start))
35311 njumps--, isjump = 1;
35312 else
35313 isjump = 0;
35314 nbytes -= min_insn_size (start);
35317 continue;
35320 min_size = min_insn_size (insn);
35321 nbytes += min_size;
35322 if (dump_file)
35323 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
35324 INSN_UID (insn), min_size);
35325 if ((JUMP_P (insn)
35326 && asm_noperands (PATTERN (insn)) < 0
35327 && GET_CODE (PATTERN (insn)) != ADDR_VEC
35328 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
35329 || CALL_P (insn))
35330 njumps++;
35331 else
35332 continue;
35334 while (njumps > 3)
35336 start = NEXT_INSN (start);
35337 if ((JUMP_P (start)
35338 && asm_noperands (PATTERN (start)) < 0
35339 && GET_CODE (PATTERN (start)) != ADDR_VEC
35340 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
35341 || CALL_P (start))
35342 njumps--, isjump = 1;
35343 else
35344 isjump = 0;
35345 nbytes -= min_insn_size (start);
35347 gcc_assert (njumps >= 0);
35348 if (dump_file)
35349 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
35350 INSN_UID (start), INSN_UID (insn), nbytes);
35352 if (njumps == 3 && isjump && nbytes < 16)
35354 int padsize = 15 - nbytes + min_insn_size (insn);
35356 if (dump_file)
35357 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
35358 INSN_UID (insn), padsize);
35359 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
35363 #endif
35365 /* AMD Athlon works faster
35366 when a RET is not the destination of a conditional jump or directly preceded
35367 by another jump instruction. We avoid the penalty by inserting a NOP just
35368 before the RET instruction in such cases. */
35369 static void
35370 ix86_pad_returns (void)
35372 edge e;
35373 edge_iterator ei;
35375 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35377 basic_block bb = e->src;
35378 rtx ret = BB_END (bb);
35379 rtx prev;
35380 bool replace = false;
35382 if (!JUMP_P (ret) || !ANY_RETURN_P (PATTERN (ret))
35383 || optimize_bb_for_size_p (bb))
35384 continue;
35385 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
35386 if (active_insn_p (prev) || LABEL_P (prev))
35387 break;
35388 if (prev && LABEL_P (prev))
35390 edge e;
35391 edge_iterator ei;
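/* The return block starts with a label: if any real predecessor reaches
   it by a jump rather than by falling through, the RET is a branch
   target, so it is replaced with the longer return form. */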
35393 FOR_EACH_EDGE (e, ei, bb->preds)
35394 if (EDGE_FREQUENCY (e) && e->src->index >= 0
35395 && !(e->flags & EDGE_FALLTHRU))
35396 replace = true;
35398 if (!replace)
35400 prev = prev_active_insn (ret);
35401 if (prev
35402 && ((JUMP_P (prev) && any_condjump_p (prev))
35403 || CALL_P (prev)))
35404 replace = true;
35405 /* Empty functions get a branch mispredict even when
35406 the jump destination is not visible to us. */
35407 if (!prev && !optimize_function_for_size_p (cfun))
35408 replace = true;
35410 if (replace)
35412 emit_jump_insn_before (gen_simple_return_internal_long (), ret);
35413 delete_insn (ret);
35418 /* Count the minimum number of instructions in BB. Return 4 if the
35419 number of instructions >= 4. */
35421 static int
35422 ix86_count_insn_bb (basic_block bb)
35424 rtx insn;
35425 int insn_count = 0;
35427 /* Count number of instructions in this block. Return 4 if the number
35428 of instructions >= 4. */
35429 FOR_BB_INSNS (bb, insn)
35431 /* This only happens in exit blocks. */
35432 if (JUMP_P (insn)
35433 && ANY_RETURN_P (PATTERN (insn)))
35434 break;
35436 if (NONDEBUG_INSN_P (insn)
35437 && GET_CODE (PATTERN (insn)) != USE
35438 && GET_CODE (PATTERN (insn)) != CLOBBER)
35440 insn_count++;
35441 if (insn_count >= 4)
35442 return insn_count;
35446 return insn_count;
35450 /* Count the minimum number of instructions in code path in BB.
35451 Return 4 if the number of instructions >= 4. */
35453 static int
35454 ix86_count_insn (basic_block bb)
35456 edge e;
35457 edge_iterator ei;
35458 int min_prev_count;
35460 /* Only bother counting instructions along paths with no
35461 more than 2 basic blocks between entry and exit. Given
35462 that BB has an edge to exit, determine if a predecessor
35463 of BB has an edge from entry. If so, compute the number
35464 of instructions in the predecessor block. If there
35465 happen to be multiple such blocks, compute the minimum. */
35466 min_prev_count = 4;
35467 FOR_EACH_EDGE (e, ei, bb->preds)
35469 edge prev_e;
35470 edge_iterator prev_ei;
35472 if (e->src == ENTRY_BLOCK_PTR)
35474 min_prev_count = 0;
35475 break;
35477 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
35479 if (prev_e->src == ENTRY_BLOCK_PTR)
35481 int count = ix86_count_insn_bb (e->src);
35482 if (count < min_prev_count)
35483 min_prev_count = count;
35484 break;
35489 if (min_prev_count < 4)
35490 min_prev_count += ix86_count_insn_bb (bb);
35492 return min_prev_count;
35495 /* Pad short function to 4 instructions. */
35497 static void
35498 ix86_pad_short_function (void)
35500 edge e;
35501 edge_iterator ei;
35503 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35505 rtx ret = BB_END (e->src);
35506 if (JUMP_P (ret) && ANY_RETURN_P (PATTERN (ret)))
35508 int insn_count = ix86_count_insn (e->src);
35510 /* Pad short function. */
35511 if (insn_count < 4)
35513 rtx insn = ret;
35515 /* Find epilogue. */
35516 while (insn
35517 && (!NOTE_P (insn)
35518 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
35519 insn = PREV_INSN (insn);
35521 if (!insn)
35522 insn = ret;
35524 /* Two NOPs count as one instruction. */
35525 insn_count = 2 * (4 - insn_count);
35526 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
35532 /* Fix up a Windows system unwinder issue. If an EH region falls thru into
35533 the epilogue, the Windows system unwinder will apply epilogue logic and
35534 produce incorrect offsets. This can be avoided by adding a nop between
35535 the last insn that can throw and the first insn of the epilogue. */
35537 static void
35538 ix86_seh_fixup_eh_fallthru (void)
35540 edge e;
35541 edge_iterator ei;
35543 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
35545 rtx insn, next;
35547 /* Find the beginning of the epilogue. */
35548 for (insn = BB_END (e->src); insn != NULL; insn = PREV_INSN (insn))
35549 if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_EPILOGUE_BEG)
35550 break;
35551 if (insn == NULL)
35552 continue;
35554 /* We only care about preceding insns that can throw. */
35555 insn = prev_active_insn (insn);
35556 if (insn == NULL || !can_throw_internal (insn))
35557 continue;
35559 /* Do not separate calls from their debug information. */
35560 for (next = NEXT_INSN (insn); next != NULL; next = NEXT_INSN (next))
35561 if (NOTE_P (next)
35562 && (NOTE_KIND (next) == NOTE_INSN_VAR_LOCATION
35563 || NOTE_KIND (next) == NOTE_INSN_CALL_ARG_LOCATION))
35564 insn = next;
35565 else
35566 break;
35568 emit_insn_after (gen_nops (const1_rtx), insn);
35572 /* Implement machine specific optimizations. We implement padding of returns
35573 for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */
35574 static void
35575 ix86_reorg (void)
35577 /* We are freeing block_for_insn in the toplev to keep compatibility
35578 with old MDEP_REORGS that are not CFG based. Recompute it now. */
35579 compute_bb_for_insn ();
35581 if (TARGET_SEH && current_function_has_exception_handlers ())
35582 ix86_seh_fixup_eh_fallthru ();
35584 if (optimize && optimize_function_for_speed_p (cfun))
35586 if (TARGET_PAD_SHORT_FUNCTION)
35587 ix86_pad_short_function ();
35588 else if (TARGET_PAD_RETURNS)
35589 ix86_pad_returns ();
35590 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
35591 if (TARGET_FOUR_JUMP_LIMIT)
35592 ix86_avoid_jump_mispredicts ();
35593 #endif
35597 /* Return nonzero when QImode register that must be represented via REX prefix
35598 is used. */
35599 bool
35600 x86_extended_QIreg_mentioned_p (rtx insn)
35602 int i;
35603 extract_insn_cached (insn);
35604 for (i = 0; i < recog_data.n_operands; i++)
35605 if (GENERAL_REG_P (recog_data.operand[i])
35606 && !QI_REGNO_P (REGNO (recog_data.operand[i])))
35607 return true;
35608 return false;
35611 /* Return nonzero when P points to register encoded via REX prefix.
35612 Called via for_each_rtx. */
35613 static int
35614 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
35616 unsigned int regno;
35617 if (!REG_P (*p))
35618 return 0;
35619 regno = REGNO (*p);
35620 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
35623 /* Return true when INSN mentions register that must be encoded using REX
35624 prefix. */
35625 bool
35626 x86_extended_reg_mentioned_p (rtx insn)
35628 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
35629 extended_reg_mentioned_1, NULL);
35632 /* If profitable, negate (without causing overflow) integer constant
35633 of mode MODE at location LOC. Return true in this case. */
35634 bool
35635 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
35637 HOST_WIDE_INT val;
35639 if (!CONST_INT_P (*loc))
35640 return false;
35642 switch (mode)
35644 case DImode:
35645 /* DImode x86_64 constants must fit in 32 bits. */
35646 gcc_assert (x86_64_immediate_operand (*loc, mode));
35648 mode = SImode;
35649 break;
35651 case SImode:
35652 case HImode:
35653 case QImode:
35654 break;
35656 default:
35657 gcc_unreachable ();
35660 /* Avoid overflows. */
35661 if (mode_signbit_p (mode, *loc))
35662 return false;
35664 val = INTVAL (*loc);
35666 /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
35667 Exceptions: -128 encodes smaller than 128, so swap sign and op. */
35668 if ((val < 0 && val != -128)
35669 || val == 128)
35671 *loc = GEN_INT (-val);
35672 return true;
35675 return false;
35678 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
35679 optabs would emit if we didn't have TFmode patterns. */
35681 void
35682 x86_emit_floatuns (rtx operands[2])
35684 rtx neglab, donelab, i0, i1, f0, in, out;
35685 enum machine_mode mode, inmode;
35687 inmode = GET_MODE (operands[1]);
35688 gcc_assert (inmode == SImode || inmode == DImode);
35690 out = operands[0];
35691 in = force_reg (inmode, operands[1]);
35692 mode = GET_MODE (out);
35693 neglab = gen_label_rtx ();
35694 donelab = gen_label_rtx ();
35695 f0 = gen_reg_rtx (mode);
35697 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
35699 expand_float (out, in, 0);
35701 emit_jump_insn (gen_jump (donelab));
35702 emit_barrier ();
35704 emit_label (neglab);
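/* The input has its sign bit set, so a signed conversion would be wrong.
   Shift the value right by one, OR the discarded low bit back in so the
   final rounding still accounts for it, convert, and double the result. */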
35706 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
35707 1, OPTAB_DIRECT);
35708 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
35709 1, OPTAB_DIRECT);
35710 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
35712 expand_float (f0, i0, 0);
35714 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
35716 emit_label (donelab);
35719 /* AVX2 supports 32-byte integer vector operations,
35720 so the longest vector we are faced with is V32QImode. */
35721 #define MAX_VECT_LEN 32
35723 struct expand_vec_perm_d
35725 rtx target, op0, op1;
35726 unsigned char perm[MAX_VECT_LEN];
35727 enum machine_mode vmode;
35728 unsigned char nelt;
35729 bool one_operand_p;
35730 bool testing_p;
35733 static bool canonicalize_perm (struct expand_vec_perm_d *d);
35734 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
35735 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
35737 /* Get a vector mode of the same size as the original but with elements
35738 twice as wide. This is only guaranteed to apply to integral vectors. */
35740 static inline enum machine_mode
35741 get_mode_wider_vector (enum machine_mode o)
35743 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
35744 enum machine_mode n = GET_MODE_WIDER_MODE (o);
35745 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
35746 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
35747 return n;
35750 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35751 with all elements equal to VAR. Return true if successful. */
35753 static bool
35754 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
35755 rtx target, rtx val)
35757 bool ok;
35759 switch (mode)
35761 case V2SImode:
35762 case V2SFmode:
35763 if (!mmx_ok)
35764 return false;
35765 /* FALLTHRU */
35767 case V4DFmode:
35768 case V4DImode:
35769 case V8SFmode:
35770 case V8SImode:
35771 case V2DFmode:
35772 case V2DImode:
35773 case V4SFmode:
35774 case V4SImode:
35776 rtx insn, dup;
35778 /* First attempt to recognize VAL as-is. */
35779 dup = gen_rtx_VEC_DUPLICATE (mode, val);
35780 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
35781 if (recog_memoized (insn) < 0)
35783 rtx seq;
35784 /* If that fails, force VAL into a register. */
35786 start_sequence ();
35787 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
35788 seq = get_insns ();
35789 end_sequence ();
35790 if (seq)
35791 emit_insn_before (seq, insn);
35793 ok = recog_memoized (insn) >= 0;
35794 gcc_assert (ok);
35797 return true;
35799 case V4HImode:
35800 if (!mmx_ok)
35801 return false;
35802 if (TARGET_SSE || TARGET_3DNOW_A)
35804 rtx x;
35806 val = gen_lowpart (SImode, val);
35807 x = gen_rtx_TRUNCATE (HImode, val);
35808 x = gen_rtx_VEC_DUPLICATE (mode, x);
35809 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35810 return true;
35812 goto widen;
35814 case V8QImode:
35815 if (!mmx_ok)
35816 return false;
35817 goto widen;
35819 case V8HImode:
35820 if (TARGET_SSE2)
35822 struct expand_vec_perm_d dperm;
35823 rtx tmp1, tmp2;
35825 permute:
35826 memset (&dperm, 0, sizeof (dperm));
35827 dperm.target = target;
35828 dperm.vmode = mode;
35829 dperm.nelt = GET_MODE_NUNITS (mode);
35830 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
35831 dperm.one_operand_p = true;
35833 /* Extend to SImode using a paradoxical SUBREG. */
35834 tmp1 = gen_reg_rtx (SImode);
35835 emit_move_insn (tmp1, gen_lowpart (SImode, val));
35837 /* Insert the SImode value as low element of a V4SImode vector. */
35838 tmp2 = gen_lowpart (V4SImode, dperm.op0);
35839 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
35841 ok = (expand_vec_perm_1 (&dperm)
35842 || expand_vec_perm_broadcast_1 (&dperm));
35843 gcc_assert (ok);
35844 return ok;
35846 goto widen;
35848 case V16QImode:
35849 if (TARGET_SSE2)
35850 goto permute;
35851 goto widen;
35853 widen:
35854 /* Replicate the value once into the next wider mode and recurse. */
35856 enum machine_mode smode, wsmode, wvmode;
35857 rtx x;
35859 smode = GET_MODE_INNER (mode);
35860 wvmode = get_mode_wider_vector (mode);
35861 wsmode = GET_MODE_INNER (wvmode);
35863 val = convert_modes (wsmode, smode, val, true);
35864 x = expand_simple_binop (wsmode, ASHIFT, val,
35865 GEN_INT (GET_MODE_BITSIZE (smode)),
35866 NULL_RTX, 1, OPTAB_LIB_WIDEN);
35867 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
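/* VAL now holds two copies of the element packed into one wider scalar;
   broadcast that value in the wider vector mode. */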
35869 x = gen_lowpart (wvmode, target);
35870 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
35871 gcc_assert (ok);
35872 return ok;
35875 case V16HImode:
35876 case V32QImode:
35878 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
35879 rtx x = gen_reg_rtx (hvmode);
35881 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
35882 gcc_assert (ok);
35884 x = gen_rtx_VEC_CONCAT (mode, x, x);
35885 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35887 return true;
35889 default:
35890 return false;
35894 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
35895 whose ONE_VAR element is VAR, and other elements are zero. Return true
35896 if successful. */
35898 static bool
35899 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
35900 rtx target, rtx var, int one_var)
35902 enum machine_mode vsimode;
35903 rtx new_target;
35904 rtx x, tmp;
35905 bool use_vector_set = false;
35907 switch (mode)
35909 case V2DImode:
35910 /* For SSE4.1, we normally use vector set. But if the second
35911 element is zero and inter-unit moves are OK, we use movq
35912 instead. */
35913 use_vector_set = (TARGET_64BIT
35914 && TARGET_SSE4_1
35915 && !(TARGET_INTER_UNIT_MOVES
35916 && one_var == 0));
35917 break;
35918 case V16QImode:
35919 case V4SImode:
35920 case V4SFmode:
35921 use_vector_set = TARGET_SSE4_1;
35922 break;
35923 case V8HImode:
35924 use_vector_set = TARGET_SSE2;
35925 break;
35926 case V4HImode:
35927 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
35928 break;
35929 case V32QImode:
35930 case V16HImode:
35931 case V8SImode:
35932 case V8SFmode:
35933 case V4DFmode:
35934 use_vector_set = TARGET_AVX;
35935 break;
35936 case V4DImode:
35937 /* Use ix86_expand_vector_set in 64bit mode only. */
35938 use_vector_set = TARGET_AVX && TARGET_64BIT;
35939 break;
35940 default:
35941 break;
35944 if (use_vector_set)
35946 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
35947 var = force_reg (GET_MODE_INNER (mode), var);
35948 ix86_expand_vector_set (mmx_ok, target, var, one_var);
35949 return true;
35952 switch (mode)
35954 case V2SFmode:
35955 case V2SImode:
35956 if (!mmx_ok)
35957 return false;
35958 /* FALLTHRU */
35960 case V2DFmode:
35961 case V2DImode:
35962 if (one_var != 0)
35963 return false;
35964 var = force_reg (GET_MODE_INNER (mode), var);
35965 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
35966 emit_insn (gen_rtx_SET (VOIDmode, target, x));
35967 return true;
35969 case V4SFmode:
35970 case V4SImode:
35971 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
35972 new_target = gen_reg_rtx (mode);
35973 else
35974 new_target = target;
35975 var = force_reg (GET_MODE_INNER (mode), var);
35976 x = gen_rtx_VEC_DUPLICATE (mode, var);
35977 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
35978 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
35979 if (one_var != 0)
35981 /* We need to shuffle the value to the correct position, so
35982 create a new pseudo to store the intermediate result. */
35984 /* With SSE2, we can use the integer shuffle insns. */
35985 if (mode != V4SFmode && TARGET_SSE2)
35987 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
35988 const1_rtx,
35989 GEN_INT (one_var == 1 ? 0 : 1),
35990 GEN_INT (one_var == 2 ? 0 : 1),
35991 GEN_INT (one_var == 3 ? 0 : 1)));
35992 if (target != new_target)
35993 emit_move_insn (target, new_target);
35994 return true;
35997 /* Otherwise convert the intermediate result to V4SFmode and
35998 use the SSE1 shuffle instructions. */
35999 if (mode != V4SFmode)
36001 tmp = gen_reg_rtx (V4SFmode);
36002 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
36004 else
36005 tmp = new_target;
36007 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
36008 const1_rtx,
36009 GEN_INT (one_var == 1 ? 0 : 1),
36010 GEN_INT (one_var == 2 ? 0+4 : 1+4),
36011 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
36013 if (mode != V4SFmode)
36014 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
36015 else if (tmp != target)
36016 emit_move_insn (target, tmp);
36018 else if (target != new_target)
36019 emit_move_insn (target, new_target);
36020 return true;
36022 case V8HImode:
36023 case V16QImode:
36024 vsimode = V4SImode;
36025 goto widen;
36026 case V4HImode:
36027 case V8QImode:
36028 if (!mmx_ok)
36029 return false;
36030 vsimode = V2SImode;
36031 goto widen;
36032 widen:
36033 if (one_var != 0)
36034 return false;
36036 /* Zero extend the variable element to SImode and recurse. */
36037 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
36039 x = gen_reg_rtx (vsimode);
36040 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
36041 var, one_var))
36042 gcc_unreachable ();
36044 emit_move_insn (target, gen_lowpart (mode, x));
36045 return true;
36047 default:
36048 return false;
36052 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
36053 consisting of the values in VALS. It is known that all elements
36054 except ONE_VAR are constants. Return true if successful. */
36056 static bool
36057 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
36058 rtx target, rtx vals, int one_var)
36060 rtx var = XVECEXP (vals, 0, one_var);
36061 enum machine_mode wmode;
36062 rtx const_vec, x;
36064 const_vec = copy_rtx (vals);
36065 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
36066 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
36068 switch (mode)
36070 case V2DFmode:
36071 case V2DImode:
36072 case V2SFmode:
36073 case V2SImode:
36074 /* For the two element vectors, it's just as easy to use
36075 the general case. */
36076 return false;
36078 case V4DImode:
36079 /* Use ix86_expand_vector_set in 64bit mode only. */
36080 if (!TARGET_64BIT)
36081 return false;
36082 case V4DFmode:
36083 case V8SFmode:
36084 case V8SImode:
36085 case V16HImode:
36086 case V32QImode:
36087 case V4SFmode:
36088 case V4SImode:
36089 case V8HImode:
36090 case V4HImode:
36091 break;
36093 case V16QImode:
36094 if (TARGET_SSE4_1)
36095 break;
36096 wmode = V8HImode;
36097 goto widen;
36098 case V8QImode:
36099 wmode = V4HImode;
36100 goto widen;
36101 widen:
36102 /* There's no way to set one QImode entry easily. Combine
36103 the variable value with its adjacent constant value, and
36104 promote to an HImode set. */
36105 x = XVECEXP (vals, 0, one_var ^ 1);
36106 if (one_var & 1)
36108 var = convert_modes (HImode, QImode, var, true);
36109 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
36110 NULL_RTX, 1, OPTAB_LIB_WIDEN);
36111 x = GEN_INT (INTVAL (x) & 0xff);
36113 else
36115 var = convert_modes (HImode, QImode, var, true);
36116 x = gen_int_mode (INTVAL (x) << 8, HImode);
36118 if (x != const0_rtx)
36119 var = expand_simple_binop (HImode, IOR, var, x, var,
36120 1, OPTAB_LIB_WIDEN);
36122 x = gen_reg_rtx (wmode);
36123 emit_move_insn (x, gen_lowpart (wmode, const_vec));
36124 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
36126 emit_move_insn (target, gen_lowpart (mode, x));
36127 return true;
36129 default:
36130 return false;
36133 emit_move_insn (target, const_vec);
36134 ix86_expand_vector_set (mmx_ok, target, var, one_var);
36135 return true;
36138 /* A subroutine of ix86_expand_vector_init_general. Use vector
36139 concatenate to handle the most general case: all values variable,
36140 and none identical. */
36142 static void
36143 ix86_expand_vector_init_concat (enum machine_mode mode,
36144 rtx target, rtx *ops, int n)
36146 enum machine_mode cmode, hmode = VOIDmode;
36147 rtx first[8], second[4];
36148 rtvec v;
36149 int i, j;
36151 switch (n)
36153 case 2:
36154 switch (mode)
36156 case V8SImode:
36157 cmode = V4SImode;
36158 break;
36159 case V8SFmode:
36160 cmode = V4SFmode;
36161 break;
36162 case V4DImode:
36163 cmode = V2DImode;
36164 break;
36165 case V4DFmode:
36166 cmode = V2DFmode;
36167 break;
36168 case V4SImode:
36169 cmode = V2SImode;
36170 break;
36171 case V4SFmode:
36172 cmode = V2SFmode;
36173 break;
36174 case V2DImode:
36175 cmode = DImode;
36176 break;
36177 case V2SImode:
36178 cmode = SImode;
36179 break;
36180 case V2DFmode:
36181 cmode = DFmode;
36182 break;
36183 case V2SFmode:
36184 cmode = SFmode;
36185 break;
36186 default:
36187 gcc_unreachable ();
36190 if (!register_operand (ops[1], cmode))
36191 ops[1] = force_reg (cmode, ops[1]);
36192 if (!register_operand (ops[0], cmode))
36193 ops[0] = force_reg (cmode, ops[0]);
36194 emit_insn (gen_rtx_SET (VOIDmode, target,
36195 gen_rtx_VEC_CONCAT (mode, ops[0],
36196 ops[1])));
36197 break;
36199 case 4:
36200 switch (mode)
36202 case V4DImode:
36203 cmode = V2DImode;
36204 break;
36205 case V4DFmode:
36206 cmode = V2DFmode;
36207 break;
36208 case V4SImode:
36209 cmode = V2SImode;
36210 break;
36211 case V4SFmode:
36212 cmode = V2SFmode;
36213 break;
36214 default:
36215 gcc_unreachable ();
36217 goto half;
36219 case 8:
36220 switch (mode)
36222 case V8SImode:
36223 cmode = V2SImode;
36224 hmode = V4SImode;
36225 break;
36226 case V8SFmode:
36227 cmode = V2SFmode;
36228 hmode = V4SFmode;
36229 break;
36230 default:
36231 gcc_unreachable ();
36233 goto half;
36235 half:
36236 /* FIXME: We process inputs backward to help RA. PR 36222. */
36237 i = n - 1;
36238 j = (n >> 1) - 1;
36239 for (; i > 0; i -= 2, j--)
36241 first[j] = gen_reg_rtx (cmode);
36242 v = gen_rtvec (2, ops[i - 1], ops[i]);
36243 ix86_expand_vector_init (false, first[j],
36244 gen_rtx_PARALLEL (cmode, v));
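/* first[] now holds n/2 vectors of mode CMODE, each built from a
   consecutive pair of the inputs; these are concatenated pairwise
   (through HMODE when eight inputs were given) until a single MODE
   vector remains. */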
36247 n >>= 1;
36248 if (n > 2)
36250 gcc_assert (hmode != VOIDmode);
36251 for (i = j = 0; i < n; i += 2, j++)
36253 second[j] = gen_reg_rtx (hmode);
36254 ix86_expand_vector_init_concat (hmode, second [j],
36255 &first [i], 2);
36257 n >>= 1;
36258 ix86_expand_vector_init_concat (mode, target, second, n);
36260 else
36261 ix86_expand_vector_init_concat (mode, target, first, n);
36262 break;
36264 default:
36265 gcc_unreachable ();
36269 /* A subroutine of ix86_expand_vector_init_general. Use vector
36270 interleave to handle the most general case: all values variable,
36271 and none identical. */
36273 static void
36274 ix86_expand_vector_init_interleave (enum machine_mode mode,
36275 rtx target, rtx *ops, int n)
36277 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
36278 int i, j;
36279 rtx op0, op1;
36280 rtx (*gen_load_even) (rtx, rtx, rtx);
36281 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
36282 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
36284 switch (mode)
36286 case V8HImode:
36287 gen_load_even = gen_vec_setv8hi;
36288 gen_interleave_first_low = gen_vec_interleave_lowv4si;
36289 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36290 inner_mode = HImode;
36291 first_imode = V4SImode;
36292 second_imode = V2DImode;
36293 third_imode = VOIDmode;
36294 break;
36295 case V16QImode:
36296 gen_load_even = gen_vec_setv16qi;
36297 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
36298 gen_interleave_second_low = gen_vec_interleave_lowv4si;
36299 inner_mode = QImode;
36300 first_imode = V8HImode;
36301 second_imode = V4SImode;
36302 third_imode = V2DImode;
36303 break;
36304 default:
36305 gcc_unreachable ();
36308 for (i = 0; i < n; i++)
36310 /* Extend the odd element to SImode using a paradoxical SUBREG. */
36311 op0 = gen_reg_rtx (SImode);
36312 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
36314 /* Insert the SImode value as low element of V4SImode vector. */
36315 op1 = gen_reg_rtx (V4SImode);
36316 op0 = gen_rtx_VEC_MERGE (V4SImode,
36317 gen_rtx_VEC_DUPLICATE (V4SImode,
36318 op0),
36319 CONST0_RTX (V4SImode),
36320 const1_rtx);
36321 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
36323 /* Cast the V4SImode vector back to a vector in the original mode. */
36324 op0 = gen_reg_rtx (mode);
36325 emit_move_insn (op0, gen_lowpart (mode, op1));
36327 /* Load even elements into the second position. */
36328 emit_insn (gen_load_even (op0,
36329 force_reg (inner_mode,
36330 ops [i + i + 1]),
36331 const1_rtx));
36333 /* Cast vector to FIRST_IMODE vector. */
36334 ops[i] = gen_reg_rtx (first_imode);
36335 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
36338 /* Interleave low FIRST_IMODE vectors. */
36339 for (i = j = 0; i < n; i += 2, j++)
36341 op0 = gen_reg_rtx (first_imode);
36342 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
36344 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
36345 ops[j] = gen_reg_rtx (second_imode);
36346 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
36349 /* Interleave low SECOND_IMODE vectors. */
36350 switch (second_imode)
36352 case V4SImode:
36353 for (i = j = 0; i < n / 2; i += 2, j++)
36355 op0 = gen_reg_rtx (second_imode);
36356 emit_insn (gen_interleave_second_low (op0, ops[i],
36357 ops[i + 1]));
36359 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
36360 vector. */
36361 ops[j] = gen_reg_rtx (third_imode);
36362 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
36364 second_imode = V2DImode;
36365 gen_interleave_second_low = gen_vec_interleave_lowv2di;
36366 /* FALLTHRU */
36368 case V2DImode:
36369 op0 = gen_reg_rtx (second_imode);
36370 emit_insn (gen_interleave_second_low (op0, ops[0],
36371 ops[1]));
36373 /* Cast the SECOND_IMODE vector back to a vector in the original
36374 mode. */
36375 emit_insn (gen_rtx_SET (VOIDmode, target,
36376 gen_lowpart (mode, op0)));
36377 break;
36379 default:
36380 gcc_unreachable ();
36384 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
36385 all values variable, and none identical. */
36387 static void
36388 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
36389 rtx target, rtx vals)
36391 rtx ops[32], op0, op1;
36392 enum machine_mode half_mode = VOIDmode;
36393 int n, i;
36395 switch (mode)
36397 case V2SFmode:
36398 case V2SImode:
36399 if (!mmx_ok && !TARGET_SSE)
36400 break;
36401 /* FALLTHRU */
36403 case V8SFmode:
36404 case V8SImode:
36405 case V4DFmode:
36406 case V4DImode:
36407 case V4SFmode:
36408 case V4SImode:
36409 case V2DFmode:
36410 case V2DImode:
36411 n = GET_MODE_NUNITS (mode);
36412 for (i = 0; i < n; i++)
36413 ops[i] = XVECEXP (vals, 0, i);
36414 ix86_expand_vector_init_concat (mode, target, ops, n);
36415 return;
36417 case V32QImode:
36418 half_mode = V16QImode;
36419 goto half;
36421 case V16HImode:
36422 half_mode = V8HImode;
36423 goto half;
36425 half:
36426 n = GET_MODE_NUNITS (mode);
36427 for (i = 0; i < n; i++)
36428 ops[i] = XVECEXP (vals, 0, i);
36429 op0 = gen_reg_rtx (half_mode);
36430 op1 = gen_reg_rtx (half_mode);
36431 ix86_expand_vector_init_interleave (half_mode, op0, ops,
36432 n >> 2);
36433 ix86_expand_vector_init_interleave (half_mode, op1,
36434 &ops [n >> 1], n >> 2);
36435 emit_insn (gen_rtx_SET (VOIDmode, target,
36436 gen_rtx_VEC_CONCAT (mode, op0, op1)));
36437 return;
36439 case V16QImode:
36440 if (!TARGET_SSE4_1)
36441 break;
36442 /* FALLTHRU */
36444 case V8HImode:
36445 if (!TARGET_SSE2)
36446 break;
36448 /* Don't use ix86_expand_vector_init_interleave if we can't
36449 move from GPR to SSE register directly. */
36450 if (!TARGET_INTER_UNIT_MOVES)
36451 break;
36453 n = GET_MODE_NUNITS (mode);
36454 for (i = 0; i < n; i++)
36455 ops[i] = XVECEXP (vals, 0, i);
36456 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
36457 return;
36459 case V4HImode:
36460 case V8QImode:
36461 break;
36463 default:
36464 gcc_unreachable ();
36468 int i, j, n_elts, n_words, n_elt_per_word;
36469 enum machine_mode inner_mode;
36470 rtx words[4], shift;
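/* Generic fallback for the remaining integer vector modes: pack the
   elements into word_mode integers with shifts and IORs, then assemble
   the vector from one, two, or four such words. */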
36472 inner_mode = GET_MODE_INNER (mode);
36473 n_elts = GET_MODE_NUNITS (mode);
36474 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
36475 n_elt_per_word = n_elts / n_words;
36476 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
36478 for (i = 0; i < n_words; ++i)
36480 rtx word = NULL_RTX;
36482 for (j = 0; j < n_elt_per_word; ++j)
36484 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
36485 elt = convert_modes (word_mode, inner_mode, elt, true);
36487 if (j == 0)
36488 word = elt;
36489 else
36491 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
36492 word, 1, OPTAB_LIB_WIDEN);
36493 word = expand_simple_binop (word_mode, IOR, word, elt,
36494 word, 1, OPTAB_LIB_WIDEN);
36498 words[i] = word;
36501 if (n_words == 1)
36502 emit_move_insn (target, gen_lowpart (mode, words[0]));
36503 else if (n_words == 2)
36505 rtx tmp = gen_reg_rtx (mode);
36506 emit_clobber (tmp);
36507 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
36508 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
36509 emit_move_insn (target, tmp);
36511 else if (n_words == 4)
36513 rtx tmp = gen_reg_rtx (V4SImode);
36514 gcc_assert (word_mode == SImode);
36515 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
36516 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
36517 emit_move_insn (target, gen_lowpart (mode, tmp));
36519 else
36520 gcc_unreachable ();
36524 /* Initialize vector TARGET via VALS. Suppress the use of MMX
36525 instructions unless MMX_OK is true. */
36527 void
36528 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
36530 enum machine_mode mode = GET_MODE (target);
36531 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36532 int n_elts = GET_MODE_NUNITS (mode);
36533 int n_var = 0, one_var = -1;
36534 bool all_same = true, all_const_zero = true;
36535 int i;
36536 rtx x;
36538 for (i = 0; i < n_elts; ++i)
36540 x = XVECEXP (vals, 0, i);
36541 if (!(CONST_INT_P (x)
36542 || GET_CODE (x) == CONST_DOUBLE
36543 || GET_CODE (x) == CONST_FIXED))
36544 n_var++, one_var = i;
36545 else if (x != CONST0_RTX (inner_mode))
36546 all_const_zero = false;
36547 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
36548 all_same = false;
36551 /* Constants are best loaded from the constant pool. */
36552 if (n_var == 0)
36554 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
36555 return;
36558 /* If all values are identical, broadcast the value. */
36559 if (all_same
36560 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
36561 XVECEXP (vals, 0, 0)))
36562 return;
36564 /* Values where only one field is non-constant are best loaded from
36565 the pool and overwritten via move later. */
36566 if (n_var == 1)
36568 if (all_const_zero
36569 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
36570 XVECEXP (vals, 0, one_var),
36571 one_var))
36572 return;
36574 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
36575 return;
36578 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
36581 void
36582 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
36584 enum machine_mode mode = GET_MODE (target);
36585 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36586 enum machine_mode half_mode;
36587 bool use_vec_merge = false;
36588 rtx tmp;
36589 static rtx (*gen_extract[6][2]) (rtx, rtx)
36591 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
36592 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
36593 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
36594 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
36595 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
36596 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
36598 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
36600 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
36601 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
36602 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
36603 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
36604 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
36605 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
36607 int i, j, n;
36609 switch (mode)
36611 case V2SFmode:
36612 case V2SImode:
36613 if (mmx_ok)
36615 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36616 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
36617 if (elt == 0)
36618 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36619 else
36620 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36621 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36622 return;
36624 break;
36626 case V2DImode:
36627 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
36628 if (use_vec_merge)
36629 break;
36631 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
36632 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
36633 if (elt == 0)
36634 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
36635 else
36636 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
36637 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36638 return;
36640 case V2DFmode:
36642 rtx op0, op1;
36644 /* For the two element vectors, we implement a VEC_CONCAT with
36645 the extraction of the other element. */
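/* E.g. setting element 1 of { A, B } to X extracts A from TARGET and
   emits TARGET = VEC_CONCAT (A, X).  */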
36647 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
36648 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
36650 if (elt == 0)
36651 op0 = val, op1 = tmp;
36652 else
36653 op0 = tmp, op1 = val;
36655 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
36656 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36658 return;
36660 case V4SFmode:
36661 use_vec_merge = TARGET_SSE4_1;
36662 if (use_vec_merge)
36663 break;
36665 switch (elt)
36667 case 0:
36668 use_vec_merge = true;
36669 break;
36671 case 1:
36672 /* tmp = target = A B C D */
36673 tmp = copy_to_reg (target);
36674 /* target = A A B B */
36675 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
36676 /* target = X A B B */
36677 ix86_expand_vector_set (false, target, val, 0);
36678 /* target = A X C D */
36679 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36680 const1_rtx, const0_rtx,
36681 GEN_INT (2+4), GEN_INT (3+4)));
36682 return;
36684 case 2:
36685 /* tmp = target = A B C D */
36686 tmp = copy_to_reg (target);
36687 /* tmp = X B C D */
36688 ix86_expand_vector_set (false, tmp, val, 0);
36689 /* target = A B X D */
36690 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36691 const0_rtx, const1_rtx,
36692 GEN_INT (0+4), GEN_INT (3+4)));
36693 return;
36695 case 3:
36696 /* tmp = target = A B C D */
36697 tmp = copy_to_reg (target);
36698 /* tmp = X B C D */
36699 ix86_expand_vector_set (false, tmp, val, 0);
36700 /* target = A B C X */
36701 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
36702 const0_rtx, const1_rtx,
36703 GEN_INT (2+4), GEN_INT (0+4)));
36704 return;
36706 default:
36707 gcc_unreachable ();
36709 break;
36711 case V4SImode:
36712 use_vec_merge = TARGET_SSE4_1;
36713 if (use_vec_merge)
36714 break;
36716 /* Element 0 handled by vec_merge below. */
36717 if (elt == 0)
36719 use_vec_merge = true;
36720 break;
36723 if (TARGET_SSE2)
36725 /* With SSE2, use integer shuffles to swap element 0 and ELT,
36726 store into element 0, then shuffle them back. */
36728 rtx order[4];
36730 order[0] = GEN_INT (elt);
36731 order[1] = const1_rtx;
36732 order[2] = const2_rtx;
36733 order[3] = GEN_INT (3);
36734 order[elt] = const0_rtx;
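/* The resulting permutation swaps element 0 with ELT and is therefore its
   own inverse; e.g. for ELT == 2 the order is { 2, 1, 0, 3 }, so the same
   pshufd restores the original layout afterwards.  */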
36736 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36737 order[1], order[2], order[3]));
36739 ix86_expand_vector_set (false, target, val, 0);
36741 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
36742 order[1], order[2], order[3]));
36744 else
36746 /* For SSE1, we have to reuse the V4SF code. */
36747 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
36748 gen_lowpart (SFmode, val), elt);
36750 return;
36752 case V8HImode:
36753 use_vec_merge = TARGET_SSE2;
36754 break;
36755 case V4HImode:
36756 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36757 break;
36759 case V16QImode:
36760 use_vec_merge = TARGET_SSE4_1;
36761 break;
36763 case V8QImode:
36764 break;
36766 case V32QImode:
36767 half_mode = V16QImode;
36768 j = 0;
36769 n = 16;
36770 goto half;
36772 case V16HImode:
36773 half_mode = V8HImode;
36774 j = 1;
36775 n = 8;
36776 goto half;
36778 case V8SImode:
36779 half_mode = V4SImode;
36780 j = 2;
36781 n = 4;
36782 goto half;
36784 case V4DImode:
36785 half_mode = V2DImode;
36786 j = 3;
36787 n = 2;
36788 goto half;
36790 case V8SFmode:
36791 half_mode = V4SFmode;
36792 j = 4;
36793 n = 4;
36794 goto half;
36796 case V4DFmode:
36797 half_mode = V2DFmode;
36798 j = 5;
36799 n = 2;
36800 goto half;
36802 half:
36803 /* Compute offset. */
36804 i = elt / n;
36805 elt %= n;
36807 gcc_assert (i <= 1);
36809 /* Extract the half. */
36810 tmp = gen_reg_rtx (half_mode);
36811 emit_insn (gen_extract[j][i] (tmp, target));
36813 /* Put val in tmp at elt. */
36814 ix86_expand_vector_set (false, tmp, val, elt);
36816 /* Put it back. */
36817 emit_insn (gen_insert[j][i] (target, target, tmp));
36818 return;
36820 default:
36821 break;
36824 if (use_vec_merge)
36826 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
36827 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
36828 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
36830 else
36832 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
36834 emit_move_insn (mem, target);
36836 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
36837 emit_move_insn (tmp, val);
36839 emit_move_insn (target, mem);
36843 void
36844 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
36846 enum machine_mode mode = GET_MODE (vec);
36847 enum machine_mode inner_mode = GET_MODE_INNER (mode);
36848 bool use_vec_extr = false;
36849 rtx tmp;
36851 switch (mode)
36853 case V2SImode:
36854 case V2SFmode:
36855 if (!mmx_ok)
36856 break;
36857 /* FALLTHRU */
36859 case V2DFmode:
36860 case V2DImode:
36861 use_vec_extr = true;
36862 break;
36864 case V4SFmode:
36865 use_vec_extr = TARGET_SSE4_1;
36866 if (use_vec_extr)
36867 break;
36869 switch (elt)
36871 case 0:
36872 tmp = vec;
36873 break;
36875 case 1:
36876 case 3:
36877 tmp = gen_reg_rtx (mode);
36878 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
36879 GEN_INT (elt), GEN_INT (elt),
36880 GEN_INT (elt+4), GEN_INT (elt+4)));
36881 break;
36883 case 2:
36884 tmp = gen_reg_rtx (mode);
36885 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
36886 break;
36888 default:
36889 gcc_unreachable ();
36891 vec = tmp;
36892 use_vec_extr = true;
36893 elt = 0;
36894 break;
36896 case V4SImode:
36897 use_vec_extr = TARGET_SSE4_1;
36898 if (use_vec_extr)
36899 break;
36901 if (TARGET_SSE2)
36903 switch (elt)
36905 case 0:
36906 tmp = vec;
36907 break;
36909 case 1:
36910 case 3:
36911 tmp = gen_reg_rtx (mode);
36912 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
36913 GEN_INT (elt), GEN_INT (elt),
36914 GEN_INT (elt), GEN_INT (elt)));
36915 break;
36917 case 2:
36918 tmp = gen_reg_rtx (mode);
36919 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
36920 break;
36922 default:
36923 gcc_unreachable ();
36925 vec = tmp;
36926 use_vec_extr = true;
36927 elt = 0;
36929 else
36931 /* For SSE1, we have to reuse the V4SF code. */
36932 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
36933 gen_lowpart (V4SFmode, vec), elt);
36934 return;
36936 break;
36938 case V8HImode:
36939 use_vec_extr = TARGET_SSE2;
36940 break;
36941 case V4HImode:
36942 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
36943 break;
36945 case V16QImode:
36946 use_vec_extr = TARGET_SSE4_1;
36947 break;
36949 case V8SFmode:
36950 if (TARGET_AVX)
36952 tmp = gen_reg_rtx (V4SFmode);
36953 if (elt < 4)
36954 emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
36955 else
36956 emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
36957 ix86_expand_vector_extract (false, target, tmp, elt & 3);
36958 return;
36960 break;
36962 case V4DFmode:
36963 if (TARGET_AVX)
36965 tmp = gen_reg_rtx (V2DFmode);
36966 if (elt < 2)
36967 emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
36968 else
36969 emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
36970 ix86_expand_vector_extract (false, target, tmp, elt & 1);
36971 return;
36973 break;
36975 case V32QImode:
36976 if (TARGET_AVX)
36978 tmp = gen_reg_rtx (V16QImode);
36979 if (elt < 16)
36980 emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
36981 else
36982 emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
36983 ix86_expand_vector_extract (false, target, tmp, elt & 15);
36984 return;
36986 break;
36988 case V16HImode:
36989 if (TARGET_AVX)
36991 tmp = gen_reg_rtx (V8HImode);
36992 if (elt < 8)
36993 emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
36994 else
36995 emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
36996 ix86_expand_vector_extract (false, target, tmp, elt & 7);
36997 return;
36999 break;
37001 case V8SImode:
37002 if (TARGET_AVX)
37004 tmp = gen_reg_rtx (V4SImode);
37005 if (elt < 4)
37006 emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
37007 else
37008 emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
37009 ix86_expand_vector_extract (false, target, tmp, elt & 3);
37010 return;
37012 break;
37014 case V4DImode:
37015 if (TARGET_AVX)
37017 tmp = gen_reg_rtx (V2DImode);
37018 if (elt < 2)
37019 emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
37020 else
37021 emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
37022 ix86_expand_vector_extract (false, target, tmp, elt & 1);
37023 return;
37025 break;
37027 case V8QImode:
37028 /* ??? Could extract the appropriate HImode element and shift. */
37029 default:
37030 break;
37033 if (use_vec_extr)
37035 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
37036 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
37038 /* Let the rtl optimizers know about the zero extension performed. */
37039 if (inner_mode == QImode || inner_mode == HImode)
37041 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
37042 target = gen_lowpart (SImode, target);
37045 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
37047 else
37049 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
37051 emit_move_insn (mem, vec);
37053 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
37054 emit_move_insn (target, tmp);
37058 /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
37059 to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
37060 The upper bits of DEST are undefined, though they shouldn't cause
37061 exceptions (some bits from src or all zeros are ok). */
37063 static void
37064 emit_reduc_half (rtx dest, rtx src, int i)
37066 rtx tem;
37067 switch (GET_MODE (src))
37069 case V4SFmode:
37070 if (i == 128)
37071 tem = gen_sse_movhlps (dest, src, src);
37072 else
37073 tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
37074 GEN_INT (1 + 4), GEN_INT (1 + 4));
37075 break;
37076 case V2DFmode:
37077 tem = gen_vec_interleave_highv2df (dest, src, src);
37078 break;
37079 case V16QImode:
37080 case V8HImode:
37081 case V4SImode:
37082 case V2DImode:
37083 tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
37084 gen_lowpart (V1TImode, src),
37085 GEN_INT (i / 2));
37086 break;
37087 case V8SFmode:
37088 if (i == 256)
37089 tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
37090 else
37091 tem = gen_avx_shufps256 (dest, src, src,
37092 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
37093 break;
37094 case V4DFmode:
37095 if (i == 256)
37096 tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
37097 else
37098 tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
37099 break;
37100 case V32QImode:
37101 case V16HImode:
37102 case V8SImode:
37103 case V4DImode:
37104 if (i == 256)
37105 tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
37106 gen_lowpart (V4DImode, src),
37107 gen_lowpart (V4DImode, src),
37108 const1_rtx);
37109 else
37110 tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
37111 gen_lowpart (V2TImode, src),
37112 GEN_INT (i / 2));
37113 break;
37114 default:
37115 gcc_unreachable ();
37117 emit_insn (tem);
37120 /* Expand a vector reduction. FN is the binary pattern to reduce;
37121 DEST is the destination; IN is the input vector. */
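/* For example, a V4SImode reduction runs the loop below with I equal to
   128 and then 64: the first step combines the high and low 64-bit halves,
   the second combines the two remaining elements, leaving the reduction
   result in element 0 of the final destination.  */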
37123 void
37124 ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
37126 rtx half, dst, vec = in;
37127 enum machine_mode mode = GET_MODE (in);
37128 int i;
37130 /* SSE4 has a special instruction for V8HImode UMIN reduction. */
37131 if (TARGET_SSE4_1
37132 && mode == V8HImode
37133 && fn == gen_uminv8hi3)
37135 emit_insn (gen_sse4_1_phminposuw (dest, in));
37136 return;
37139 for (i = GET_MODE_BITSIZE (mode);
37140 i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
37141 i >>= 1)
37143 half = gen_reg_rtx (mode);
37144 emit_reduc_half (half, vec, i);
37145 if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
37146 dst = dest;
37147 else
37148 dst = gen_reg_rtx (mode);
37149 emit_insn (fn (dst, half, vec));
37150 vec = dst;
37154 /* Target hook for scalar_mode_supported_p. */
37155 static bool
37156 ix86_scalar_mode_supported_p (enum machine_mode mode)
37158 if (DECIMAL_FLOAT_MODE_P (mode))
37159 return default_decimal_float_supported_p ();
37160 else if (mode == TFmode)
37161 return true;
37162 else
37163 return default_scalar_mode_supported_p (mode);
37166 /* Implements target hook vector_mode_supported_p. */
37167 static bool
37168 ix86_vector_mode_supported_p (enum machine_mode mode)
37170 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
37171 return true;
37172 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
37173 return true;
37174 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
37175 return true;
37176 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
37177 return true;
37178 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
37179 return true;
37180 return false;
37183 /* Target hook for c_mode_for_suffix. */
37184 static enum machine_mode
37185 ix86_c_mode_for_suffix (char suffix)
37187 if (suffix == 'q')
37188 return TFmode;
37189 if (suffix == 'w')
37190 return XFmode;
37192 return VOIDmode;
37195 /* Worker function for TARGET_MD_ASM_CLOBBERS.
37197 Implicitly clobber the flags and fpsr registers in every asm, to maintain
37198 source compatibility with the old cc0-based compiler. */
37200 static tree
37201 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
37202 tree inputs ATTRIBUTE_UNUSED,
37203 tree clobbers)
37205 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
37206 clobbers);
37207 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
37208 clobbers);
37209 return clobbers;
37212 /* Implements the targetm.asm.encode_section_info target hook. */
37214 static void ATTRIBUTE_UNUSED
37215 ix86_encode_section_info (tree decl, rtx rtl, int first)
37217 default_encode_section_info (decl, rtl, first);
37219 if (TREE_CODE (decl) == VAR_DECL
37220 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
37221 && ix86_in_large_data_p (decl))
37222 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
37225 /* Worker function for REVERSE_CONDITION. */
37227 enum rtx_code
37228 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
37230 return (mode != CCFPmode && mode != CCFPUmode
37231 ? reverse_condition (code)
37232 : reverse_condition_maybe_unordered (code));
37235 /* Output code to perform an x87 FP register move, from OPERANDS[1]
37236 to OPERANDS[0]. */
37238 const char *
37239 output_387_reg_move (rtx insn, rtx *operands)
37241 if (REG_P (operands[0]))
37243 if (REG_P (operands[1])
37244 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37246 if (REGNO (operands[0]) == FIRST_STACK_REG)
37247 return output_387_ffreep (operands, 0);
37248 return "fstp\t%y0";
37250 if (STACK_TOP_P (operands[0]))
37251 return "fld%Z1\t%y1";
37252 return "fst\t%y0";
37254 else if (MEM_P (operands[0]))
37256 gcc_assert (REG_P (operands[1]));
37257 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
37258 return "fstp%Z0\t%y0";
37259 else
37261 /* There is no non-popping store to memory for XFmode.
37262 So if we need one, follow the store with a load. */
37263 if (GET_MODE (operands[0]) == XFmode)
37264 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
37265 else
37266 return "fst%Z0\t%y0";
37269 else
37270 gcc_unreachable();
37273 /* Output code to perform a conditional jump to LABEL, if C2 flag in
37274 FP status register is set. */
37276 void
37277 ix86_emit_fp_unordered_jump (rtx label)
37279 rtx reg = gen_reg_rtx (HImode);
37280 rtx temp;
37282 emit_insn (gen_x86_fnstsw_1 (reg));
37284 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
37286 emit_insn (gen_x86_sahf_1 (reg));
37288 temp = gen_rtx_REG (CCmode, FLAGS_REG);
37289 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
37291 else
37293 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
37295 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
37296 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
37299 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
37300 gen_rtx_LABEL_REF (VOIDmode, label),
37301 pc_rtx);
37302 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
37304 emit_jump_insn (temp);
37305 predict_jump (REG_BR_PROB_BASE * 10 / 100);
37308 /* Output code to perform a log1p XFmode calculation. */
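/* The x87 fyl2xp1 instruction is only specified for |x| < 1 - sqrt(2)/2
   (about 0.2929), so the code below uses it directly for small |op1| and
   falls back to fyl2x on 1.0 + op1 otherwise.  */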
37310 void ix86_emit_i387_log1p (rtx op0, rtx op1)
37312 rtx label1 = gen_label_rtx ();
37313 rtx label2 = gen_label_rtx ();
37315 rtx tmp = gen_reg_rtx (XFmode);
37316 rtx tmp2 = gen_reg_rtx (XFmode);
37317 rtx test;
37319 emit_insn (gen_absxf2 (tmp, op1));
37320 test = gen_rtx_GE (VOIDmode, tmp,
37321 CONST_DOUBLE_FROM_REAL_VALUE (
37322 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
37323 XFmode));
37324 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
37326 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37327 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
37328 emit_jump (label2);
37330 emit_label (label1);
37331 emit_move_insn (tmp, CONST1_RTX (XFmode));
37332 emit_insn (gen_addxf3 (tmp, op1, tmp));
37333 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
37334 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
37336 emit_label (label2);
37339 /* Emit code for round calculation. */
37340 void ix86_emit_i387_round (rtx op0, rtx op1)
37342 enum machine_mode inmode = GET_MODE (op1);
37343 enum machine_mode outmode = GET_MODE (op0);
37344 rtx e1, e2, res, tmp, tmp1, half;
37345 rtx scratch = gen_reg_rtx (HImode);
37346 rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
37347 rtx jump_label = gen_label_rtx ();
37348 rtx insn;
37349 rtx (*gen_abs) (rtx, rtx);
37350 rtx (*gen_neg) (rtx, rtx);
37352 switch (inmode)
37354 case SFmode:
37355 gen_abs = gen_abssf2;
37356 break;
37357 case DFmode:
37358 gen_abs = gen_absdf2;
37359 break;
37360 case XFmode:
37361 gen_abs = gen_absxf2;
37362 break;
37363 default:
37364 gcc_unreachable ();
37367 switch (outmode)
37369 case SFmode:
37370 gen_neg = gen_negsf2;
37371 break;
37372 case DFmode:
37373 gen_neg = gen_negdf2;
37374 break;
37375 case XFmode:
37376 gen_neg = gen_negxf2;
37377 break;
37378 case HImode:
37379 gen_neg = gen_neghi2;
37380 break;
37381 case SImode:
37382 gen_neg = gen_negsi2;
37383 break;
37384 case DImode:
37385 gen_neg = gen_negdi2;
37386 break;
37387 default:
37388 gcc_unreachable ();
37391 e1 = gen_reg_rtx (inmode);
37392 e2 = gen_reg_rtx (inmode);
37393 res = gen_reg_rtx (outmode);
37395 half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, inmode);
37397 /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
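/* E.g. round (-2.5) = -floor (2.5 + 0.5) = -3, i.e. halfway cases are
   rounded away from zero as required for round ().  */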
37399 /* scratch = fxam(op1) */
37400 emit_insn (gen_rtx_SET (VOIDmode, scratch,
37401 gen_rtx_UNSPEC (HImode, gen_rtvec (1, op1),
37402 UNSPEC_FXAM)));
37403 /* e1 = fabs(op1) */
37404 emit_insn (gen_abs (e1, op1));
37406 /* e2 = e1 + 0.5 */
37407 half = force_reg (inmode, half);
37408 emit_insn (gen_rtx_SET (VOIDmode, e2,
37409 gen_rtx_PLUS (inmode, e1, half)));
37411 /* res = floor(e2) */
37412 if (inmode != XFmode)
37414 tmp1 = gen_reg_rtx (XFmode);
37416 emit_insn (gen_rtx_SET (VOIDmode, tmp1,
37417 gen_rtx_FLOAT_EXTEND (XFmode, e2)));
37419 else
37420 tmp1 = e2;
37422 switch (outmode)
37424 case SFmode:
37425 case DFmode:
37427 rtx tmp0 = gen_reg_rtx (XFmode);
37429 emit_insn (gen_frndintxf2_floor (tmp0, tmp1));
37431 emit_insn (gen_rtx_SET (VOIDmode, res,
37432 gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp0),
37433 UNSPEC_TRUNC_NOOP)));
37435 break;
37436 case XFmode:
37437 emit_insn (gen_frndintxf2_floor (res, tmp1));
37438 break;
37439 case HImode:
37440 emit_insn (gen_lfloorxfhi2 (res, tmp1));
37441 break;
37442 case SImode:
37443 emit_insn (gen_lfloorxfsi2 (res, tmp1));
37444 break;
37445 case DImode:
37446 emit_insn (gen_lfloorxfdi2 (res, tmp1));
37447 break;
37448 default:
37449 gcc_unreachable ();
37452 /* flags = signbit(a) */
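/* fxam stored the sign of OP1 in the C1 flag, which is bit 1 of the high
   byte of the FPU status word, hence the 0x02 test below.  */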
37453 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x02)));
37455 /* if (flags) then res = -res */
37456 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
37457 gen_rtx_EQ (VOIDmode, flags, const0_rtx),
37458 gen_rtx_LABEL_REF (VOIDmode, jump_label),
37459 pc_rtx);
37460 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37461 predict_jump (REG_BR_PROB_BASE * 50 / 100);
37462 JUMP_LABEL (insn) = jump_label;
37464 emit_insn (gen_neg (res, res));
37466 emit_label (jump_label);
37467 LABEL_NUSES (jump_label) = 1;
37469 emit_move_insn (op0, res);
37472 /* Output code to perform a Newton-Raphson approximation of a single precision
37473 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
37475 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
37477 rtx x0, x1, e0, e1;
37479 x0 = gen_reg_rtx (mode);
37480 e0 = gen_reg_rtx (mode);
37481 e1 = gen_reg_rtx (mode);
37482 x1 = gen_reg_rtx (mode);
37484 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
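/* This is one Newton-Raphson step for the reciprocal: with x0 = rcp(b),
   x1 = x0 * (2 - b * x0) = 2 * x0 - b * x0 * x0, which the code below
   computes as (x0 + x0) - (b * x0 * x0).  */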
37486 b = force_reg (mode, b);
37488 /* x0 = rcp(b) estimate */
37489 emit_insn (gen_rtx_SET (VOIDmode, x0,
37490 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
37491 UNSPEC_RCP)));
37492 /* e0 = x0 * b */
37493 emit_insn (gen_rtx_SET (VOIDmode, e0,
37494 gen_rtx_MULT (mode, x0, b)));
37496 /* e0 = x0 * e0 */
37497 emit_insn (gen_rtx_SET (VOIDmode, e0,
37498 gen_rtx_MULT (mode, x0, e0)));
37500 /* e1 = x0 + x0 */
37501 emit_insn (gen_rtx_SET (VOIDmode, e1,
37502 gen_rtx_PLUS (mode, x0, x0)));
37504 /* x1 = e1 - e0 */
37505 emit_insn (gen_rtx_SET (VOIDmode, x1,
37506 gen_rtx_MINUS (mode, e1, e0)));
37508 /* res = a * x1 */
37509 emit_insn (gen_rtx_SET (VOIDmode, res,
37510 gen_rtx_MULT (mode, a, x1)));
37513 /* Output code to perform a Newton-Raphson approximation of a
37514 single precision floating point [reciprocal] square root. */
37516 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
37517 bool recip)
37519 rtx x0, e0, e1, e2, e3, mthree, mhalf;
37520 REAL_VALUE_TYPE r;
37522 x0 = gen_reg_rtx (mode);
37523 e0 = gen_reg_rtx (mode);
37524 e1 = gen_reg_rtx (mode);
37525 e2 = gen_reg_rtx (mode);
37526 e3 = gen_reg_rtx (mode);
37528 real_from_integer (&r, VOIDmode, -3, -1, 0);
37529 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37531 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
37532 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
37534 if (VECTOR_MODE_P (mode))
37536 mthree = ix86_build_const_vector (mode, true, mthree);
37537 mhalf = ix86_build_const_vector (mode, true, mhalf);
37540 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
37541 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
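/* This is one Newton-Raphson step for 1/sqrt(a): with x0 = rsqrt(a),
   x1 = x0 * (3 - a * x0 * x0) / 2 = -0.5 * x0 * (a * x0 * x0 - 3); the
   sqrt variant multiplies through by a, since sqrt(a) = a * rsqrt(a).  */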
37543 a = force_reg (mode, a);
37545 /* x0 = rsqrt(a) estimate */
37546 emit_insn (gen_rtx_SET (VOIDmode, x0,
37547 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
37548 UNSPEC_RSQRT)));
37550 /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */
37551 if (!recip)
37553 rtx zero, mask;
37555 zero = gen_reg_rtx (mode);
37556 mask = gen_reg_rtx (mode);
37558 zero = force_reg (mode, CONST0_RTX(mode));
37559 emit_insn (gen_rtx_SET (VOIDmode, mask,
37560 gen_rtx_NE (mode, zero, a)));
37562 emit_insn (gen_rtx_SET (VOIDmode, x0,
37563 gen_rtx_AND (mode, x0, mask)));
37566 /* e0 = x0 * a */
37567 emit_insn (gen_rtx_SET (VOIDmode, e0,
37568 gen_rtx_MULT (mode, x0, a)));
37569 /* e1 = e0 * x0 */
37570 emit_insn (gen_rtx_SET (VOIDmode, e1,
37571 gen_rtx_MULT (mode, e0, x0)));
37573 /* e2 = e1 - 3. */
37574 mthree = force_reg (mode, mthree);
37575 emit_insn (gen_rtx_SET (VOIDmode, e2,
37576 gen_rtx_PLUS (mode, e1, mthree)));
37578 mhalf = force_reg (mode, mhalf);
37579 if (recip)
37580 /* e3 = -.5 * x0 */
37581 emit_insn (gen_rtx_SET (VOIDmode, e3,
37582 gen_rtx_MULT (mode, x0, mhalf)));
37583 else
37584 /* e3 = -.5 * e0 */
37585 emit_insn (gen_rtx_SET (VOIDmode, e3,
37586 gen_rtx_MULT (mode, e0, mhalf)));
37587 /* ret = e2 * e3 */
37588 emit_insn (gen_rtx_SET (VOIDmode, res,
37589 gen_rtx_MULT (mode, e2, e3)));
37592 #ifdef TARGET_SOLARIS
37593 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
37595 static void
37596 i386_solaris_elf_named_section (const char *name, unsigned int flags,
37597 tree decl)
37599 /* With Binutils 2.15, the "@unwind" marker must be specified on
37600 every occurrence of the ".eh_frame" section, not just the first
37601 one. */
37602 if (TARGET_64BIT
37603 && strcmp (name, ".eh_frame") == 0)
37605 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
37606 flags & SECTION_WRITE ? "aw" : "a");
37607 return;
37610 #ifndef USE_GAS
37611 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
37613 solaris_elf_asm_comdat_section (name, flags, decl);
37614 return;
37616 #endif
37618 default_elf_asm_named_section (name, flags, decl);
37620 #endif /* TARGET_SOLARIS */
37622 /* Return the mangling of TYPE if it is an extended fundamental type. */
37624 static const char *
37625 ix86_mangle_type (const_tree type)
37627 type = TYPE_MAIN_VARIANT (type);
37629 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
37630 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
37631 return NULL;
37633 switch (TYPE_MODE (type))
37635 case TFmode:
37636 /* __float128 is "g". */
37637 return "g";
37638 case XFmode:
37639 /* "long double" or __float80 is "e". */
37640 return "e";
37641 default:
37642 return NULL;
37646 /* For 32-bit code we can save PIC register setup by using
37647 __stack_chk_fail_local hidden function instead of calling
37648 __stack_chk_fail directly. 64-bit code doesn't need to setup any PIC
37649 register, so it is better to call __stack_chk_fail directly. */
37651 static tree ATTRIBUTE_UNUSED
37652 ix86_stack_protect_fail (void)
37654 return TARGET_64BIT
37655 ? default_external_stack_protect_fail ()
37656 : default_hidden_stack_protect_fail ();
37659 /* Select a format to encode pointers in exception handling data. CODE
37660 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
37661 true if the symbol may be affected by dynamic relocations.
37663 ??? All x86 object file formats are capable of representing this.
37664 After all, the relocation needed is the same as for the call insn.
37665 Whether or not a particular assembler allows us to enter such, I
37666 guess we'll have to see. */
37668 int asm_preferred_eh_data_format (int code, int global)
37670 if (flag_pic)
37672 int type = DW_EH_PE_sdata8;
37673 if (!TARGET_64BIT
37674 || ix86_cmodel == CM_SMALL_PIC
37675 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
37676 type = DW_EH_PE_sdata4;
37677 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
37679 if (ix86_cmodel == CM_SMALL
37680 || (ix86_cmodel == CM_MEDIUM && code))
37681 return DW_EH_PE_udata4;
37682 return DW_EH_PE_absptr;
37685 /* Expand copysign from SIGN to the positive value ABS_VALUE
37686 storing in RESULT. If MASK is non-null, it shall be a mask to mask out
37687 the sign-bit. */
37688 static void
37689 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
37691 enum machine_mode mode = GET_MODE (sign);
37692 rtx sgn = gen_reg_rtx (mode);
37693 if (mask == NULL_RTX)
37695 enum machine_mode vmode;
37697 if (mode == SFmode)
37698 vmode = V4SFmode;
37699 else if (mode == DFmode)
37700 vmode = V2DFmode;
37701 else
37702 vmode = mode;
37704 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
37705 if (!VECTOR_MODE_P (mode))
37707 /* We need to generate a scalar mode mask in this case. */
37708 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37709 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37710 mask = gen_reg_rtx (mode);
37711 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37714 else
37715 mask = gen_rtx_NOT (mode, mask);
37716 emit_insn (gen_rtx_SET (VOIDmode, sgn,
37717 gen_rtx_AND (mode, mask, sign)));
37718 emit_insn (gen_rtx_SET (VOIDmode, result,
37719 gen_rtx_IOR (mode, abs_value, sgn)));
37722 /* Expand fabs (OP0) and return a new rtx that holds the result. The
37723 mask for masking out the sign-bit is stored in *SMASK, if that is
37724 non-null. */
37725 static rtx
37726 ix86_expand_sse_fabs (rtx op0, rtx *smask)
37728 enum machine_mode vmode, mode = GET_MODE (op0);
37729 rtx xa, mask;
37731 xa = gen_reg_rtx (mode);
37732 if (mode == SFmode)
37733 vmode = V4SFmode;
37734 else if (mode == DFmode)
37735 vmode = V2DFmode;
37736 else
37737 vmode = mode;
37738 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
37739 if (!VECTOR_MODE_P (mode))
37741 /* We need to generate a scalar mode mask in this case. */
37742 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
37743 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
37744 mask = gen_reg_rtx (mode);
37745 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
37747 emit_insn (gen_rtx_SET (VOIDmode, xa,
37748 gen_rtx_AND (mode, op0, mask)));
37750 if (smask)
37751 *smask = mask;
37753 return xa;
37756 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
37757 swapping the operands if SWAP_OPERANDS is true. The expanded
37758 code is a forward jump to a newly created label in case the
37759 comparison is true. The generated label rtx is returned. */
37760 static rtx
37761 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
37762 bool swap_operands)
37764 rtx label, tmp;
37766 if (swap_operands)
37768 tmp = op0;
37769 op0 = op1;
37770 op1 = tmp;
37773 label = gen_label_rtx ();
37774 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
37775 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37776 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
37777 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
37778 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
37779 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
37780 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
37781 JUMP_LABEL (tmp) = label;
37783 return label;
37786 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
37787 using comparison code CODE. Operands are swapped for the comparison if
37788 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
37789 static rtx
37790 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
37791 bool swap_operands)
37793 rtx (*insn)(rtx, rtx, rtx, rtx);
37794 enum machine_mode mode = GET_MODE (op0);
37795 rtx mask = gen_reg_rtx (mode);
37797 if (swap_operands)
37799 rtx tmp = op0;
37800 op0 = op1;
37801 op1 = tmp;
37804 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
37806 emit_insn (insn (mask, op0, op1,
37807 gen_rtx_fmt_ee (code, mode, op0, op1)));
37808 return mask;
37811 /* Generate and return a rtx of mode MODE for 2**n where n is the number
37812 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
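/* Adding and then subtracting 2**52 (resp. 2**23) forces a nonnegative
   value of smaller magnitude to an integer in the current rounding mode;
   this is the basis of the SSE rounding sequences below.  */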
37813 static rtx
37814 ix86_gen_TWO52 (enum machine_mode mode)
37816 REAL_VALUE_TYPE TWO52r;
37817 rtx TWO52;
37819 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
37820 TWO52 = const_double_from_real_value (TWO52r, mode);
37821 TWO52 = force_reg (mode, TWO52);
37823 return TWO52;
37826 /* Expand SSE sequence for computing lround from OP1 storing
37827 into OP0. */
37828 void
37829 ix86_expand_lround (rtx op0, rtx op1)
37831 /* C code for the stuff we're doing below:
37832 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
37833 return (long)tmp;
37835 enum machine_mode mode = GET_MODE (op1);
37836 const struct real_format *fmt;
37837 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
37838 rtx adj;
37840 /* load nextafter (0.5, 0.0) */
37841 fmt = REAL_MODE_FORMAT (mode);
37842 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
37843 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
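/* Adding 0.5 itself would round values just below one half up to 1.0
   (e.g. in double, the largest value below 0.5 plus 0.5 rounds to 1.0),
   so the predecessor of 0.5 is used as the addend instead.  */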
37845 /* adj = copysign (0.5, op1) */
37846 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
37847 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
37849 /* adj = op1 + adj */
37850 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
37852 /* op0 = (imode)adj */
37853 expand_fix (op0, adj, 0);
37856 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1 storing
37857 into OPERAND0. */
37858 void
37859 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
37861 /* C code for the stuff we're doing below (for do_floor):
37862 xi = (long)op1;
37863 xi -= (double)xi > op1 ? 1 : 0;
37864 return xi;
37866 enum machine_mode fmode = GET_MODE (op1);
37867 enum machine_mode imode = GET_MODE (op0);
37868 rtx ireg, freg, label, tmp;
37870 /* reg = (long)op1 */
37871 ireg = gen_reg_rtx (imode);
37872 expand_fix (ireg, op1, 0);
37874 /* freg = (double)reg */
37875 freg = gen_reg_rtx (fmode);
37876 expand_float (freg, ireg, 0);
37878 /* ireg = (freg > op1) ? ireg - 1 : ireg */
37879 label = ix86_expand_sse_compare_and_jump (UNLE,
37880 freg, op1, !do_floor);
37881 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
37882 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
37883 emit_move_insn (ireg, tmp);
37885 emit_label (label);
37886 LABEL_NUSES (label) = 1;
37888 emit_move_insn (op0, ireg);
37891 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
37892 result in OPERAND0. */
37893 void
37894 ix86_expand_rint (rtx operand0, rtx operand1)
37896 /* C code for the stuff we're doing below:
37897 xa = fabs (operand1);
37898 if (!isless (xa, 2**52))
37899 return operand1;
37900 xa = xa + 2**52 - 2**52;
37901 return copysign (xa, operand1);
37903 enum machine_mode mode = GET_MODE (operand0);
37904 rtx res, xa, label, TWO52, mask;
37906 res = gen_reg_rtx (mode);
37907 emit_move_insn (res, operand1);
37909 /* xa = abs (operand1) */
37910 xa = ix86_expand_sse_fabs (res, &mask);
37912 /* if (!isless (xa, TWO52)) goto label; */
37913 TWO52 = ix86_gen_TWO52 (mode);
37914 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37916 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37917 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
37919 ix86_sse_copysign_to_positive (res, xa, res, mask);
37921 emit_label (label);
37922 LABEL_NUSES (label) = 1;
37924 emit_move_insn (operand0, res);
37927 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
37928 into OPERAND0. */
37929 void
37930 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
37932 /* C code for the stuff we expand below.
37933 double xa = fabs (x), x2;
37934 if (!isless (xa, TWO52))
37935 return x;
37936 xa = xa + TWO52 - TWO52;
37937 x2 = copysign (xa, x);
37938 Compensate. Floor:
37939 if (x2 > x)
37940 x2 -= 1;
37941 Compensate. Ceil:
37942 if (x2 < x)
37943 x2 -= -1;
37944 return x2;
37946 enum machine_mode mode = GET_MODE (operand0);
37947 rtx xa, TWO52, tmp, label, one, res, mask;
37949 TWO52 = ix86_gen_TWO52 (mode);
37951 /* Temporary for holding the result, initialized to the input
37952 operand to ease control flow. */
37953 res = gen_reg_rtx (mode);
37954 emit_move_insn (res, operand1);
37956 /* xa = abs (operand1) */
37957 xa = ix86_expand_sse_fabs (res, &mask);
37959 /* if (!isless (xa, TWO52)) goto label; */
37960 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
37962 /* xa = xa + TWO52 - TWO52; */
37963 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
37964 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
37966 /* xa = copysign (xa, operand1) */
37967 ix86_sse_copysign_to_positive (xa, xa, res, mask);
37969 /* generate 1.0 or -1.0 */
37970 one = force_reg (mode,
37971 const_double_from_real_value (do_floor
37972 ? dconst1 : dconstm1, mode));
37974 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
37975 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
37976 emit_insn (gen_rtx_SET (VOIDmode, tmp,
37977 gen_rtx_AND (mode, one, tmp)));
37978 /* We always need to subtract here to preserve signed zero. */
37979 tmp = expand_simple_binop (mode, MINUS,
37980 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
37981 emit_move_insn (res, tmp);
37983 emit_label (label);
37984 LABEL_NUSES (label) = 1;
37986 emit_move_insn (operand0, res);
37989 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
37990 into OPERAND0. */
37991 void
37992 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
37994 /* C code for the stuff we expand below.
37995 double xa = fabs (x), x2;
37996 if (!isless (xa, TWO52))
37997 return x;
37998 x2 = (double)(long)x;
37999 Compensate. Floor:
38000 if (x2 > x)
38001 x2 -= 1;
38002 Compensate. Ceil:
38003 if (x2 < x)
38004 x2 += 1;
38005 if (HONOR_SIGNED_ZEROS (mode))
38006 return copysign (x2, x);
38007 return x2;
38009 enum machine_mode mode = GET_MODE (operand0);
38010 rtx xa, xi, TWO52, tmp, label, one, res, mask;
38012 TWO52 = ix86_gen_TWO52 (mode);
38014 /* Temporary for holding the result, initialized to the input
38015 operand to ease control flow. */
38016 res = gen_reg_rtx (mode);
38017 emit_move_insn (res, operand1);
38019 /* xa = abs (operand1) */
38020 xa = ix86_expand_sse_fabs (res, &mask);
38022 /* if (!isless (xa, TWO52)) goto label; */
38023 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38025 /* xa = (double)(long)x */
38026 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38027 expand_fix (xi, res, 0);
38028 expand_float (xa, xi, 0);
38030 /* generate 1.0 */
38031 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
38033 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
38034 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
38035 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38036 gen_rtx_AND (mode, one, tmp)));
38037 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
38038 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38039 emit_move_insn (res, tmp);
38041 if (HONOR_SIGNED_ZEROS (mode))
38042 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
38044 emit_label (label);
38045 LABEL_NUSES (label) = 1;
38047 emit_move_insn (operand0, res);
38050 /* Expand SSE sequence for computing round from OPERAND1 storing
38051 into OPERAND0. Sequence that works without relying on DImode truncation
38052 via cvttsd2siq that is only available on 64bit targets. */
38053 void
38054 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
38056 /* C code for the stuff we expand below.
38057 double xa = fabs (x), xa2, x2;
38058 if (!isless (xa, TWO52))
38059 return x;
38060 Using the absolute value and copying back sign makes
38061 -0.0 -> -0.0 correct.
38062 xa2 = xa + TWO52 - TWO52;
38063 Compensate.
38064 dxa = xa2 - xa;
38065 if (dxa <= -0.5)
38066 xa2 += 1;
38067 else if (dxa > 0.5)
38068 xa2 -= 1;
38069 x2 = copysign (xa2, x);
38070 return x2;
38072 enum machine_mode mode = GET_MODE (operand0);
38073 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
38075 TWO52 = ix86_gen_TWO52 (mode);
38077 /* Temporary for holding the result, initialized to the input
38078 operand to ease control flow. */
38079 res = gen_reg_rtx (mode);
38080 emit_move_insn (res, operand1);
38082 /* xa = abs (operand1) */
38083 xa = ix86_expand_sse_fabs (res, &mask);
38085 /* if (!isless (xa, TWO52)) goto label; */
38086 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38088 /* xa2 = xa + TWO52 - TWO52; */
38089 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38090 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
38092 /* dxa = xa2 - xa; */
38093 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
38095 /* generate 0.5, 1.0 and -0.5 */
38096 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
38097 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
38098 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
38099 0, OPTAB_DIRECT);
38101 /* Compensate. */
38102 tmp = gen_reg_rtx (mode);
38103 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
38104 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
38105 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38106 gen_rtx_AND (mode, one, tmp)));
38107 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38108 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
38109 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
38110 emit_insn (gen_rtx_SET (VOIDmode, tmp,
38111 gen_rtx_AND (mode, one, tmp)));
38112 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
38114 /* res = copysign (xa2, operand1) */
38115 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
38117 emit_label (label);
38118 LABEL_NUSES (label) = 1;
38120 emit_move_insn (operand0, res);
38123 /* Expand SSE sequence for computing trunc from OPERAND1 storing
38124 into OPERAND0. */
38125 void
38126 ix86_expand_trunc (rtx operand0, rtx operand1)
38128 /* C code for SSE variant we expand below.
38129 double xa = fabs (x), x2;
38130 if (!isless (xa, TWO52))
38131 return x;
38132 x2 = (double)(long)x;
38133 if (HONOR_SIGNED_ZEROS (mode))
38134 return copysign (x2, x);
38135 return x2;
38137 enum machine_mode mode = GET_MODE (operand0);
38138 rtx xa, xi, TWO52, label, res, mask;
38140 TWO52 = ix86_gen_TWO52 (mode);
38142 /* Temporary for holding the result, initialized to the input
38143 operand to ease control flow. */
38144 res = gen_reg_rtx (mode);
38145 emit_move_insn (res, operand1);
38147 /* xa = abs (operand1) */
38148 xa = ix86_expand_sse_fabs (res, &mask);
38150 /* if (!isless (xa, TWO52)) goto label; */
38151 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38153 /* x = (double)(long)x */
38154 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38155 expand_fix (xi, res, 0);
38156 expand_float (res, xi, 0);
38158 if (HONOR_SIGNED_ZEROS (mode))
38159 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
38161 emit_label (label);
38162 LABEL_NUSES (label) = 1;
38164 emit_move_insn (operand0, res);
38167 /* Expand SSE sequence for computing trunc from OPERAND1 storing
38168 into OPERAND0. */
38169 void
38170 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
38172 enum machine_mode mode = GET_MODE (operand0);
38173 rtx xa, mask, TWO52, label, one, res, smask, tmp;
38175 /* C code for SSE variant we expand below.
38176 double xa = fabs (x), x2;
38177 if (!isless (xa, TWO52))
38178 return x;
38179 xa2 = xa + TWO52 - TWO52;
38180 Compensate:
38181 if (xa2 > xa)
38182 xa2 -= 1.0;
38183 x2 = copysign (xa2, x);
38184 return x2;
38187 TWO52 = ix86_gen_TWO52 (mode);
38189 /* Temporary for holding the result, initialized to the input
38190 operand to ease control flow. */
38191 res = gen_reg_rtx (mode);
38192 emit_move_insn (res, operand1);
38194 /* xa = abs (operand1) */
38195 xa = ix86_expand_sse_fabs (res, &smask);
38197 /* if (!isless (xa, TWO52)) goto label; */
38198 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38200 /* res = xa + TWO52 - TWO52; */
38201 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
38202 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
38203 emit_move_insn (res, tmp);
38205 /* generate 1.0 */
38206 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
38208 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
38209 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
38210 emit_insn (gen_rtx_SET (VOIDmode, mask,
38211 gen_rtx_AND (mode, mask, one)));
38212 tmp = expand_simple_binop (mode, MINUS,
38213 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
38214 emit_move_insn (res, tmp);
38216 /* res = copysign (res, operand1) */
38217 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
38219 emit_label (label);
38220 LABEL_NUSES (label) = 1;
38222 emit_move_insn (operand0, res);
38225 /* Expand SSE sequence for computing round from OPERAND1 storing
38226 into OPERAND0. */
38227 void
38228 ix86_expand_round (rtx operand0, rtx operand1)
38230 /* C code for the stuff we're doing below:
38231 double xa = fabs (x);
38232 if (!isless (xa, TWO52))
38233 return x;
38234 xa = (double)(long)(xa + nextafter (0.5, 0.0));
38235 return copysign (xa, x);
38237 enum machine_mode mode = GET_MODE (operand0);
38238 rtx res, TWO52, xa, label, xi, half, mask;
38239 const struct real_format *fmt;
38240 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38242 /* Temporary for holding the result, initialized to the input
38243 operand to ease control flow. */
38244 res = gen_reg_rtx (mode);
38245 emit_move_insn (res, operand1);
38247 TWO52 = ix86_gen_TWO52 (mode);
38248 xa = ix86_expand_sse_fabs (res, &mask);
38249 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
38251 /* load nextafter (0.5, 0.0) */
38252 fmt = REAL_MODE_FORMAT (mode);
38253 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38254 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38256 /* xa = xa + 0.5 */
38257 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
38258 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
38260 /* xa = (double)(int64_t)xa */
38261 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
38262 expand_fix (xi, xa, 0);
38263 expand_float (xa, xi, 0);
38265 /* res = copysign (xa, operand1) */
38266 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
38268 emit_label (label);
38269 LABEL_NUSES (label) = 1;
38271 emit_move_insn (operand0, res);
38274 /* Expand SSE sequence for computing round
38275 from OP1 storing into OP0 using sse4 round insn. */
38276 void
38277 ix86_expand_round_sse4 (rtx op0, rtx op1)
38279 enum machine_mode mode = GET_MODE (op0);
38280 rtx e1, e2, res, half;
38281 const struct real_format *fmt;
38282 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
38283 rtx (*gen_copysign) (rtx, rtx, rtx);
38284 rtx (*gen_round) (rtx, rtx, rtx);
38286 switch (mode)
38288 case SFmode:
38289 gen_copysign = gen_copysignsf3;
38290 gen_round = gen_sse4_1_roundsf2;
38291 break;
38292 case DFmode:
38293 gen_copysign = gen_copysigndf3;
38294 gen_round = gen_sse4_1_rounddf2;
38295 break;
38296 default:
38297 gcc_unreachable ();
38300 /* round (a) = trunc (a + copysign (0.5, a)) */
38302 /* load nextafter (0.5, 0.0) */
38303 fmt = REAL_MODE_FORMAT (mode);
38304 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
38305 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
38306 half = const_double_from_real_value (pred_half, mode);
38308 /* e1 = copysign (0.5, op1) */
38309 e1 = gen_reg_rtx (mode);
38310 emit_insn (gen_copysign (e1, half, op1));
38312 /* e2 = op1 + e1 */
38313 e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
38315 /* res = trunc (e2) */
38316 res = gen_reg_rtx (mode);
38317 emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
38319 emit_move_insn (op0, res);
38323 /* Table of valid machine attributes. */
38324 static const struct attribute_spec ix86_attribute_table[] =
38326 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
38327 affects_type_identity } */
38328 /* Stdcall attribute says callee is responsible for popping arguments
38329 if they are not variable. */
38330 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38331 true },
38332 /* Fastcall attribute says callee is responsible for popping arguments
38333 if they are not variable. */
38334 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38335 true },
38336 /* Thiscall attribute says callee is responsible for popping arguments
38337 if they are not variable. */
38338 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38339 true },
38340 /* Cdecl attribute says the callee is a normal C declaration */
38341 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38342 true },
38343 /* Regparm attribute specifies how many integer arguments are to be
38344 passed in registers. */
38345 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
38346 true },
38347 /* Sseregparm attribute says we are using x86_64 calling conventions
38348 for FP arguments. */
38349 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
38350 true },
38351 /* The transactional memory builtins are implicitly regparm or fastcall
38352 depending on the ABI. Override the generic do-nothing attribute that
38353 these builtins were declared with. */
38354 { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
38355 true },
38356 /* force_align_arg_pointer says this function realigns the stack at entry. */
38357 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
38358 false, true, true, ix86_handle_cconv_attribute, false },
38359 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
38360 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
38361 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
38362 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
38363 false },
38364 #endif
38365 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38366 false },
38367 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
38368 false },
38369 #ifdef SUBTARGET_ATTRIBUTE_TABLE
38370 SUBTARGET_ATTRIBUTE_TABLE,
38371 #endif
38372 /* ms_abi and sysv_abi calling convention function attributes. */
38373 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38374 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
38375 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
38376 false },
38377 { "callee_pop_aggregate_return", 1, 1, false, true, true,
38378 ix86_handle_callee_pop_aggregate_return, true },
38379 /* End element. */
38380 { NULL, 0, 0, false, false, false, NULL, false }
38383 /* Implement targetm.vectorize.builtin_vectorization_cost. */
38384 static int
38385 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
38386 tree vectype,
38387 int misalign ATTRIBUTE_UNUSED)
38389 unsigned elements;
38391 switch (type_of_cost)
38393 case scalar_stmt:
38394 return ix86_cost->scalar_stmt_cost;
38396 case scalar_load:
38397 return ix86_cost->scalar_load_cost;
38399 case scalar_store:
38400 return ix86_cost->scalar_store_cost;
38402 case vector_stmt:
38403 return ix86_cost->vec_stmt_cost;
38405 case vector_load:
38406 return ix86_cost->vec_align_load_cost;
38408 case vector_store:
38409 return ix86_cost->vec_store_cost;
38411 case vec_to_scalar:
38412 return ix86_cost->vec_to_scalar_cost;
38414 case scalar_to_vec:
38415 return ix86_cost->scalar_to_vec_cost;
38417 case unaligned_load:
38418 case unaligned_store:
38419 return ix86_cost->vec_unalign_load_cost;
38421 case cond_branch_taken:
38422 return ix86_cost->cond_taken_branch_cost;
38424 case cond_branch_not_taken:
38425 return ix86_cost->cond_not_taken_branch_cost;
38427 case vec_perm:
38428 case vec_promote_demote:
38429 return ix86_cost->vec_stmt_cost;
38431 case vec_construct:
38432 elements = TYPE_VECTOR_SUBPARTS (vectype);
38433 return elements / 2 + 1;
38435 default:
38436 gcc_unreachable ();
38440 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
38441 insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
38442 insn every time. */
38444 static GTY(()) rtx vselect_insn;
38446 /* Initialize vselect_insn. */
38448 static void
38449 init_vselect_insn (void)
38451 unsigned i;
38452 rtx x;
38454 x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
38455 for (i = 0; i < MAX_VECT_LEN; ++i)
38456 XVECEXP (x, 0, i) = const0_rtx;
38457 x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
38458 const0_rtx), x);
38459 x = gen_rtx_SET (VOIDmode, const0_rtx, x);
38460 start_sequence ();
38461 vselect_insn = emit_insn (x);
38462 end_sequence ();
38465 /* Construct (set target (vec_select op0 (parallel perm))) and
38466 return true if that's a valid instruction in the active ISA. */
38468 static bool
38469 expand_vselect (rtx target, rtx op0, const unsigned char *perm,
38470 unsigned nelt, bool testing_p)
38472 unsigned int i;
38473 rtx x, save_vconcat;
38474 int icode;
38476 if (vselect_insn == NULL_RTX)
38477 init_vselect_insn ();
38479 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
38480 PUT_NUM_ELEM (XVEC (x, 0), nelt);
38481 for (i = 0; i < nelt; ++i)
38482 XVECEXP (x, 0, i) = GEN_INT (perm[i]);
38483 save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38484 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
38485 PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
38486 SET_DEST (PATTERN (vselect_insn)) = target;
38487 icode = recog_memoized (vselect_insn);
38489 if (icode >= 0 && !testing_p)
38490 emit_insn (copy_rtx (PATTERN (vselect_insn)));
38492 SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
38493 XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
38494 INSN_CODE (vselect_insn) = -1;
38496 return icode >= 0;
38499 /* Similar, but generate a vec_concat from op0 and op1 as well. */
38501 static bool
38502 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
38503 const unsigned char *perm, unsigned nelt,
38504 bool testing_p)
38506 enum machine_mode v2mode;
38507 rtx x;
38508 bool ok;
38510 if (vselect_insn == NULL_RTX)
38511 init_vselect_insn ();
38513 v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
38514 x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
38515 PUT_MODE (x, v2mode);
38516 XEXP (x, 0) = op0;
38517 XEXP (x, 1) = op1;
38518 ok = expand_vselect (target, x, perm, nelt, testing_p);
38519 XEXP (x, 0) = const0_rtx;
38520 XEXP (x, 1) = const0_rtx;
38521 return ok;
38524 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38525 in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
38527 static bool
38528 expand_vec_perm_blend (struct expand_vec_perm_d *d)
38530 enum machine_mode vmode = d->vmode;
38531 unsigned i, mask, nelt = d->nelt;
38532 rtx target, op0, op1, x;
38533 rtx rperm[32], vperm;
38535 if (d->one_operand_p)
38536 return false;
38537 if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
38539 else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
38541 else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
38543 else
38544 return false;
38546 /* This is a blend, not a permute. Elements must stay in their
38547 respective lanes. */
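/* For example, for V4SFmode { 0, 5, 2, 7 } is a valid blend: each result
   element comes either from op0 or from the same position in op1.
   { 1, 5, 2, 7 } is not, since result element 0 would have to come from
   position 1. */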
38548 for (i = 0; i < nelt; ++i)
38550 unsigned e = d->perm[i];
38551 if (!(e == i || e == i + nelt))
38552 return false;
38555 if (d->testing_p)
38556 return true;
38558 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
38559 decision should be extracted elsewhere, so that we only try that
38560 sequence once all budget==3 options have been tried. */
38561 target = d->target;
38562 op0 = d->op0;
38563 op1 = d->op1;
38564 mask = 0;
38566 switch (vmode)
38568 case V4DFmode:
38569 case V8SFmode:
38570 case V2DFmode:
38571 case V4SFmode:
38572 case V8HImode:
38573 case V8SImode:
38574 for (i = 0; i < nelt; ++i)
38575 mask |= (d->perm[i] >= nelt) << i;
38576 break;
38578 case V2DImode:
38579 for (i = 0; i < 2; ++i)
38580 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
38581 vmode = V8HImode;
38582 goto do_subreg;
38584 case V4SImode:
38585 for (i = 0; i < 4; ++i)
38586 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38587 vmode = V8HImode;
38588 goto do_subreg;
38590 case V16QImode:
38591 /* See if bytes move in pairs so we can use pblendw with
38592 an immediate argument, rather than pblendvb with a vector
38593 argument. */
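/* E.g. { 0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31 }
   moves bytes in pairs, so it is handled below as a V8HImode pblendw
   with immediate mask 0xaa rather than a pblendvb with a constant
   vector. */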
38594 for (i = 0; i < 16; i += 2)
38595 if (d->perm[i] + 1 != d->perm[i + 1])
38597 use_pblendvb:
38598 for (i = 0; i < nelt; ++i)
38599 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
38601 finish_pblendvb:
38602 vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
38603 vperm = force_reg (vmode, vperm);
38605 if (GET_MODE_SIZE (vmode) == 16)
38606 emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
38607 else
38608 emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
38609 return true;
38612 for (i = 0; i < 8; ++i)
38613 mask |= (d->perm[i * 2] >= 16) << i;
38614 vmode = V8HImode;
38615 /* FALLTHRU */
38617 do_subreg:
38618 target = gen_lowpart (vmode, target);
38619 op0 = gen_lowpart (vmode, op0);
38620 op1 = gen_lowpart (vmode, op1);
38621 break;
38623 case V32QImode:
38624 /* See if bytes move in pairs. If not, vpblendvb must be used. */
38625 for (i = 0; i < 32; i += 2)
38626 if (d->perm[i] + 1 != d->perm[i + 1])
38627 goto use_pblendvb;
38628 /* See if bytes move in quadruplets. If yes, vpblendd
38629 with immediate can be used. */
38630 for (i = 0; i < 32; i += 4)
38631 if (d->perm[i] + 2 != d->perm[i + 2])
38632 break;
38633 if (i < 32)
38635 /* See if bytes move the same in both lanes. If yes,
38636 vpblendw with immediate can be used. */
38637 for (i = 0; i < 16; i += 2)
38638 if (d->perm[i] + 16 != d->perm[i + 16])
38639 goto use_pblendvb;
38641 /* Use vpblendw. */
38642 for (i = 0; i < 16; ++i)
38643 mask |= (d->perm[i * 2] >= 32) << i;
38644 vmode = V16HImode;
38645 goto do_subreg;
38648 /* Use vpblendd. */
38649 for (i = 0; i < 8; ++i)
38650 mask |= (d->perm[i * 4] >= 32) << i;
38651 vmode = V8SImode;
38652 goto do_subreg;
38654 case V16HImode:
38655 /* See if words move in pairs. If yes, vpblendd can be used. */
38656 for (i = 0; i < 16; i += 2)
38657 if (d->perm[i] + 1 != d->perm[i + 1])
38658 break;
38659 if (i < 16)
38661 /* See if words move the same in both lanes. If not,
38662 vpblendvb must be used. */
38663 for (i = 0; i < 8; i++)
38664 if (d->perm[i] + 8 != d->perm[i + 8])
38666 /* Use vpblendvb. */
38667 for (i = 0; i < 32; ++i)
38668 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
38670 vmode = V32QImode;
38671 nelt = 32;
38672 target = gen_lowpart (vmode, target);
38673 op0 = gen_lowpart (vmode, op0);
38674 op1 = gen_lowpart (vmode, op1);
38675 goto finish_pblendvb;
38678 /* Use vpblendw. */
38679 for (i = 0; i < 16; ++i)
38680 mask |= (d->perm[i] >= 16) << i;
38681 break;
38684 /* Use vpblendd. */
38685 for (i = 0; i < 8; ++i)
38686 mask |= (d->perm[i * 2] >= 16) << i;
38687 vmode = V8SImode;
38688 goto do_subreg;
38690 case V4DImode:
38691 /* Use vpblendd. */
38692 for (i = 0; i < 4; ++i)
38693 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
38694 vmode = V8SImode;
38695 goto do_subreg;
38697 default:
38698 gcc_unreachable ();
38701 /* This matches five different patterns with the different modes. */
38702 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
38703 x = gen_rtx_SET (VOIDmode, target, x);
38704 emit_insn (x);
38706 return true;
38709 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38710 in terms of the variable form of vpermilps.
38712 Note that we will have already failed the immediate input vpermilps,
38713 which requires that the high and low part shuffle be identical; the
38714 variable form doesn't require that. */
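/* For example, the V8SFmode permutation { 1, 0, 2, 3, 4, 5, 7, 6 }
   shuffles the two 128-bit lanes differently, so the immediate form
   (one 4-element selector applied to both lanes) cannot express it,
   while the variable control vector built here can. */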
38716 static bool
38717 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
38719 rtx rperm[8], vperm;
38720 unsigned i;
38722 if (!TARGET_AVX || d->vmode != V8SFmode || !d->one_operand_p)
38723 return false;
38725 /* We can only permute within the 128-bit lane. */
38726 for (i = 0; i < 8; ++i)
38728 unsigned e = d->perm[i];
38729 if (i < 4 ? e >= 4 : e < 4)
38730 return false;
38733 if (d->testing_p)
38734 return true;
38736 for (i = 0; i < 8; ++i)
38738 unsigned e = d->perm[i];
38740 /* Within each 128-bit lane, the elements of op0 are numbered
38741 from 0 and the elements of op1 are numbered from 4. */
38742 if (e >= 8 + 4)
38743 e -= 8;
38744 else if (e >= 4)
38745 e -= 4;
38747 rperm[i] = GEN_INT (e);
38750 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
38751 vperm = force_reg (V8SImode, vperm);
38752 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
38754 return true;
38757 /* Return true if permutation D can be performed as VMODE permutation
38758 instead. */
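/* For example, the V16QImode permutation { 4, 5, 6, 7, 0, 1, 2, 3,
   12, 13, 14, 15, 8, 9, 10, 11 } moves aligned 4-byte chunks, so it is
   also valid as the V4SImode permutation { 1, 0, 3, 2 }. */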
38760 static bool
38761 valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
38763 unsigned int i, j, chunk;
38765 if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
38766 || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
38767 || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
38768 return false;
38770 if (GET_MODE_NUNITS (vmode) >= d->nelt)
38771 return true;
38773 chunk = d->nelt / GET_MODE_NUNITS (vmode);
38774 for (i = 0; i < d->nelt; i += chunk)
38775 if (d->perm[i] & (chunk - 1))
38776 return false;
38777 else
38778 for (j = 1; j < chunk; ++j)
38779 if (d->perm[i] + j != d->perm[i + j])
38780 return false;
38782 return true;
38785 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
38786 in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
38788 static bool
38789 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
38791 unsigned i, nelt, eltsz, mask;
38792 unsigned char perm[32];
38793 enum machine_mode vmode = V16QImode;
38794 rtx rperm[32], vperm, target, op0, op1;
38796 nelt = d->nelt;
38798 if (!d->one_operand_p)
38800 if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
38802 if (TARGET_AVX2
38803 && valid_perm_using_mode_p (V2TImode, d))
38805 if (d->testing_p)
38806 return true;
38808 /* Use vperm2i128 insn. The pattern uses
38809 V4DImode instead of V2TImode. */
38810 target = gen_lowpart (V4DImode, d->target);
38811 op0 = gen_lowpart (V4DImode, d->op0);
38812 op1 = gen_lowpart (V4DImode, d->op1);
38813 rperm[0]
38814 = GEN_INT ((d->perm[0] / (nelt / 2))
38815 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
38816 emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
38817 return true;
38819 return false;
38822 else
38824 if (GET_MODE_SIZE (d->vmode) == 16)
38826 if (!TARGET_SSSE3)
38827 return false;
38829 else if (GET_MODE_SIZE (d->vmode) == 32)
38831 if (!TARGET_AVX2)
38832 return false;
38834 /* V4DImode should be already handled through
38835 expand_vselect by vpermq instruction. */
38836 gcc_assert (d->vmode != V4DImode);
38838 vmode = V32QImode;
38839 if (d->vmode == V8SImode
38840 || d->vmode == V16HImode
38841 || d->vmode == V32QImode)
38843 /* First see if vpermq can be used for
38844 V8SImode/V16HImode/V32QImode. */
38845 if (valid_perm_using_mode_p (V4DImode, d))
38847 for (i = 0; i < 4; i++)
38848 perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
38849 if (d->testing_p)
38850 return true;
38851 return expand_vselect (gen_lowpart (V4DImode, d->target),
38852 gen_lowpart (V4DImode, d->op0),
38853 perm, 4, false);
38856 /* Next see if vpermd can be used. */
38857 if (valid_perm_using_mode_p (V8SImode, d))
38858 vmode = V8SImode;
38860 /* Or if vpermps can be used. */
38861 else if (d->vmode == V8SFmode)
38862 vmode = V8SImode;
38864 if (vmode == V32QImode)
38866 /* vpshufb only works intra-lane; it is not
38867 possible to shuffle bytes between the lanes. */

38868 for (i = 0; i < nelt; ++i)
38869 if ((d->perm[i] ^ i) & (nelt / 2))
38870 return false;
38873 else
38874 return false;
38877 if (d->testing_p)
38878 return true;
38880 if (vmode == V8SImode)
38881 for (i = 0; i < 8; ++i)
38882 rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
38883 else
38885 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
38886 if (!d->one_operand_p)
38887 mask = 2 * nelt - 1;
38888 else if (vmode == V16QImode)
38889 mask = nelt - 1;
38890 else
38891 mask = nelt / 2 - 1;
38893 for (i = 0; i < nelt; ++i)
38895 unsigned j, e = d->perm[i] & mask;
38896 for (j = 0; j < eltsz; ++j)
38897 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
38901 vperm = gen_rtx_CONST_VECTOR (vmode,
38902 gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
38903 vperm = force_reg (vmode, vperm);
38905 target = gen_lowpart (vmode, d->target);
38906 op0 = gen_lowpart (vmode, d->op0);
38907 if (d->one_operand_p)
38909 if (vmode == V16QImode)
38910 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
38911 else if (vmode == V32QImode)
38912 emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
38913 else if (vmode == V8SFmode)
38914 emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
38915 else
38916 emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
38918 else
38920 op1 = gen_lowpart (vmode, d->op1);
38921 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
38924 return true;
38927 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
38928 in a single instruction. */
38930 static bool
38931 expand_vec_perm_1 (struct expand_vec_perm_d *d)
38933 unsigned i, nelt = d->nelt;
38934 unsigned char perm2[MAX_VECT_LEN];
38936 /* Check plain VEC_SELECT first, because AVX has instructions that could
38937 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
38938 input where SEL+CONCAT may not. */
38939 if (d->one_operand_p)
38941 int mask = nelt - 1;
38942 bool identity_perm = true;
38943 bool broadcast_perm = true;
38945 for (i = 0; i < nelt; i++)
38947 perm2[i] = d->perm[i] & mask;
38948 if (perm2[i] != i)
38949 identity_perm = false;
38950 if (perm2[i])
38951 broadcast_perm = false;
38954 if (identity_perm)
38956 if (!d->testing_p)
38957 emit_move_insn (d->target, d->op0);
38958 return true;
38960 else if (broadcast_perm && TARGET_AVX2)
38962 /* Use vpbroadcast{b,w,d}. */
38963 rtx (*gen) (rtx, rtx) = NULL;
38964 switch (d->vmode)
38966 case V32QImode:
38967 gen = gen_avx2_pbroadcastv32qi_1;
38968 break;
38969 case V16HImode:
38970 gen = gen_avx2_pbroadcastv16hi_1;
38971 break;
38972 case V8SImode:
38973 gen = gen_avx2_pbroadcastv8si_1;
38974 break;
38975 case V16QImode:
38976 gen = gen_avx2_pbroadcastv16qi;
38977 break;
38978 case V8HImode:
38979 gen = gen_avx2_pbroadcastv8hi;
38980 break;
38981 case V8SFmode:
38982 gen = gen_avx2_vec_dupv8sf_1;
38983 break;
38984 /* For other modes prefer other shuffles this function creates. */
38985 default: break;
38987 if (gen != NULL)
38989 if (!d->testing_p)
38990 emit_insn (gen (d->target, d->op0));
38991 return true;
38995 if (expand_vselect (d->target, d->op0, perm2, nelt, d->testing_p))
38996 return true;
38998 /* There are plenty of patterns in sse.md that are written for
38999 SEL+CONCAT and are not replicated for a single op. Perhaps
39000 that should be changed, to avoid the nastiness here. */
39002 /* Recognize interleave style patterns, which means incrementing
39003 every other permutation operand. */
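/* E.g. for V4SFmode the one-operand permutation { 0, 0, 1, 1 } becomes
   { 0, 4, 1, 5 } on the concatenation of op0 with itself, which matches
   unpcklps. */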
39004 for (i = 0; i < nelt; i += 2)
39006 perm2[i] = d->perm[i] & mask;
39007 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
39009 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
39010 d->testing_p))
39011 return true;
39013 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
39014 if (nelt >= 4)
39016 for (i = 0; i < nelt; i += 4)
39018 perm2[i + 0] = d->perm[i + 0] & mask;
39019 perm2[i + 1] = d->perm[i + 1] & mask;
39020 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
39021 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
39024 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt,
39025 d->testing_p))
39026 return true;
39030 /* Finally, try the fully general two operand permute. */
39031 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
39032 d->testing_p))
39033 return true;
39035 /* Recognize interleave style patterns with reversed operands. */
39036 if (!d->one_operand_p)
39038 for (i = 0; i < nelt; ++i)
39040 unsigned e = d->perm[i];
39041 if (e >= nelt)
39042 e -= nelt;
39043 else
39044 e += nelt;
39045 perm2[i] = e;
39048 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt,
39049 d->testing_p))
39050 return true;
39053 /* Try the SSE4.1 blend variable merge instructions. */
39054 if (expand_vec_perm_blend (d))
39055 return true;
39057 /* Try one of the AVX vpermil variable permutations. */
39058 if (expand_vec_perm_vpermil (d))
39059 return true;
39061 /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
39062 vpshufb, vpermd, vpermps or vpermq variable permutation. */
39063 if (expand_vec_perm_pshufb (d))
39064 return true;
39066 return false;
39069 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
39070 in terms of a pair of pshuflw + pshufhw instructions. */
39072 static bool
39073 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
39075 unsigned char perm2[MAX_VECT_LEN];
39076 unsigned i;
39077 bool ok;
39079 if (d->vmode != V8HImode || !d->one_operand_p)
39080 return false;
39082 /* The two permutations only operate in 64-bit lanes. */
39083 for (i = 0; i < 4; ++i)
39084 if (d->perm[i] >= 4)
39085 return false;
39086 for (i = 4; i < 8; ++i)
39087 if (d->perm[i] < 4)
39088 return false;
39090 if (d->testing_p)
39091 return true;
39093 /* Emit the pshuflw. */
39094 memcpy (perm2, d->perm, 4);
39095 for (i = 4; i < 8; ++i)
39096 perm2[i] = i;
39097 ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
39098 gcc_assert (ok);
39100 /* Emit the pshufhw. */
39101 memcpy (perm2 + 4, d->perm + 4, 4);
39102 for (i = 0; i < 4; ++i)
39103 perm2[i] = i;
39104 ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
39105 gcc_assert (ok);
39107 return true;
39110 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39111 the permutation using the SSSE3 palignr instruction. This succeeds
39112 when all of the elements in PERM fit within one vector and we merely
39113 need to shift them down so that a single vector permutation has a
39114 chance to succeed. */
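/* For example, the two-operand V8HImode permutation
   { 3, 4, 5, 6, 7, 8, 9, 10 } has min == 3 and max == 10; the palignr
   by 3 * 16 bits leaves exactly the wanted elements in one vector, and
   the remaining single-operand permutation is the identity. */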
39116 static bool
39117 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
39119 unsigned i, nelt = d->nelt;
39120 unsigned min, max;
39121 bool in_order, ok;
39122 rtx shift;
39124 /* Even with AVX, palignr only operates on 128-bit vectors. */
39125 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
39126 return false;
39128 min = nelt, max = 0;
39129 for (i = 0; i < nelt; ++i)
39131 unsigned e = d->perm[i];
39132 if (e < min)
39133 min = e;
39134 if (e > max)
39135 max = e;
39137 if (min == 0 || max - min >= nelt)
39138 return false;
39140 /* Given that we have SSSE3, we know we'll be able to implement the
39141 single operand permutation after the palignr with pshufb. */
39142 if (d->testing_p)
39143 return true;
39145 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
39146 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
39147 gen_lowpart (TImode, d->op1),
39148 gen_lowpart (TImode, d->op0), shift));
39150 d->op0 = d->op1 = d->target;
39151 d->one_operand_p = true;
39153 in_order = true;
39154 for (i = 0; i < nelt; ++i)
39156 unsigned e = d->perm[i] - min;
39157 if (e != i)
39158 in_order = false;
39159 d->perm[i] = e;
39162 /* Test for the degenerate case where the alignment by itself
39163 produces the desired permutation. */
39164 if (in_order)
39165 return true;
39167 ok = expand_vec_perm_1 (d);
39168 gcc_assert (ok);
39170 return ok;
39173 static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
39175 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39176 a two vector permutation into a single vector permutation by using
39177 an interleave operation to merge the vectors. */
39179 static bool
39180 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
39182 struct expand_vec_perm_d dremap, dfinal;
39183 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
39184 unsigned HOST_WIDE_INT contents;
39185 unsigned char remap[2 * MAX_VECT_LEN];
39186 rtx seq;
39187 bool ok, same_halves = false;
39189 if (GET_MODE_SIZE (d->vmode) == 16)
39191 if (d->one_operand_p)
39192 return false;
39194 else if (GET_MODE_SIZE (d->vmode) == 32)
39196 if (!TARGET_AVX)
39197 return false;
39198 /* For 32-byte modes allow even d->one_operand_p.
39199 The lack of cross-lane shuffling in some instructions
39200 might prevent a single insn shuffle. */
39201 dfinal = *d;
39202 dfinal.testing_p = true;
39203 /* If expand_vec_perm_interleave3 can expand this into
39204 a 3 insn sequence, give up and let it be expanded as
39205 a 3 insn sequence. While that is one insn longer,
39206 it doesn't need a memory operand, and in the common
39207 case that both the interleave low and interleave high
39208 permutations with the same operands are adjacent, the
39209 pair needs only 4 insns after CSE. */
39210 if (expand_vec_perm_interleave3 (&dfinal))
39211 return false;
39213 else
39214 return false;
39216 /* Examine from whence the elements come. */
39217 contents = 0;
39218 for (i = 0; i < nelt; ++i)
39219 contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
39221 memset (remap, 0xff, sizeof (remap));
39222 dremap = *d;
39224 if (GET_MODE_SIZE (d->vmode) == 16)
39226 unsigned HOST_WIDE_INT h1, h2, h3, h4;
39228 /* Split the two input vectors into 4 halves. */
39229 h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
39230 h2 = h1 << nelt2;
39231 h3 = h2 << nelt2;
39232 h4 = h3 << nelt2;
39234 /* If all the elements come from the low halves, use interleave low;
39235 similarly for interleave high. If the elements come from mismatched
39236 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
39237 if ((contents & (h1 | h3)) == contents)
39239 /* punpckl* */
39240 for (i = 0; i < nelt2; ++i)
39242 remap[i] = i * 2;
39243 remap[i + nelt] = i * 2 + 1;
39244 dremap.perm[i * 2] = i;
39245 dremap.perm[i * 2 + 1] = i + nelt;
39247 if (!TARGET_SSE2 && d->vmode == V4SImode)
39248 dremap.vmode = V4SFmode;
39250 else if ((contents & (h2 | h4)) == contents)
39252 /* punpckh* */
39253 for (i = 0; i < nelt2; ++i)
39255 remap[i + nelt2] = i * 2;
39256 remap[i + nelt + nelt2] = i * 2 + 1;
39257 dremap.perm[i * 2] = i + nelt2;
39258 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
39260 if (!TARGET_SSE2 && d->vmode == V4SImode)
39261 dremap.vmode = V4SFmode;
39263 else if ((contents & (h1 | h4)) == contents)
39265 /* shufps */
39266 for (i = 0; i < nelt2; ++i)
39268 remap[i] = i;
39269 remap[i + nelt + nelt2] = i + nelt2;
39270 dremap.perm[i] = i;
39271 dremap.perm[i + nelt2] = i + nelt + nelt2;
39273 if (nelt != 4)
39275 /* shufpd */
39276 dremap.vmode = V2DImode;
39277 dremap.nelt = 2;
39278 dremap.perm[0] = 0;
39279 dremap.perm[1] = 3;
39282 else if ((contents & (h2 | h3)) == contents)
39284 /* shufps */
39285 for (i = 0; i < nelt2; ++i)
39287 remap[i + nelt2] = i;
39288 remap[i + nelt] = i + nelt2;
39289 dremap.perm[i] = i + nelt2;
39290 dremap.perm[i + nelt2] = i + nelt;
39292 if (nelt != 4)
39294 /* shufpd */
39295 dremap.vmode = V2DImode;
39296 dremap.nelt = 2;
39297 dremap.perm[0] = 1;
39298 dremap.perm[1] = 2;
39301 else
39302 return false;
39304 else
39306 unsigned int nelt4 = nelt / 4, nzcnt = 0;
39307 unsigned HOST_WIDE_INT q[8];
39308 unsigned int nonzero_halves[4];
39310 /* Split the two input vectors into 8 quarters. */
39311 q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
39312 for (i = 1; i < 8; ++i)
39313 q[i] = q[0] << (nelt4 * i);
39314 for (i = 0; i < 4; ++i)
39315 if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
39317 nonzero_halves[nzcnt] = i;
39318 ++nzcnt;
39321 if (nzcnt == 1)
39323 gcc_assert (d->one_operand_p);
39324 nonzero_halves[1] = nonzero_halves[0];
39325 same_halves = true;
39327 else if (d->one_operand_p)
39329 gcc_assert (nonzero_halves[0] == 0);
39330 gcc_assert (nonzero_halves[1] == 1);
39333 if (nzcnt <= 2)
39335 if (d->perm[0] / nelt2 == nonzero_halves[1])
39337 /* Attempt to increase the likelihood that dfinal
39338 shuffle will be intra-lane. */
39339 char tmph = nonzero_halves[0];
39340 nonzero_halves[0] = nonzero_halves[1];
39341 nonzero_halves[1] = tmph;
39344 /* vperm2f128 or vperm2i128. */
39345 for (i = 0; i < nelt2; ++i)
39347 remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
39348 remap[i + nonzero_halves[0] * nelt2] = i;
39349 dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
39350 dremap.perm[i] = i + nonzero_halves[0] * nelt2;
39353 if (d->vmode != V8SFmode
39354 && d->vmode != V4DFmode
39355 && d->vmode != V8SImode)
39357 dremap.vmode = V8SImode;
39358 dremap.nelt = 8;
39359 for (i = 0; i < 4; ++i)
39361 dremap.perm[i] = i + nonzero_halves[0] * 4;
39362 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
39366 else if (d->one_operand_p)
39367 return false;
39368 else if (TARGET_AVX2
39369 && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
39371 /* vpunpckl* */
39372 for (i = 0; i < nelt4; ++i)
39374 remap[i] = i * 2;
39375 remap[i + nelt] = i * 2 + 1;
39376 remap[i + nelt2] = i * 2 + nelt2;
39377 remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
39378 dremap.perm[i * 2] = i;
39379 dremap.perm[i * 2 + 1] = i + nelt;
39380 dremap.perm[i * 2 + nelt2] = i + nelt2;
39381 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
39384 else if (TARGET_AVX2
39385 && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
39387 /* vpunpckh* */
39388 for (i = 0; i < nelt4; ++i)
39390 remap[i + nelt4] = i * 2;
39391 remap[i + nelt + nelt4] = i * 2 + 1;
39392 remap[i + nelt2 + nelt4] = i * 2 + nelt2;
39393 remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
39394 dremap.perm[i * 2] = i + nelt4;
39395 dremap.perm[i * 2 + 1] = i + nelt + nelt4;
39396 dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
39397 dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
39400 else
39401 return false;
39404 /* Use the remapping array set up above to move the elements from their
39405 swizzled locations into their final destinations. */
39406 dfinal = *d;
39407 for (i = 0; i < nelt; ++i)
39409 unsigned e = remap[d->perm[i]];
39410 gcc_assert (e < nelt);
39411 /* If same_halves is true, both halves of the remapped vector are the
39412 same. Avoid cross-lane accesses if possible. */
39413 if (same_halves && i >= nelt2)
39415 gcc_assert (e < nelt2);
39416 dfinal.perm[i] = e + nelt2;
39418 else
39419 dfinal.perm[i] = e;
39422 if (!d->testing_p)
39423 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
39424 dfinal.op1 = dfinal.op0;
39425 dfinal.one_operand_p = true;
39426 dremap.target = dfinal.op0;
39428 /* Test if the final remap can be done with a single insn. For V4SFmode or
39429 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
39430 start_sequence ();
39431 ok = expand_vec_perm_1 (&dfinal);
39432 seq = get_insns ();
39433 end_sequence ();
39435 if (!ok)
39436 return false;
39438 if (d->testing_p)
39439 return true;
39441 if (dremap.vmode != dfinal.vmode)
39443 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
39444 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
39445 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
39448 ok = expand_vec_perm_1 (&dremap);
39449 gcc_assert (ok);
39451 emit_insn (seq);
39452 return true;
39455 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39456 a single vector cross-lane permutation into vpermq followed
39457 by any of the single insn permutations. */
39459 static bool
39460 expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
39462 struct expand_vec_perm_d dremap, dfinal;
39463 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
39464 unsigned contents[2];
39465 bool ok;
39467 if (!(TARGET_AVX2
39468 && (d->vmode == V32QImode || d->vmode == V16HImode)
39469 && d->one_operand_p))
39470 return false;
39472 contents[0] = 0;
39473 contents[1] = 0;
39474 for (i = 0; i < nelt2; ++i)
39476 contents[0] |= 1u << (d->perm[i] / nelt4);
39477 contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
39480 for (i = 0; i < 2; ++i)
39482 unsigned int cnt = 0;
39483 for (j = 0; j < 4; ++j)
39484 if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
39485 return false;
39488 if (d->testing_p)
39489 return true;
39491 dremap = *d;
39492 dremap.vmode = V4DImode;
39493 dremap.nelt = 4;
39494 dremap.target = gen_reg_rtx (V4DImode);
39495 dremap.op0 = gen_lowpart (V4DImode, d->op0);
39496 dremap.op1 = dremap.op0;
39497 dremap.one_operand_p = true;
39498 for (i = 0; i < 2; ++i)
39500 unsigned int cnt = 0;
39501 for (j = 0; j < 4; ++j)
39502 if ((contents[i] & (1u << j)) != 0)
39503 dremap.perm[2 * i + cnt++] = j;
39504 for (; cnt < 2; ++cnt)
39505 dremap.perm[2 * i + cnt] = 0;
39508 dfinal = *d;
39509 dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
39510 dfinal.op1 = dfinal.op0;
39511 dfinal.one_operand_p = true;
39512 for (i = 0, j = 0; i < nelt; ++i)
39514 if (i == nelt2)
39515 j = 2;
39516 dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
39517 if ((d->perm[i] / nelt4) == dremap.perm[j])
39519 else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
39520 dfinal.perm[i] |= nelt4;
39521 else
39522 gcc_unreachable ();
39525 ok = expand_vec_perm_1 (&dremap);
39526 gcc_assert (ok);
39528 ok = expand_vec_perm_1 (&dfinal);
39529 gcc_assert (ok);
39531 return true;
39534 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
39535 a vector permutation using two instructions, vperm2f128 resp.
39536 vperm2i128 followed by any single in-lane permutation. */
39538 static bool
39539 expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
39541 struct expand_vec_perm_d dfirst, dsecond;
39542 unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
39543 bool ok;
39545 if (!TARGET_AVX
39546 || GET_MODE_SIZE (d->vmode) != 32
39547 || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
39548 return false;
39550 dsecond = *d;
39551 dsecond.one_operand_p = false;
39552 dsecond.testing_p = true;
39554 /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
39555 immediate. For perm < 16 the second permutation uses
39556 d->op0 as first operand, for perm >= 16 it uses d->op1
39557 as first operand. The second operand is the result of
39558 vperm2[fi]128. */
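/* As an illustration, perm == 2 encodes lane selectors 2 (low lane of
   d->op1) for the low result lane and 0 (low lane of d->op0) for the
   high result lane, i.e. immediate ((2 << 2) | 2) & 0x33 == 0x02; for
   V4DFmode operands { 0 1 2 3 } and { 4 5 6 7 } the vperm2f128 result
   is { 4 5 0 1 }. */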
39559 for (perm = 0; perm < 32; perm++)
39561 /* Ignore permutations which do not move anything cross-lane. */
39562 if (perm < 16)
39564 /* The second shuffle for e.g. V4DFmode has
39565 0123 and ABCD operands.
39566 Ignore AB23, as 23 is already in the second lane
39567 of the first operand. */
39568 if ((perm & 0xc) == (1 << 2)) continue;
39569 /* And 01CD, as 01 is in the first lane of the first
39570 operand. */
39571 if ((perm & 3) == 0) continue;
39572 /* And 4567, as then the vperm2[fi]128 doesn't change
39573 anything on the original 4567 second operand. */
39574 if ((perm & 0xf) == ((3 << 2) | 2)) continue;
39576 else
39578 /* The second shuffle for e.g. V4DFmode has
39579 4567 and ABCD operands.
39580 Ignore AB67, as 67 is already in the second lane
39581 of the first operand. */
39582 if ((perm & 0xc) == (3 << 2)) continue;
39583 /* And 45CD, as 45 is in the first lane of the first
39584 operand. */
39585 if ((perm & 3) == 2) continue;
39586 /* And 0123, as then the vperm2[fi]128 doesn't change
39587 anything on the original 0123 first operand. */
39588 if ((perm & 0xf) == (1 << 2)) continue;
39591 for (i = 0; i < nelt; i++)
39593 j = d->perm[i] / nelt2;
39594 if (j == ((perm >> (2 * (i >= nelt2))) & 3))
39595 dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
39596 else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
39597 dsecond.perm[i] = d->perm[i] & (nelt - 1);
39598 else
39599 break;
39602 if (i == nelt)
39604 start_sequence ();
39605 ok = expand_vec_perm_1 (&dsecond);
39606 end_sequence ();
39608 else
39609 ok = false;
39611 if (ok)
39613 if (d->testing_p)
39614 return true;
39616 /* Found a usable second shuffle. dfirst will be
39617 vperm2f128 on d->op0 and d->op1. */
39618 dsecond.testing_p = false;
39619 dfirst = *d;
39620 dfirst.target = gen_reg_rtx (d->vmode);
39621 for (i = 0; i < nelt; i++)
39622 dfirst.perm[i] = (i & (nelt2 - 1))
39623 + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
39625 ok = expand_vec_perm_1 (&dfirst);
39626 gcc_assert (ok);
39628 /* And dsecond is some single insn shuffle, taking
39629 d->op0 and result of vperm2f128 (if perm < 16) or
39630 d->op1 and result of vperm2f128 (otherwise). */
39631 dsecond.op1 = dfirst.target;
39632 if (perm >= 16)
39633 dsecond.op0 = dfirst.op1;
39635 ok = expand_vec_perm_1 (&dsecond);
39636 gcc_assert (ok);
39638 return true;
39641 /* For one operand, the only useful vperm2f128 permutation is 0x10. */
39642 if (d->one_operand_p)
39643 return false;
39646 return false;
39649 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
39650 a two vector permutation using 2 intra-lane interleave insns
39651 and cross-lane shuffle for 32-byte vectors. */
39653 static bool
39654 expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
39656 unsigned i, nelt;
39657 rtx (*gen) (rtx, rtx, rtx);
39659 if (d->one_operand_p)
39660 return false;
39661 if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
39663 else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
39665 else
39666 return false;
39668 nelt = d->nelt;
39669 if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
39670 return false;
39671 for (i = 0; i < nelt; i += 2)
39672 if (d->perm[i] != d->perm[0] + i / 2
39673 || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
39674 return false;
39676 if (d->testing_p)
39677 return true;
39679 switch (d->vmode)
39681 case V32QImode:
39682 if (d->perm[0])
39683 gen = gen_vec_interleave_highv32qi;
39684 else
39685 gen = gen_vec_interleave_lowv32qi;
39686 break;
39687 case V16HImode:
39688 if (d->perm[0])
39689 gen = gen_vec_interleave_highv16hi;
39690 else
39691 gen = gen_vec_interleave_lowv16hi;
39692 break;
39693 case V8SImode:
39694 if (d->perm[0])
39695 gen = gen_vec_interleave_highv8si;
39696 else
39697 gen = gen_vec_interleave_lowv8si;
39698 break;
39699 case V4DImode:
39700 if (d->perm[0])
39701 gen = gen_vec_interleave_highv4di;
39702 else
39703 gen = gen_vec_interleave_lowv4di;
39704 break;
39705 case V8SFmode:
39706 if (d->perm[0])
39707 gen = gen_vec_interleave_highv8sf;
39708 else
39709 gen = gen_vec_interleave_lowv8sf;
39710 break;
39711 case V4DFmode:
39712 if (d->perm[0])
39713 gen = gen_vec_interleave_highv4df;
39714 else
39715 gen = gen_vec_interleave_lowv4df;
39716 break;
39717 default:
39718 gcc_unreachable ();
39721 emit_insn (gen (d->target, d->op0, d->op1));
39722 return true;
39725 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
39726 a single vector permutation using a single intra-lane vector
39727 permutation, vperm2f128 swapping the lanes and vblend* insn blending
39728 the non-swapped and swapped vectors together. */
39730 static bool
39731 expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
39733 struct expand_vec_perm_d dfirst, dsecond;
39734 unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
39735 rtx seq;
39736 bool ok;
39737 rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
39739 if (!TARGET_AVX
39740 || TARGET_AVX2
39741 || (d->vmode != V8SFmode && d->vmode != V4DFmode)
39742 || !d->one_operand_p)
39743 return false;
39745 dfirst = *d;
39746 for (i = 0; i < nelt; i++)
39747 dfirst.perm[i] = 0xff;
39748 for (i = 0, msk = 0; i < nelt; i++)
39750 j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
39751 if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
39752 return false;
39753 dfirst.perm[j] = d->perm[i];
39754 if (j != i)
39755 msk |= (1 << i);
39757 for (i = 0; i < nelt; i++)
39758 if (dfirst.perm[i] == 0xff)
39759 dfirst.perm[i] = i;
39761 if (!d->testing_p)
39762 dfirst.target = gen_reg_rtx (dfirst.vmode);
39764 start_sequence ();
39765 ok = expand_vec_perm_1 (&dfirst);
39766 seq = get_insns ();
39767 end_sequence ();
39769 if (!ok)
39770 return false;
39772 if (d->testing_p)
39773 return true;
39775 emit_insn (seq);
39777 dsecond = *d;
39778 dsecond.op0 = dfirst.target;
39779 dsecond.op1 = dfirst.target;
39780 dsecond.one_operand_p = true;
39781 dsecond.target = gen_reg_rtx (dsecond.vmode);
39782 for (i = 0; i < nelt; i++)
39783 dsecond.perm[i] = i ^ nelt2;
39785 ok = expand_vec_perm_1 (&dsecond);
39786 gcc_assert (ok);
39788 blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
39789 emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
39790 return true;
39793 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
39794 permutation using two vperm2f128, followed by a vshufpd insn blending
39795 the two vectors together. */
39797 static bool
39798 expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
39800 struct expand_vec_perm_d dfirst, dsecond, dthird;
39801 bool ok;
39803 if (!TARGET_AVX || (d->vmode != V4DFmode))
39804 return false;
39806 if (d->testing_p)
39807 return true;
39809 dfirst = *d;
39810 dsecond = *d;
39811 dthird = *d;
39813 dfirst.perm[0] = (d->perm[0] & ~1);
39814 dfirst.perm[1] = (d->perm[0] & ~1) + 1;
39815 dfirst.perm[2] = (d->perm[2] & ~1);
39816 dfirst.perm[3] = (d->perm[2] & ~1) + 1;
39817 dsecond.perm[0] = (d->perm[1] & ~1);
39818 dsecond.perm[1] = (d->perm[1] & ~1) + 1;
39819 dsecond.perm[2] = (d->perm[3] & ~1);
39820 dsecond.perm[3] = (d->perm[3] & ~1) + 1;
39821 dthird.perm[0] = (d->perm[0] % 2);
39822 dthird.perm[1] = (d->perm[1] % 2) + 4;
39823 dthird.perm[2] = (d->perm[2] % 2) + 2;
39824 dthird.perm[3] = (d->perm[3] % 2) + 6;
39826 dfirst.target = gen_reg_rtx (dfirst.vmode);
39827 dsecond.target = gen_reg_rtx (dsecond.vmode);
39828 dthird.op0 = dfirst.target;
39829 dthird.op1 = dsecond.target;
39830 dthird.one_operand_p = false;
39832 canonicalize_perm (&dfirst);
39833 canonicalize_perm (&dsecond);
39835 ok = expand_vec_perm_1 (&dfirst)
39836 && expand_vec_perm_1 (&dsecond)
39837 && expand_vec_perm_1 (&dthird);
39839 gcc_assert (ok);
39841 return true;
39844 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
39845 permutation with two pshufb insns and an ior. We should have already
39846 failed all two instruction sequences. */
39848 static bool
39849 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
39851 rtx rperm[2][16], vperm, l, h, op, m128;
39852 unsigned int i, nelt, eltsz;
39854 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
39855 return false;
39856 gcc_assert (!d->one_operand_p);
39858 if (d->testing_p)
39859 return true;
39861 nelt = d->nelt;
39862 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39864 /* Generate two permutation masks. If the required element is within
39865 the given vector it is shuffled into the proper lane. If the required
39866 element is in the other vector, force a zero into the lane by setting
39867 bit 7 in the permutation mask. */
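/* For instance, for the V8HImode extract-even permutation
   { 0, 2, 4, 6, 8, 10, 12, 14 } the first mask selects the bytes of
   elements 0, 2, 4, 6 of op0 and is -128 elsewhere, while the second
   mask is -128 there and selects elements 0, 2, 4, 6 of op1 in the
   remaining positions; the final ior merges the two pshufb results. */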
39868 m128 = GEN_INT (-128);
39869 for (i = 0; i < nelt; ++i)
39871 unsigned j, e = d->perm[i];
39872 unsigned which = (e >= nelt);
39873 if (e >= nelt)
39874 e -= nelt;
39876 for (j = 0; j < eltsz; ++j)
39878 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
39879 rperm[1-which][i*eltsz + j] = m128;
39883 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
39884 vperm = force_reg (V16QImode, vperm);
39886 l = gen_reg_rtx (V16QImode);
39887 op = gen_lowpart (V16QImode, d->op0);
39888 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
39890 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
39891 vperm = force_reg (V16QImode, vperm);
39893 h = gen_reg_rtx (V16QImode);
39894 op = gen_lowpart (V16QImode, d->op1);
39895 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
39897 op = gen_lowpart (V16QImode, d->target);
39898 emit_insn (gen_iorv16qi3 (op, l, h));
39900 return true;
39903 /* Implement arbitrary permutation of one V32QImode or V16HImode operand
39904 with two vpshufb insns, vpermq and vpor. We should have already failed
39905 all two or three instruction sequences. */
39907 static bool
39908 expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
39910 rtx rperm[2][32], vperm, l, h, hp, op, m128;
39911 unsigned int i, nelt, eltsz;
39913 if (!TARGET_AVX2
39914 || !d->one_operand_p
39915 || (d->vmode != V32QImode && d->vmode != V16HImode))
39916 return false;
39918 if (d->testing_p)
39919 return true;
39921 nelt = d->nelt;
39922 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39924 /* Generate two permutation masks. If the required element is within
39925 the same lane, it is shuffled in. If the required element is from the
39926 other lane, force a zero by setting bit 7 in the permutation mask.
39927 The other mask has non-negative entries for elements that are
39928 requested from the other lane, but those entries are also moved to
39929 the other lane, so that the result of vpshufb can have its two
39930 V2TImode halves swapped. */
39931 m128 = GEN_INT (-128);
39932 for (i = 0; i < nelt; ++i)
39934 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
39935 unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
39937 for (j = 0; j < eltsz; ++j)
39939 rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
39940 rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
39944 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
39945 vperm = force_reg (V32QImode, vperm);
39947 h = gen_reg_rtx (V32QImode);
39948 op = gen_lowpart (V32QImode, d->op0);
39949 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
39951 /* Swap the 128-bit lanes of h into hp. */
39952 hp = gen_reg_rtx (V4DImode);
39953 op = gen_lowpart (V4DImode, h);
39954 emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
39955 const1_rtx));
39957 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
39958 vperm = force_reg (V32QImode, vperm);
39960 l = gen_reg_rtx (V32QImode);
39961 op = gen_lowpart (V32QImode, d->op0);
39962 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
39964 op = gen_lowpart (V32QImode, d->target);
39965 emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
39967 return true;
39970 /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
39971 and extract-odd permutations of two V32QImode or V16HImode operands
39972 with two vpshufb insns, vpor and vpermq. We should have already
39973 failed all two or three instruction sequences. */
39975 static bool
39976 expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
39978 rtx rperm[2][32], vperm, l, h, ior, op, m128;
39979 unsigned int i, nelt, eltsz;
39981 if (!TARGET_AVX2
39982 || d->one_operand_p
39983 || (d->vmode != V32QImode && d->vmode != V16HImode))
39984 return false;
39986 for (i = 0; i < d->nelt; ++i)
39987 if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
39988 return false;
39990 if (d->testing_p)
39991 return true;
39993 nelt = d->nelt;
39994 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
39996 /* Generate two permutation masks. In the first mask the first
39997 quarter contains indexes into the first half of op0, the second
39998 quarter has bit 7 set, the third quarter contains indexes into the
39999 second half of op0, and the last quarter has bit 7 set. In the
40000 second mask the first quarter has bit 7 set, the second quarter
40001 contains indexes into the first half of op1, the third quarter has
40002 bit 7 set, and the last quarter contains indexes into the second
40003 half of op1.
40004 I.e. the first mask e.g. for V32QImode extract even will be:
40005 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
40006 (all values masked with 0xf except for -128) and second mask
40007 for extract even will be
40008 -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
40009 m128 = GEN_INT (-128);
40010 for (i = 0; i < nelt; ++i)
40012 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40013 unsigned which = d->perm[i] >= nelt;
40014 unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
40016 for (j = 0; j < eltsz; ++j)
40018 rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
40019 rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
40023 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
40024 vperm = force_reg (V32QImode, vperm);
40026 l = gen_reg_rtx (V32QImode);
40027 op = gen_lowpart (V32QImode, d->op0);
40028 emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
40030 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
40031 vperm = force_reg (V32QImode, vperm);
40033 h = gen_reg_rtx (V32QImode);
40034 op = gen_lowpart (V32QImode, d->op1);
40035 emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
40037 ior = gen_reg_rtx (V32QImode);
40038 emit_insn (gen_iorv32qi3 (ior, l, h));
40040 /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
40041 op = gen_lowpart (V4DImode, d->target);
40042 ior = gen_lowpart (V4DImode, ior);
40043 emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
40044 const1_rtx, GEN_INT (3)));
40046 return true;
40049 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
40050 and extract-odd permutations. */
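/* E.g. for V4DFmode the extract-even permutation is { 0, 2, 4, 6 } and
   the extract-odd permutation is { 1, 3, 5, 7 }. */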
40052 static bool
40053 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
40055 rtx t1, t2, t3;
40057 switch (d->vmode)
40059 case V4DFmode:
40060 if (d->testing_p)
40061 break;
40062 t1 = gen_reg_rtx (V4DFmode);
40063 t2 = gen_reg_rtx (V4DFmode);
40065 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
40066 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
40067 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
40069 /* Now an unpck[lh]pd will produce the result required. */
40070 if (odd)
40071 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
40072 else
40073 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
40074 emit_insn (t3);
40075 break;
40077 case V8SFmode:
40079 int mask = odd ? 0xdd : 0x88;
40081 if (d->testing_p)
40082 break;
40083 t1 = gen_reg_rtx (V8SFmode);
40084 t2 = gen_reg_rtx (V8SFmode);
40085 t3 = gen_reg_rtx (V8SFmode);
40087 /* Shuffle within the 128-bit lanes to produce:
40088 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
40089 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
40090 GEN_INT (mask)));
40092 /* Shuffle the lanes around to produce:
40093 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
40094 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
40095 GEN_INT (0x3)));
40097 /* Shuffle within the 128-bit lanes to produce:
40098 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
40099 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
40101 /* Shuffle within the 128-bit lanes to produce:
40102 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
40103 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
40105 /* Shuffle the lanes around to produce:
40106 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
40107 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
40108 GEN_INT (0x20)));
40110 break;
40112 case V2DFmode:
40113 case V4SFmode:
40114 case V2DImode:
40115 case V4SImode:
40116 /* These are always directly implementable by expand_vec_perm_1. */
40117 gcc_unreachable ();
40119 case V8HImode:
40120 if (TARGET_SSSE3)
40121 return expand_vec_perm_pshufb2 (d);
40122 else
40124 if (d->testing_p)
40125 break;
40126 /* We need 2*log2(N)-1 operations to achieve odd/even
40127 with interleave. */
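/* For V8HImode that is 2 * 3 - 1 = 5 interleave insns, which is what
   the sequence below emits. */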
40128 t1 = gen_reg_rtx (V8HImode);
40129 t2 = gen_reg_rtx (V8HImode);
40130 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
40131 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
40132 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
40133 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
40134 if (odd)
40135 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
40136 else
40137 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
40138 emit_insn (t3);
40140 break;
40142 case V16QImode:
40143 if (TARGET_SSSE3)
40144 return expand_vec_perm_pshufb2 (d);
40145 else
40147 if (d->testing_p)
40148 break;
40149 t1 = gen_reg_rtx (V16QImode);
40150 t2 = gen_reg_rtx (V16QImode);
40151 t3 = gen_reg_rtx (V16QImode);
40152 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
40153 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
40154 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
40155 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
40156 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
40157 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
40158 if (odd)
40159 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
40160 else
40161 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
40162 emit_insn (t3);
40164 break;
40166 case V16HImode:
40167 case V32QImode:
40168 return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
40170 case V4DImode:
40171 if (!TARGET_AVX2)
40173 struct expand_vec_perm_d d_copy = *d;
40174 d_copy.vmode = V4DFmode;
40175 d_copy.target = gen_lowpart (V4DFmode, d->target);
40176 d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
40177 d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
40178 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40181 if (d->testing_p)
40182 break;
40184 t1 = gen_reg_rtx (V4DImode);
40185 t2 = gen_reg_rtx (V4DImode);
40187 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
40188 emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
40189 emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
40191 /* Now an vpunpck[lh]qdq will produce the result required. */
40192 if (odd)
40193 t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
40194 else
40195 t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
40196 emit_insn (t3);
40197 break;
40199 case V8SImode:
40200 if (!TARGET_AVX2)
40202 struct expand_vec_perm_d d_copy = *d;
40203 d_copy.vmode = V8SFmode;
40204 d_copy.target = gen_lowpart (V8SFmode, d->target);
40205 d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
40206 d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
40207 return expand_vec_perm_even_odd_1 (&d_copy, odd);
40210 if (d->testing_p)
40211 break;
40213 t1 = gen_reg_rtx (V8SImode);
40214 t2 = gen_reg_rtx (V8SImode);
40216 /* Shuffle the lanes around into
40217 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */
40218 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
40219 gen_lowpart (V4DImode, d->op0),
40220 gen_lowpart (V4DImode, d->op1),
40221 GEN_INT (0x20)));
40222 emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
40223 gen_lowpart (V4DImode, d->op0),
40224 gen_lowpart (V4DImode, d->op1),
40225 GEN_INT (0x31)));
40227 /* Swap the 2nd and 3rd position in each lane into
40228 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */
40229 emit_insn (gen_avx2_pshufdv3 (t1, t1,
40230 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40231 emit_insn (gen_avx2_pshufdv3 (t2, t2,
40232 GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
40234 /* Now an vpunpck[lh]qdq will produce
40235 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */
40236 if (odd)
40237 t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
40238 gen_lowpart (V4DImode, t1),
40239 gen_lowpart (V4DImode, t2));
40240 else
40241 t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
40242 gen_lowpart (V4DImode, t1),
40243 gen_lowpart (V4DImode, t2));
40244 emit_insn (t3);
40245 break;
40247 default:
40248 gcc_unreachable ();
40251 return true;
40254 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
40255 extract-even and extract-odd permutations. */
40257 static bool
40258 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
40260 unsigned i, odd, nelt = d->nelt;
40262 odd = d->perm[0];
40263 if (odd != 0 && odd != 1)
40264 return false;
40266 for (i = 1; i < nelt; ++i)
40267 if (d->perm[i] != 2 * i + odd)
40268 return false;
40270 return expand_vec_perm_even_odd_1 (d, odd);
40273 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
40274 permutations. We assume that expand_vec_perm_1 has already failed. */
40276 static bool
40277 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
40279 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
40280 enum machine_mode vmode = d->vmode;
40281 unsigned char perm2[4];
40282 rtx op0 = d->op0;
40283 bool ok;
40285 switch (vmode)
40287 case V4DFmode:
40288 case V8SFmode:
40289 /* These are special-cased in sse.md so that we can optionally
40290 use the vbroadcast instruction. They expand to two insns
40291 if the input happens to be in a register. */
40292 gcc_unreachable ();
40294 case V2DFmode:
40295 case V2DImode:
40296 case V4SFmode:
40297 case V4SImode:
40298 /* These are always implementable using standard shuffle patterns. */
40299 gcc_unreachable ();
40301 case V8HImode:
40302 case V16QImode:
40303 /* These can be implemented via interleave. We save one insn by
40304 stopping once we have promoted to V4SImode and then use pshufd. */
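/* E.g. broadcasting element 5 of a V16QImode vector: a low byte
   interleave followed by a high word interleave leaves copies of that
   byte in V4SImode element 1, which the final pshufd broadcasts. */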
40305 if (d->testing_p)
40306 return true;
40309 rtx dest;
40310 rtx (*gen) (rtx, rtx, rtx)
40311 = vmode == V16QImode ? gen_vec_interleave_lowv16qi
40312 : gen_vec_interleave_lowv8hi;
40314 if (elt >= nelt2)
40316 gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
40317 : gen_vec_interleave_highv8hi;
40318 elt -= nelt2;
40320 nelt2 /= 2;
40322 dest = gen_reg_rtx (vmode);
40323 emit_insn (gen (dest, op0, op0));
40324 vmode = get_mode_wider_vector (vmode);
40325 op0 = gen_lowpart (vmode, dest);
40327 while (vmode != V4SImode);
40329 memset (perm2, elt, 4);
40330 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4,
40331 d->testing_p);
40332 gcc_assert (ok);
40333 return true;
40335 case V32QImode:
40336 case V16HImode:
40337 case V8SImode:
40338 case V4DImode:
40339 /* For AVX2 broadcasts of the first element vpbroadcast* or
40340 vpermq should be used by expand_vec_perm_1. */
40341 gcc_assert (!TARGET_AVX2 || d->perm[0]);
40342 return false;
40344 default:
40345 gcc_unreachable ();
40349 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
40350 broadcast permutations. */
40352 static bool
40353 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
40355 unsigned i, elt, nelt = d->nelt;
40357 if (!d->one_operand_p)
40358 return false;
40360 elt = d->perm[0];
40361 for (i = 1; i < nelt; ++i)
40362 if (d->perm[i] != elt)
40363 return false;
40365 return expand_vec_perm_broadcast_1 (d);
40368 /* Implement arbitrary permutation of two V32QImode or V16HImode operands
40369 with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
40370 all the shorter instruction sequences. */
40372 static bool
40373 expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
40375 rtx rperm[4][32], vperm, l[2], h[2], op, m128;
40376 unsigned int i, nelt, eltsz;
40377 bool used[4];
40379 if (!TARGET_AVX2
40380 || d->one_operand_p
40381 || (d->vmode != V32QImode && d->vmode != V16HImode))
40382 return false;
40384 if (d->testing_p)
40385 return true;
40387 nelt = d->nelt;
40388 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
40390 /* Generate 4 permutation masks. If the required element is within
40391 the same lane, it is shuffled in. If the required element is from the
40392 other lane, force a zero by setting bit 7 in the permutation mask.
40393 The other masks have non-negative entries for elements that are
40394 requested from the other lane, but those entries are also moved to
40395 the other lane, so that the result of vpshufb can have its two
40396 V2TImode halves swapped. */
40397 m128 = GEN_INT (-128);
40398 for (i = 0; i < 32; ++i)
40400 rperm[0][i] = m128;
40401 rperm[1][i] = m128;
40402 rperm[2][i] = m128;
40403 rperm[3][i] = m128;
40405 used[0] = false;
40406 used[1] = false;
40407 used[2] = false;
40408 used[3] = false;
40409 for (i = 0; i < nelt; ++i)
40411 unsigned j, e = d->perm[i] & (nelt / 2 - 1);
40412 unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
40413 unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
40415 for (j = 0; j < eltsz; ++j)
40416 rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
40417 used[which] = true;
40420 for (i = 0; i < 2; ++i)
40422 if (!used[2 * i + 1])
40424 h[i] = NULL_RTX;
40425 continue;
40427 vperm = gen_rtx_CONST_VECTOR (V32QImode,
40428 gen_rtvec_v (32, rperm[2 * i + 1]));
40429 vperm = force_reg (V32QImode, vperm);
40430 h[i] = gen_reg_rtx (V32QImode);
40431 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40432 emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
40435 /* Swap the 128-bit lanes of h[X]. */
40436 for (i = 0; i < 2; ++i)
40438 if (h[i] == NULL_RTX)
40439 continue;
40440 op = gen_reg_rtx (V4DImode);
40441 emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
40442 const2_rtx, GEN_INT (3), const0_rtx,
40443 const1_rtx));
40444 h[i] = gen_lowpart (V32QImode, op);
40447 for (i = 0; i < 2; ++i)
40449 if (!used[2 * i])
40451 l[i] = NULL_RTX;
40452 continue;
40454 vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
40455 vperm = force_reg (V32QImode, vperm);
40456 l[i] = gen_reg_rtx (V32QImode);
40457 op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
40458 emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
40461 for (i = 0; i < 2; ++i)
40463 if (h[i] && l[i])
40465 op = gen_reg_rtx (V32QImode);
40466 emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
40467 l[i] = op;
40469 else if (h[i])
40470 l[i] = h[i];
40473 gcc_assert (l[0] && l[1]);
40474 op = gen_lowpart (V32QImode, d->target);
40475 emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
40476 return true;
40479 /* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
40480 With all of the interface bits taken care of, perform the expansion
40481 in D and return true on success. */
40483 static bool
40484 ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
40486 /* Try a single instruction expansion. */
40487 if (expand_vec_perm_1 (d))
40488 return true;
40490 /* Try sequences of two instructions. */
40492 if (expand_vec_perm_pshuflw_pshufhw (d))
40493 return true;
40495 if (expand_vec_perm_palignr (d))
40496 return true;
40498 if (expand_vec_perm_interleave2 (d))
40499 return true;
40501 if (expand_vec_perm_broadcast (d))
40502 return true;
40504 if (expand_vec_perm_vpermq_perm_1 (d))
40505 return true;
40507 if (expand_vec_perm_vperm2f128 (d))
40508 return true;
40510 /* Try sequences of three instructions. */
40512 if (expand_vec_perm_2vperm2f128_vshuf (d))
40513 return true;
40515 if (expand_vec_perm_pshufb2 (d))
40516 return true;
40518 if (expand_vec_perm_interleave3 (d))
40519 return true;
40521 if (expand_vec_perm_vperm2f128_vblend (d))
40522 return true;
40524 /* Try sequences of four instructions. */
40526 if (expand_vec_perm_vpshufb2_vpermq (d))
40527 return true;
40529 if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
40530 return true;
40532 /* ??? Look for narrow permutations whose element orderings would
40533 allow the promotion to a wider mode. */
40535 /* ??? Look for sequences of interleave or a wider permute that place
40536 the data into the correct lanes for a half-vector shuffle like
40537 pshuf[lh]w or vpermilps. */
40539 /* ??? Look for sequences of interleave that produce the desired results.
40540 The combinatorics of punpck[lh] get pretty ugly... */
40542 if (expand_vec_perm_even_odd (d))
40543 return true;
40545 /* Even longer sequences. */
40546 if (expand_vec_perm_vpshufb4_vpermq2 (d))
40547 return true;
40549 return false;
40552 /* If a permutation only uses one operand, make it clear. Returns true
40553 if the permutation references both operands. */
40555 static bool
40556 canonicalize_perm (struct expand_vec_perm_d *d)
40558 int i, which, nelt = d->nelt;
40560 for (i = which = 0; i < nelt; ++i)
40561 which |= (d->perm[i] < nelt ? 1 : 2);
40563 d->one_operand_p = true;
40564 switch (which)
40566 default:
40567 gcc_unreachable();
40569 case 3:
40570 if (!rtx_equal_p (d->op0, d->op1))
40572 d->one_operand_p = false;
40573 break;
40575 /* The elements of PERM do not suggest that only the first operand
40576 is used, but both operands are identical. Allow easier matching
40577 of the permutation by folding the permutation into the single
40578 input vector. */
40579 /* FALLTHRU */
40581 case 2:
40582 for (i = 0; i < nelt; ++i)
40583 d->perm[i] &= nelt - 1;
40584 d->op0 = d->op1;
40585 break;
40587 case 1:
40588 d->op1 = d->op0;
40589 break;
40592 return (which == 3);
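/* Illustrative standalone sketch, guarded out and not part of GCC: a
   scalar model of the selector folding done by canonicalize_perm.  When
   every selector element points into the second vector, masking with
   nelt - 1 rebases the indices onto a single input.  All names below
   (fold_selector, sel) are hypothetical.  */
#if 0
#include <assert.h>

static void
fold_selector (unsigned char *perm, int nelt)
{
  int i, which = 0;
  for (i = 0; i < nelt; ++i)
    which |= (perm[i] < nelt ? 1 : 2);
  if (which == 2)
    /* Only the second operand is referenced; fold onto the first.  */
    for (i = 0; i < nelt; ++i)
      perm[i] &= nelt - 1;
}

int
main (void)
{
  unsigned char sel[4] = { 5, 7, 4, 6 };  /* all indices >= nelt == 4 */
  fold_selector (sel, 4);
  assert (sel[0] == 1 && sel[1] == 3 && sel[2] == 0 && sel[3] == 2);
  return 0;
}
#endif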
40595 bool
40596 ix86_expand_vec_perm_const (rtx operands[4])
40598 struct expand_vec_perm_d d;
40599 unsigned char perm[MAX_VECT_LEN];
40600 int i, nelt;
40601 bool two_args;
40602 rtx sel;
40604 d.target = operands[0];
40605 d.op0 = operands[1];
40606 d.op1 = operands[2];
40607 sel = operands[3];
40609 d.vmode = GET_MODE (d.target);
40610 gcc_assert (VECTOR_MODE_P (d.vmode));
40611 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40612 d.testing_p = false;
40614 gcc_assert (GET_CODE (sel) == CONST_VECTOR);
40615 gcc_assert (XVECLEN (sel, 0) == nelt);
40616 gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
40618 for (i = 0; i < nelt; ++i)
40620 rtx e = XVECEXP (sel, 0, i);
40621 int ei = INTVAL (e) & (2 * nelt - 1);
40622 d.perm[i] = ei;
40623 perm[i] = ei;
40626 two_args = canonicalize_perm (&d);
40628 if (ix86_expand_vec_perm_const_1 (&d))
40629 return true;
40631 /* If the selector says both arguments are needed, but the operands are the
40632 same, the above tried to expand with one_operand_p and flattened selector.
40633 If that didn't work, retry without one_operand_p; we succeeded with that
40634 during testing. */
40635 if (two_args && d.one_operand_p)
40637 d.one_operand_p = false;
40638 memcpy (d.perm, perm, sizeof (perm));
40639 return ix86_expand_vec_perm_const_1 (&d);
40642 return false;
40645 /* Implement targetm.vectorize.vec_perm_const_ok. */
40647 static bool
40648 ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
40649 const unsigned char *sel)
40651 struct expand_vec_perm_d d;
40652 unsigned int i, nelt, which;
40653 bool ret;
40655 d.vmode = vmode;
40656 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40657 d.testing_p = true;
40659 /* Given sufficient ISA support we can just return true here
40660 for selected vector modes. */
40661 if (GET_MODE_SIZE (d.vmode) == 16)
40663 /* All implementable with a single vpperm insn. */
40664 if (TARGET_XOP)
40665 return true;
40666 /* All implementable with 2 pshufb + 1 ior. */
40667 if (TARGET_SSSE3)
40668 return true;
40669 /* All implementable with shufpd or unpck[lh]pd. */
40670 if (d.nelt == 2)
40671 return true;
40674 /* Extract the values from the vector CST into the permutation
40675 array in D. */
40676 memcpy (d.perm, sel, nelt);
40677 for (i = which = 0; i < nelt; ++i)
40679 unsigned char e = d.perm[i];
40680 gcc_assert (e < 2 * nelt);
40681 which |= (e < nelt ? 1 : 2);
40684 /* For all elements from second vector, fold the elements to first. */
40685 if (which == 2)
40686 for (i = 0; i < nelt; ++i)
40687 d.perm[i] -= nelt;
40689 /* Check whether the mask can be applied to the vector type. */
40690 d.one_operand_p = (which != 3);
40692 /* Implementable with shufps or pshufd. */
40693 if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V4SImode))
40694 return true;
40696 /* Otherwise we have to go through the motions and see if we can
40697 figure out how to generate the requested permutation. */
40698 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
40699 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
40700 if (!d.one_operand_p)
40701 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
40703 start_sequence ();
40704 ret = ix86_expand_vec_perm_const_1 (&d);
40705 end_sequence ();
40707 return ret;
40710 void
40711 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
40713 struct expand_vec_perm_d d;
40714 unsigned i, nelt;
40716 d.target = targ;
40717 d.op0 = op0;
40718 d.op1 = op1;
40719 d.vmode = GET_MODE (targ);
40720 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40721 d.one_operand_p = false;
40722 d.testing_p = false;
40724 for (i = 0; i < nelt; ++i)
40725 d.perm[i] = i * 2 + odd;
40727 /* We'll either be able to implement the permutation directly... */
40728 if (expand_vec_perm_1 (&d))
40729 return;
40731 /* ... or we use the special-case patterns. */
40732 expand_vec_perm_even_odd_1 (&d, odd);
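/* Illustrative standalone sketch, guarded out and not part of GCC: the
   even/odd extraction selector built above.  If op0 = { x0 y0 x1 y1 } and
   op1 = { x2 y2 x3 y3 }, the even selector 0 2 4 6 collects { x0 x1 x2 x3 }
   and the odd selector 1 3 5 7 collects { y0 y1 y2 y3 }.  The helper name
   is hypothetical.  */
#if 0
static void
build_even_odd_selector (unsigned char *perm, unsigned nelt, unsigned odd)
{
  unsigned i;
  for (i = 0; i < nelt; ++i)
    perm[i] = i * 2 + odd;   /* nelt == 4: 0 2 4 6 (even) or 1 3 5 7 (odd) */
}
#endif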
40735 static void
40736 ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
40738 struct expand_vec_perm_d d;
40739 unsigned i, nelt, base;
40740 bool ok;
40742 d.target = targ;
40743 d.op0 = op0;
40744 d.op1 = op1;
40745 d.vmode = GET_MODE (targ);
40746 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
40747 d.one_operand_p = false;
40748 d.testing_p = false;
40750 base = high_p ? nelt / 2 : 0;
40751 for (i = 0; i < nelt / 2; ++i)
40753 d.perm[i * 2] = i + base;
40754 d.perm[i * 2 + 1] = i + base + nelt;
40757 /* Note that for AVX this isn't one instruction. */
40758 ok = ix86_expand_vec_perm_const_1 (&d);
40759 gcc_assert (ok);
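/* Illustrative standalone sketch, guarded out and not part of GCC: the
   interleave selector built above.  For nelt == 4 the low interleave of
   { A0 A1 A2 A3 } and { B0 B1 B2 B3 } is { A0 B0 A1 B1 } (indices
   0 4 1 5); the high interleave uses indices 2 6 3 7.  The helper name
   is hypothetical.  */
#if 0
#include <stdio.h>

static void
build_interleave_selector (unsigned char *perm, unsigned nelt, int high_p)
{
  unsigned i, base = high_p ? nelt / 2 : 0;
  for (i = 0; i < nelt / 2; ++i)
    {
      perm[i * 2] = i + base;            /* element from the first vector  */
      perm[i * 2 + 1] = i + base + nelt; /* element from the second vector */
    }
}

int
main (void)
{
  unsigned char lo[4], hi[4];
  build_interleave_selector (lo, 4, 0);   /* 0 4 1 5 */
  build_interleave_selector (hi, 4, 1);   /* 2 6 3 7 */
  printf ("%u %u %u %u / %u %u %u %u\n",
          lo[0], lo[1], lo[2], lo[3], hi[0], hi[1], hi[2], hi[3]);
  return 0;
}
#endif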
40763 /* Expand a vector operation CODE for a V*QImode in terms of the
40764 same operation on V*HImode. */
40766 void
40767 ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
40769 enum machine_mode qimode = GET_MODE (dest);
40770 enum machine_mode himode;
40771 rtx (*gen_il) (rtx, rtx, rtx);
40772 rtx (*gen_ih) (rtx, rtx, rtx);
40773 rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
40774 struct expand_vec_perm_d d;
40775 bool ok, full_interleave;
40776 bool uns_p = false;
40777 int i;
40779 switch (qimode)
40781 case V16QImode:
40782 himode = V8HImode;
40783 gen_il = gen_vec_interleave_lowv16qi;
40784 gen_ih = gen_vec_interleave_highv16qi;
40785 break;
40786 case V32QImode:
40787 himode = V16HImode;
40788 gen_il = gen_avx2_interleave_lowv32qi;
40789 gen_ih = gen_avx2_interleave_highv32qi;
40790 break;
40791 default:
40792 gcc_unreachable ();
40795 op2_l = op2_h = op2;
40796 switch (code)
40798 case MULT:
40799 /* Unpack data such that we've got a source byte in each low byte of
40800 each word. We don't care what goes into the high byte of each word.
40801 Rather than trying to get zero in there, most convenient is to let
40802 it be a copy of the low byte. */
40803 op2_l = gen_reg_rtx (qimode);
40804 op2_h = gen_reg_rtx (qimode);
40805 emit_insn (gen_il (op2_l, op2, op2));
40806 emit_insn (gen_ih (op2_h, op2, op2));
40807 /* FALLTHRU */
40809 op1_l = gen_reg_rtx (qimode);
40810 op1_h = gen_reg_rtx (qimode);
40811 emit_insn (gen_il (op1_l, op1, op1));
40812 emit_insn (gen_ih (op1_h, op1, op1));
40813 full_interleave = qimode == V16QImode;
40814 break;
40816 case ASHIFT:
40817 case LSHIFTRT:
40818 uns_p = true;
40819 /* FALLTHRU */
40820 case ASHIFTRT:
40821 op1_l = gen_reg_rtx (himode);
40822 op1_h = gen_reg_rtx (himode);
40823 ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
40824 ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
40825 full_interleave = true;
40826 break;
40827 default:
40828 gcc_unreachable ();
40831 /* Perform the operation. */
40832 res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
40833 1, OPTAB_DIRECT);
40834 res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
40835 1, OPTAB_DIRECT);
40836 gcc_assert (res_l && res_h);
40838 /* Merge the data back into the right place. */
40839 d.target = dest;
40840 d.op0 = gen_lowpart (qimode, res_l);
40841 d.op1 = gen_lowpart (qimode, res_h);
40842 d.vmode = qimode;
40843 d.nelt = GET_MODE_NUNITS (qimode);
40844 d.one_operand_p = false;
40845 d.testing_p = false;
40847 if (full_interleave)
40849 /* For SSE2, we used a full interleave, so the desired
40850 results are in the even elements. */
40851 for (i = 0; i < 32; ++i)
40852 d.perm[i] = i * 2;
40854 else
40856 /* For AVX, the interleave used above was not cross-lane. So the
40857 extraction takes the even elements, but with the second and third quarters swapped.
40858 Happily, that is even one insn shorter than even extraction. */
40859 for (i = 0; i < 32; ++i)
40860 d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
40863 ok = ix86_expand_vec_perm_const_1 (&d);
40864 gcc_assert (ok);
40866 set_unique_reg_note (get_last_insn (), REG_EQUAL,
40867 gen_rtx_fmt_ee (code, qimode, op1, op2));
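/* Illustrative standalone sketch, guarded out and not part of GCC: a
   scalar model of the V*QImode-via-V*HImode strategy above for multiply.
   Each byte is widened into a 16-bit lane, the multiply is done there,
   and only the low byte of each lane is kept, which equals the truncated
   byte product.  The helper name is hypothetical.  */
#if 0
#include <stdint.h>
#include <assert.h>

static void
mul_bytes_via_words (const uint8_t *a, const uint8_t *b, uint8_t *r, int n)
{
  int i;
  for (i = 0; i < n; ++i)
    {
      uint16_t wa = a[i], wb = b[i];   /* "unpack" bytes into word lanes */
      r[i] = (uint8_t) (wa * wb);      /* keep the low byte of each lane */
    }
}

int
main (void)
{
  uint8_t a[4] = { 200, 3, 17, 255 }, b[4] = { 2, 100, 10, 255 }, r[4];
  mul_bytes_via_words (a, b, r, 4);
  assert (r[0] == (uint8_t) (200 * 2) && r[3] == (uint8_t) (255 * 255));
  return 0;
}
#endif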
40870 void
40871 ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
40872 bool uns_p, bool odd_p)
40874 enum machine_mode mode = GET_MODE (op1);
40875 enum machine_mode wmode = GET_MODE (dest);
40876 rtx x;
40878 /* We only play even/odd games with vectors of SImode. */
40879 gcc_assert (mode == V4SImode || mode == V8SImode);
40881 /* If we're looking for the odd results, shift those members down to
40882 the even slots. For some cpus this is faster than a PSHUFD. */
40883 if (odd_p)
40885 /* For XOP use vpmacsdqh, but only for smult, as it is only
40886 signed. */
40887 if (TARGET_XOP && mode == V4SImode && !uns_p)
40889 x = force_reg (wmode, CONST0_RTX (wmode));
40890 emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
40891 return;
40894 x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
40895 op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
40896 x, NULL, 1, OPTAB_DIRECT);
40897 op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
40898 x, NULL, 1, OPTAB_DIRECT);
40899 op1 = gen_lowpart (mode, op1);
40900 op2 = gen_lowpart (mode, op2);
40903 if (mode == V8SImode)
40905 if (uns_p)
40906 x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
40907 else
40908 x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
40910 else if (uns_p)
40911 x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
40912 else if (TARGET_SSE4_1)
40913 x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
40914 else
40916 rtx s1, s2, t0, t1, t2;
40918 /* The easiest way to implement this without PMULDQ is to go through
40919 the motions as if we are performing a full 64-bit multiply. With
40920 the exception that we need to do less shuffling of the elements. */
40922 /* Compute the sign-extension, aka highparts, of the two operands. */
40923 s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
40924 op1, pc_rtx, pc_rtx);
40925 s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
40926 op2, pc_rtx, pc_rtx);
40928 /* Multiply LO(A) * HI(B), and vice-versa. */
40929 t1 = gen_reg_rtx (wmode);
40930 t2 = gen_reg_rtx (wmode);
40931 emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
40932 emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
40934 /* Multiply LO(A) * LO(B). */
40935 t0 = gen_reg_rtx (wmode);
40936 emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
40938 /* Combine and shift the highparts into place. */
40939 t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
40940 t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
40941 1, OPTAB_DIRECT);
40943 /* Combine high and low parts. */
40944 force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
40945 return;
40947 emit_insn (x);
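/* Illustrative standalone sketch, guarded out and not part of GCC: the
   arithmetic behind the PMULDQ-less fallback above.  A signed 32x32->64
   product is recovered from the unsigned product by subtracting b << 32
   when a is negative and a << 32 when b is negative; multiplying by the
   all-ones sign masks s1/s2 and shifting the combined highparts left by
   32 achieves exactly that modulo 2^64.  The helper name is hypothetical.  */
#if 0
#include <stdint.h>
#include <assert.h>

static uint64_t
smul_widen (int32_t a, int32_t b)
{
  uint32_t ua = (uint32_t) a, ub = (uint32_t) b;
  uint32_t s1 = a < 0 ? 0xffffffffu : 0;   /* sign mask of a */
  uint32_t s2 = b < 0 ? 0xffffffffu : 0;   /* sign mask of b */
  uint64_t t0 = (uint64_t) ua * ub;        /* LO(a) * LO(b)  */
  uint64_t t1 = (uint64_t) s1 * ub;        /* HI(a) * LO(b)  */
  uint64_t t2 = (uint64_t) s2 * ua;        /* LO(a) * HI(b)  */
  return t0 + ((t1 + t2) << 32);           /* combine parts  */
}

int
main (void)
{
  assert ((int64_t) smul_widen (-7, 3) == -21LL);
  assert ((int64_t) smul_widen (-123456789, -987654321)
          == (int64_t) -123456789 * -987654321);
  return 0;
}
#endif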
40950 void
40951 ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
40952 bool uns_p, bool high_p)
40954 enum machine_mode wmode = GET_MODE (dest);
40955 enum machine_mode mode = GET_MODE (op1);
40956 rtx t1, t2, t3, t4, mask;
40958 switch (mode)
40960 case V4SImode:
40961 t1 = gen_reg_rtx (mode);
40962 t2 = gen_reg_rtx (mode);
40963 if (TARGET_XOP && !uns_p)
40965 /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
40966 shuffle the elements once so that all elements are in the right
40967 place for immediate use: { A C B D }. */
40968 emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
40969 const1_rtx, GEN_INT (3)));
40970 emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
40971 const1_rtx, GEN_INT (3)));
40973 else
40975 /* Put the elements into place for the multiply. */
40976 ix86_expand_vec_interleave (t1, op1, op1, high_p);
40977 ix86_expand_vec_interleave (t2, op2, op2, high_p);
40978 high_p = false;
40980 ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
40981 break;
40983 case V8SImode:
40984 /* Shuffle the elements between the lanes. After this we
40985 have { A B E F | C D G H } for each operand. */
40986 t1 = gen_reg_rtx (V4DImode);
40987 t2 = gen_reg_rtx (V4DImode);
40988 emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
40989 const0_rtx, const2_rtx,
40990 const1_rtx, GEN_INT (3)));
40991 emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
40992 const0_rtx, const2_rtx,
40993 const1_rtx, GEN_INT (3)));
40995 /* Shuffle the elements within the lanes. After this we
40996 have { A A B B | C C D D } or { E E F F | G G H H }. */
40997 t3 = gen_reg_rtx (V8SImode);
40998 t4 = gen_reg_rtx (V8SImode);
40999 mask = GEN_INT (high_p
41000 ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
41001 : 0 + (0 << 2) + (1 << 4) + (1 << 6));
41002 emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
41003 emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
41005 ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
41006 break;
41008 case V8HImode:
41009 case V16HImode:
41010 t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
41011 uns_p, OPTAB_DIRECT);
41012 t2 = expand_binop (mode,
41013 uns_p ? umul_highpart_optab : smul_highpart_optab,
41014 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
41015 gcc_assert (t1 && t2);
41017 ix86_expand_vec_interleave (gen_lowpart (mode, dest), t1, t2, high_p);
41018 break;
41020 case V16QImode:
41021 case V32QImode:
41022 t1 = gen_reg_rtx (wmode);
41023 t2 = gen_reg_rtx (wmode);
41024 ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
41025 ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
41027 emit_insn (gen_rtx_SET (VOIDmode, dest, gen_rtx_MULT (wmode, t1, t2)));
41028 break;
41030 default:
41031 gcc_unreachable ();
41035 void
41036 ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
41038 rtx res_1, res_2;
41040 res_1 = gen_reg_rtx (V4SImode);
41041 res_2 = gen_reg_rtx (V4SImode);
41042 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_1),
41043 op1, op2, true, false);
41044 ix86_expand_mul_widen_evenodd (gen_lowpart (V2DImode, res_2),
41045 op1, op2, true, true);
41047 /* Move the results in element 2 down to element 1; we don't care
41048 what goes in elements 2 and 3. Then we can merge the parts
41049 back together with an interleave.
41051 Note that two other sequences were tried:
41052 (1) Use interleaves at the start instead of psrldq, which allows
41053 us to use a single shufps to merge things back at the end.
41054 (2) Use shufps here to combine the two vectors, then pshufd to
41055 put the elements in the correct order.
41056 In both cases the cost of the reformatting stall was too high
41057 and the overall sequence slower. */
41059 emit_insn (gen_sse2_pshufd_1 (res_1, res_1, const0_rtx, const2_rtx,
41060 const0_rtx, const0_rtx));
41061 emit_insn (gen_sse2_pshufd_1 (res_2, res_2, const0_rtx, const2_rtx,
41062 const0_rtx, const0_rtx));
41063 res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
41065 set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
41068 void
41069 ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
41071 enum machine_mode mode = GET_MODE (op0);
41072 rtx t1, t2, t3, t4, t5, t6;
41074 if (TARGET_XOP && mode == V2DImode)
41076 /* op1: A,B,C,D, op2: E,F,G,H */
41077 op1 = gen_lowpart (V4SImode, op1);
41078 op2 = gen_lowpart (V4SImode, op2);
41080 t1 = gen_reg_rtx (V4SImode);
41081 t2 = gen_reg_rtx (V4SImode);
41082 t3 = gen_reg_rtx (V2DImode);
41083 t4 = gen_reg_rtx (V2DImode);
41085 /* t1: B,A,D,C */
41086 emit_insn (gen_sse2_pshufd_1 (t1, op1,
41087 GEN_INT (1),
41088 GEN_INT (0),
41089 GEN_INT (3),
41090 GEN_INT (2)));
41092 /* t2: (B*E),(A*F),(D*G),(C*H) */
41093 emit_insn (gen_mulv4si3 (t2, t1, op2));
41095 /* t3: (B*E)+(A*F), (D*G)+(C*H) */
41096 emit_insn (gen_xop_phadddq (t3, t2));
41098 /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
41099 emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
41101 /* op0: (((B*E)+(A*F))<<32)+(B*F), (((D*G)+(C*H))<<32)+(D*H) */
41102 emit_insn (gen_xop_pmacsdql (op0, op1, op2, t4));
41104 else
41106 enum machine_mode nmode;
41107 rtx (*umul) (rtx, rtx, rtx);
41109 if (mode == V2DImode)
41111 umul = gen_vec_widen_umult_even_v4si;
41112 nmode = V4SImode;
41114 else if (mode == V4DImode)
41116 umul = gen_vec_widen_umult_even_v8si;
41117 nmode = V8SImode;
41119 else
41120 gcc_unreachable ();
41123 /* Multiply low parts. */
41124 t1 = gen_reg_rtx (mode);
41125 emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
41127 /* Shift input vectors right 32 bits so we can multiply high parts. */
41128 t6 = GEN_INT (32);
41129 t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
41130 t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
41132 /* Multiply high parts by low parts. */
41133 t4 = gen_reg_rtx (mode);
41134 t5 = gen_reg_rtx (mode);
41135 emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
41136 emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
41138 /* Combine and shift the highparts back. */
41139 t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
41140 t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
41142 /* Combine high and low parts. */
41143 force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
41146 set_unique_reg_note (get_last_insn (), REG_EQUAL,
41147 gen_rtx_MULT (mode, op1, op2));
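/* Illustrative standalone sketch, guarded out and not part of GCC: the
   arithmetic behind the no-64-bit-multiply path above.  Only the low 64
   bits of the product are needed, so
     a * b == lo(a)*lo(b) + ((hi(a)*lo(b) + hi(b)*lo(a)) << 32)  (mod 2^64);
   the hi(a)*hi(b) term shifts out entirely.  The helper name is
   hypothetical.  */
#if 0
#include <stdint.h>
#include <assert.h>

static uint64_t
mul64_from_32 (uint64_t a, uint64_t b)
{
  uint32_t alo = (uint32_t) a, ahi = (uint32_t) (a >> 32);
  uint32_t blo = (uint32_t) b, bhi = (uint32_t) (b >> 32);
  uint64_t t1 = (uint64_t) alo * blo;      /* low  * low  */
  uint64_t t4 = (uint64_t) ahi * blo;      /* high * low  */
  uint64_t t5 = (uint64_t) bhi * alo;      /* low  * high */
  return t1 + ((t4 + t5) << 32);
}

int
main (void)
{
  assert (mul64_from_32 (0x123456789abcdef0ULL, 0x0fedcba987654321ULL)
          == 0x123456789abcdef0ULL * 0x0fedcba987654321ULL);
  return 0;
}
#endif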
41150 /* Expand an insert into a vector register through pinsr insn.
41151 Return true if successful. */
41153 bool
41154 ix86_expand_pinsr (rtx *operands)
41156 rtx dst = operands[0];
41157 rtx src = operands[3];
41159 unsigned int size = INTVAL (operands[1]);
41160 unsigned int pos = INTVAL (operands[2]);
41162 if (GET_CODE (dst) == SUBREG)
41164 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
41165 dst = SUBREG_REG (dst);
41168 if (GET_CODE (src) == SUBREG)
41169 src = SUBREG_REG (src);
41171 switch (GET_MODE (dst))
41173 case V16QImode:
41174 case V8HImode:
41175 case V4SImode:
41176 case V2DImode:
41178 enum machine_mode srcmode, dstmode;
41179 rtx (*pinsr)(rtx, rtx, rtx, rtx);
41181 srcmode = mode_for_size (size, MODE_INT, 0);
41183 switch (srcmode)
41185 case QImode:
41186 if (!TARGET_SSE4_1)
41187 return false;
41188 dstmode = V16QImode;
41189 pinsr = gen_sse4_1_pinsrb;
41190 break;
41192 case HImode:
41193 if (!TARGET_SSE2)
41194 return false;
41195 dstmode = V8HImode;
41196 pinsr = gen_sse2_pinsrw;
41197 break;
41199 case SImode:
41200 if (!TARGET_SSE4_1)
41201 return false;
41202 dstmode = V4SImode;
41203 pinsr = gen_sse4_1_pinsrd;
41204 break;
41206 case DImode:
41207 gcc_assert (TARGET_64BIT);
41208 if (!TARGET_SSE4_1)
41209 return false;
41210 dstmode = V2DImode;
41211 pinsr = gen_sse4_1_pinsrq;
41212 break;
41214 default:
41215 return false;
41218 dst = gen_lowpart (dstmode, dst);
41219 src = gen_lowpart (srcmode, src);
41221 pos /= size;
41223 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
41224 return true;
41227 default:
41228 return false;
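/* Illustrative standalone sketch, guarded out and not part of GCC: what a
   16-bit insert expanded by ix86_expand_pinsr amounts to at the user
   level.  The bit position is converted to an element index (pos /= size)
   and a single pinsrw places the scalar there; the standard SSE2
   intrinsics below are used only to demonstrate the effect.  */
#if 0
#include <emmintrin.h>
#include <assert.h>

int
main (void)
{
  /* Insert the 16-bit value 0xbeef at bit offset 32, i.e. element 32/16 == 2. */
  __m128i v = _mm_set1_epi16 (0);
  v = _mm_insert_epi16 (v, 0xbeef, 2);
  assert (_mm_extract_epi16 (v, 2) == 0xbeef);
  return 0;
}
#endif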
41232 /* This function returns the calling abi specific va_list type node.
41233 It returns the FNDECL specific va_list type. */
41235 static tree
41236 ix86_fn_abi_va_list (tree fndecl)
41238 if (!TARGET_64BIT)
41239 return va_list_type_node;
41240 gcc_assert (fndecl != NULL_TREE);
41242 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
41243 return ms_va_list_type_node;
41244 else
41245 return sysv_va_list_type_node;
41248 /* Returns the canonical va_list type specified by TYPE. If there
41249 is no valid TYPE provided, it returns NULL_TREE. */
41251 static tree
41252 ix86_canonical_va_list_type (tree type)
41254 tree wtype, htype;
41256 /* Resolve references and pointers to va_list type. */
41257 if (TREE_CODE (type) == MEM_REF)
41258 type = TREE_TYPE (type);
41259 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
41260 type = TREE_TYPE (type);
41261 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
41262 type = TREE_TYPE (type);
41264 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
41266 wtype = va_list_type_node;
41267 gcc_assert (wtype != NULL_TREE);
41268 htype = type;
41269 if (TREE_CODE (wtype) == ARRAY_TYPE)
41271 /* If va_list is an array type, the argument may have decayed
41272 to a pointer type, e.g. by being passed to another function.
41273 In that case, unwrap both types so that we can compare the
41274 underlying records. */
41275 if (TREE_CODE (htype) == ARRAY_TYPE
41276 || POINTER_TYPE_P (htype))
41278 wtype = TREE_TYPE (wtype);
41279 htype = TREE_TYPE (htype);
41282 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41283 return va_list_type_node;
41284 wtype = sysv_va_list_type_node;
41285 gcc_assert (wtype != NULL_TREE);
41286 htype = type;
41287 if (TREE_CODE (wtype) == ARRAY_TYPE)
41289 /* If va_list is an array type, the argument may have decayed
41290 to a pointer type, e.g. by being passed to another function.
41291 In that case, unwrap both types so that we can compare the
41292 underlying records. */
41293 if (TREE_CODE (htype) == ARRAY_TYPE
41294 || POINTER_TYPE_P (htype))
41296 wtype = TREE_TYPE (wtype);
41297 htype = TREE_TYPE (htype);
41300 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41301 return sysv_va_list_type_node;
41302 wtype = ms_va_list_type_node;
41303 gcc_assert (wtype != NULL_TREE);
41304 htype = type;
41305 if (TREE_CODE (wtype) == ARRAY_TYPE)
41307 /* If va_list is an array type, the argument may have decayed
41308 to a pointer type, e.g. by being passed to another function.
41309 In that case, unwrap both types so that we can compare the
41310 underlying records. */
41311 if (TREE_CODE (htype) == ARRAY_TYPE
41312 || POINTER_TYPE_P (htype))
41314 wtype = TREE_TYPE (wtype);
41315 htype = TREE_TYPE (htype);
41318 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
41319 return ms_va_list_type_node;
41320 return NULL_TREE;
41322 return std_canonical_va_list_type (type);
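/* Illustrative standalone sketch, guarded out and not part of GCC: the
   array-to-pointer decay handled above.  On x86-64 the SysV va_list is an
   array of one structure, so when it is passed to a helper the parameter
   silently becomes a pointer type; the hook therefore unwraps both the
   array and the pointer form before comparing main variants.  */
#if 0
#include <stdarg.h>
#include <stdio.h>

static void
vhelper (const char *fmt, va_list ap)    /* va_list has decayed to a pointer */
{
  vprintf (fmt, ap);
}

static void
report (const char *fmt, ...)
{
  va_list ap;                            /* array type on x86-64 SysV */
  va_start (ap, fmt);
  vhelper (fmt, ap);
  va_end (ap);
}

int
main (void)
{
  report ("%d %s\n", 42, "ok");
  return 0;
}
#endif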
41325 /* Iterate through the target-specific builtin types for va_list.
41326 IDX denotes the iterator, *PTREE is set to the result type of
41327 the va_list builtin, and *PNAME to its internal type.
41328 Returns zero if there is no element for this index, otherwise
41329 IDX should be increased upon the next call.
41330 Note, do not iterate a base builtin's name like __builtin_va_list.
41331 Used from c_common_nodes_and_builtins. */
41333 static int
41334 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
41336 if (TARGET_64BIT)
41338 switch (idx)
41340 default:
41341 break;
41343 case 0:
41344 *ptree = ms_va_list_type_node;
41345 *pname = "__builtin_ms_va_list";
41346 return 1;
41348 case 1:
41349 *ptree = sysv_va_list_type_node;
41350 *pname = "__builtin_sysv_va_list";
41351 return 1;
41355 return 0;
41358 #undef TARGET_SCHED_DISPATCH
41359 #define TARGET_SCHED_DISPATCH has_dispatch
41360 #undef TARGET_SCHED_DISPATCH_DO
41361 #define TARGET_SCHED_DISPATCH_DO do_dispatch
41362 #undef TARGET_SCHED_REASSOCIATION_WIDTH
41363 #define TARGET_SCHED_REASSOCIATION_WIDTH ix86_reassociation_width
41364 #undef TARGET_SCHED_REORDER
41365 #define TARGET_SCHED_REORDER ix86_sched_reorder
41366 #undef TARGET_SCHED_ADJUST_PRIORITY
41367 #define TARGET_SCHED_ADJUST_PRIORITY ix86_adjust_priority
41368 #undef TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK
41369 #define TARGET_SCHED_DEPENDENCIES_EVALUATION_HOOK \
41370 ix86_dependencies_evaluation_hook
41372 /* The size of the dispatch window is the total number of bytes of
41373 object code allowed in a window. */
41374 #define DISPATCH_WINDOW_SIZE 16
41376 /* Number of dispatch windows considered for scheduling. */
41377 #define MAX_DISPATCH_WINDOWS 3
41379 /* Maximum number of instructions in a window. */
41380 #define MAX_INSN 4
41382 /* Maximum number of immediate operands in a window. */
41383 #define MAX_IMM 4
41385 /* Maximum number of immediate bits allowed in a window. */
41386 #define MAX_IMM_SIZE 128
41388 /* Maximum number of 32 bit immediates allowed in a window. */
41389 #define MAX_IMM_32 4
41391 /* Maximum number of 64 bit immediates allowed in a window. */
41392 #define MAX_IMM_64 2
41394 /* Maximum total of loads or prefetches allowed in a window. */
41395 #define MAX_LOAD 2
41397 /* Maximum total of stores allowed in a window. */
41398 #define MAX_STORE 1
41400 #undef BIG
41401 #define BIG 100
41404 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
41405 enum dispatch_group {
41406 disp_no_group = 0,
41407 disp_load,
41408 disp_store,
41409 disp_load_store,
41410 disp_prefetch,
41411 disp_imm,
41412 disp_imm_32,
41413 disp_imm_64,
41414 disp_branch,
41415 disp_cmp,
41416 disp_jcc,
41417 disp_last
41420 /* Number of allowable groups in a dispatch window. It is an array
41421 indexed by dispatch_group enum. 100 is used as a big number,
41422 because the number of these kinds of operations does not have any
41423 effect on the dispatch window, but we need them for other reasons in
41424 the table. */
41425 static unsigned int num_allowable_groups[disp_last] = {
41426 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
41429 char group_name[disp_last + 1][16] = {
41430 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
41431 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
41432 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
41435 /* Instruction path. */
41436 enum insn_path {
41437 no_path = 0,
41438 path_single, /* Single micro op. */
41439 path_double, /* Double micro op. */
41440 path_multi, /* Instructions with more than 2 micro ops. */
41441 last_path
41444 /* sched_insn_info defines a window to the instructions scheduled in
41445 the basic block. It contains a pointer to the insn_info table and
41446 the instruction scheduled.
41448 Windows are allocated for each basic block and are linked
41449 together. */
41450 typedef struct sched_insn_info_s {
41451 rtx insn;
41452 enum dispatch_group group;
41453 enum insn_path path;
41454 int byte_len;
41455 int imm_bytes;
41456 } sched_insn_info;
41458 /* Linked list of dispatch windows. This is a two way list of
41459 dispatch windows of a basic block. It contains information about
41460 the number of uops in the window and the total number of
41461 instructions and of bytes in the object code for this dispatch
41462 window. */
41463 typedef struct dispatch_windows_s {
41464 int num_insn; /* Number of insn in the window. */
41465 int num_uops; /* Number of uops in the window. */
41466 int window_size; /* Number of bytes in the window. */
41467 int window_num; /* Window number, either 0 or 1. */
41468 int num_imm; /* Number of immediates in an insn. */
41469 int num_imm_32; /* Number of 32 bit immediates in an insn. */
41470 int num_imm_64; /* Number of 64 bit immediates in an insn. */
41471 int imm_size; /* Total immediates in the window. */
41472 int num_loads; /* Total memory loads in the window. */
41473 int num_stores; /* Total memory stores in the window. */
41474 int violation; /* Violation exists in window. */
41475 sched_insn_info *window; /* Pointer to the window. */
41476 struct dispatch_windows_s *next;
41477 struct dispatch_windows_s *prev;
41478 } dispatch_windows;
41480 /* Immediate values used in an insn. */
41481 typedef struct imm_info_s
41483 int imm;
41484 int imm32;
41485 int imm64;
41486 } imm_info;
41488 static dispatch_windows *dispatch_window_list;
41489 static dispatch_windows *dispatch_window_list1;
41491 /* Get dispatch group of insn. */
41493 static enum dispatch_group
41494 get_mem_group (rtx insn)
41496 enum attr_memory memory;
41498 if (INSN_CODE (insn) < 0)
41499 return disp_no_group;
41500 memory = get_attr_memory (insn);
41501 if (memory == MEMORY_STORE)
41502 return disp_store;
41504 if (memory == MEMORY_LOAD)
41505 return disp_load;
41507 if (memory == MEMORY_BOTH)
41508 return disp_load_store;
41510 return disp_no_group;
41513 /* Return true if insn is a compare instruction. */
41515 static bool
41516 is_cmp (rtx insn)
41518 enum attr_type type;
41520 type = get_attr_type (insn);
41521 return (type == TYPE_TEST
41522 || type == TYPE_ICMP
41523 || type == TYPE_FCMP
41524 || GET_CODE (PATTERN (insn)) == COMPARE);
41527 /* Return true if a dispatch violation was encountered. */
41529 static bool
41530 dispatch_violation (void)
41532 if (dispatch_window_list->next)
41533 return dispatch_window_list->next->violation;
41534 return dispatch_window_list->violation;
41537 /* Return true if insn is a branch instruction. */
41539 static bool
41540 is_branch (rtx insn)
41542 return (CALL_P (insn) || JUMP_P (insn));
41545 /* Return true if insn is a prefetch instruction. */
41547 static bool
41548 is_prefetch (rtx insn)
41550 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
41553 /* This function initializes a dispatch window and the list container holding a
41554 pointer to the window. */
41556 static void
41557 init_window (int window_num)
41559 int i;
41560 dispatch_windows *new_list;
41562 if (window_num == 0)
41563 new_list = dispatch_window_list;
41564 else
41565 new_list = dispatch_window_list1;
41567 new_list->num_insn = 0;
41568 new_list->num_uops = 0;
41569 new_list->window_size = 0;
41570 new_list->next = NULL;
41571 new_list->prev = NULL;
41572 new_list->window_num = window_num;
41573 new_list->num_imm = 0;
41574 new_list->num_imm_32 = 0;
41575 new_list->num_imm_64 = 0;
41576 new_list->imm_size = 0;
41577 new_list->num_loads = 0;
41578 new_list->num_stores = 0;
41579 new_list->violation = false;
41581 for (i = 0; i < MAX_INSN; i++)
41583 new_list->window[i].insn = NULL;
41584 new_list->window[i].group = disp_no_group;
41585 new_list->window[i].path = no_path;
41586 new_list->window[i].byte_len = 0;
41587 new_list->window[i].imm_bytes = 0;
41589 return;
41592 /* This function allocates and initializes a dispatch window and the
41593 list container holding a pointer to the window. */
41595 static dispatch_windows *
41596 allocate_window (void)
41598 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
41599 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
41601 return new_list;
41604 /* This routine initializes the dispatch scheduling information. It
41605 initiates building dispatch scheduler tables and constructs the
41606 first dispatch window. */
41608 static void
41609 init_dispatch_sched (void)
41611 /* Allocate a dispatch list and a window. */
41612 dispatch_window_list = allocate_window ();
41613 dispatch_window_list1 = allocate_window ();
41614 init_window (0);
41615 init_window (1);
41618 /* This function returns true if a branch is detected. End of a basic block
41619 does not have to be a branch, but here we assume only branches end a
41620 window. */
41622 static bool
41623 is_end_basic_block (enum dispatch_group group)
41625 return group == disp_branch;
41628 /* This function is called when the end of a window processing is reached. */
41630 static void
41631 process_end_window (void)
41633 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
41634 if (dispatch_window_list->next)
41636 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
41637 gcc_assert (dispatch_window_list->window_size
41638 + dispatch_window_list1->window_size <= 48);
41639 init_window (1);
41641 init_window (0);
41644 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
41645 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
41646 for 48 bytes of instructions. Note that these windows are not dispatch
41647 windows of size DISPATCH_WINDOW_SIZE. */
41649 static dispatch_windows *
41650 allocate_next_window (int window_num)
41652 if (window_num == 0)
41654 if (dispatch_window_list->next)
41655 init_window (1);
41656 init_window (0);
41657 return dispatch_window_list;
41660 dispatch_window_list->next = dispatch_window_list1;
41661 dispatch_window_list1->prev = dispatch_window_list;
41663 return dispatch_window_list1;
41666 /* Increment the number of immediate operands of an instruction. */
41668 static int
41669 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
41671 if (*in_rtx == 0)
41672 return 0;
41674 switch (GET_CODE (*in_rtx))
41676 case CONST:
41677 case SYMBOL_REF:
41678 case CONST_INT:
41679 (imm_values->imm)++;
41680 if (x86_64_immediate_operand (*in_rtx, SImode))
41681 (imm_values->imm32)++;
41682 else
41683 (imm_values->imm64)++;
41684 break;
41686 case CONST_DOUBLE:
41687 (imm_values->imm)++;
41688 (imm_values->imm64)++;
41689 break;
41691 case CODE_LABEL:
41692 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
41694 (imm_values->imm)++;
41695 (imm_values->imm32)++;
41697 break;
41699 default:
41700 break;
41703 return 0;
41706 /* Compute number of immediate operands of an instruction. */
41708 static void
41709 find_constant (rtx in_rtx, imm_info *imm_values)
41711 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
41712 (rtx_function) find_constant_1, (void *) imm_values);
41715 /* Return total size of immediate operands of an instruction along with number
41716 of corresponding immediate-operands. It initializes its parameters to zero
41717 before calling FIND_CONSTANT.
41718 INSN is the input instruction. IMM is the total of immediates.
41719 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
41720 bit immediates. */
41722 static int
41723 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
41725 imm_info imm_values = {0, 0, 0};
41727 find_constant (insn, &imm_values);
41728 *imm = imm_values.imm;
41729 *imm32 = imm_values.imm32;
41730 *imm64 = imm_values.imm64;
41731 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
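/* Illustrative standalone check, guarded out and not part of GCC: the
   size rule used above.  Each 32-bit immediate contributes 4 bytes and
   each 64-bit immediate 8 bytes toward the window budget (MAX_IMM_SIZE).
   The helper name is hypothetical.  */
#if 0
#include <assert.h>

static int
imm_bytes (int imm32, int imm64)
{
  return imm32 * 4 + imm64 * 8;
}

int
main (void)
{
  assert (imm_bytes (2, 1) == 16);   /* two 32-bit + one 64-bit immediate */
  return 0;
}
#endif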
41734 /* This function indicates if an operand of an instruction is an
41735 immediate. */
41737 static bool
41738 has_immediate (rtx insn)
41740 int num_imm_operand;
41741 int num_imm32_operand;
41742 int num_imm64_operand;
41744 if (insn)
41745 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41746 &num_imm64_operand);
41747 return false;
41750 /* Return single or double path for instructions. */
41752 static enum insn_path
41753 get_insn_path (rtx insn)
41755 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
41757 if ((int)path == 0)
41758 return path_single;
41760 if ((int)path == 1)
41761 return path_double;
41763 return path_multi;
41766 /* Return insn dispatch group. */
41768 static enum dispatch_group
41769 get_insn_group (rtx insn)
41771 enum dispatch_group group = get_mem_group (insn);
41772 if (group)
41773 return group;
41775 if (is_branch (insn))
41776 return disp_branch;
41778 if (is_cmp (insn))
41779 return disp_cmp;
41781 if (has_immediate (insn))
41782 return disp_imm;
41784 if (is_prefetch (insn))
41785 return disp_prefetch;
41787 return disp_no_group;
41790 /* Count number of GROUP restricted instructions in a dispatch
41791 window WINDOW_LIST. */
41793 static int
41794 count_num_restricted (rtx insn, dispatch_windows *window_list)
41796 enum dispatch_group group = get_insn_group (insn);
41797 int imm_size;
41798 int num_imm_operand;
41799 int num_imm32_operand;
41800 int num_imm64_operand;
41802 if (group == disp_no_group)
41803 return 0;
41805 if (group == disp_imm)
41807 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41808 &num_imm64_operand);
41809 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
41810 || num_imm_operand + window_list->num_imm > MAX_IMM
41811 || (num_imm32_operand > 0
41812 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
41813 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
41814 || (num_imm64_operand > 0
41815 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
41816 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
41817 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
41818 && num_imm64_operand > 0
41819 && ((window_list->num_imm_64 > 0
41820 && window_list->num_insn >= 2)
41821 || window_list->num_insn >= 3)))
41822 return BIG;
41824 return 1;
41827 if ((group == disp_load_store
41828 && (window_list->num_loads >= MAX_LOAD
41829 || window_list->num_stores >= MAX_STORE))
41830 || ((group == disp_load
41831 || group == disp_prefetch)
41832 && window_list->num_loads >= MAX_LOAD)
41833 || (group == disp_store
41834 && window_list->num_stores >= MAX_STORE))
41835 return BIG;
41837 return 1;
41840 /* This function returns true if insn satisfies dispatch rules on the
41841 last window scheduled. */
41843 static bool
41844 fits_dispatch_window (rtx insn)
41846 dispatch_windows *window_list = dispatch_window_list;
41847 dispatch_windows *window_list_next = dispatch_window_list->next;
41848 unsigned int num_restrict;
41849 enum dispatch_group group = get_insn_group (insn);
41850 enum insn_path path = get_insn_path (insn);
41851 int sum;
41853 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
41854 instructions should be given the lowest priority in the
41855 scheduling process in the Haifa scheduler to make sure they will be
41856 scheduled in the same dispatch window as the reference to them. */
41857 if (group == disp_jcc || group == disp_cmp)
41858 return false;
41860 /* Check nonrestricted. */
41861 if (group == disp_no_group || group == disp_branch)
41862 return true;
41864 /* Get last dispatch window. */
41865 if (window_list_next)
41866 window_list = window_list_next;
41868 if (window_list->window_num == 1)
41870 sum = window_list->prev->window_size + window_list->window_size;
41872 if (sum == 32
41873 || (min_insn_size (insn) + sum) >= 48)
41874 /* Window 1 is full. Go for next window. */
41875 return true;
41878 num_restrict = count_num_restricted (insn, window_list);
41880 if (num_restrict > num_allowable_groups[group])
41881 return false;
41883 /* See if it fits in the first window. */
41884 if (window_list->window_num == 0)
41886 /* The first window should have only single and double path
41887 uops. */
41888 if (path == path_double
41889 && (window_list->num_uops + 2) > MAX_INSN)
41890 return false;
41891 else if (path != path_single)
41892 return false;
41894 return true;
41897 /* Add an instruction INSN with NUM_UOPS micro-operations to the
41898 dispatch window WINDOW_LIST. */
41900 static void
41901 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
41903 int byte_len = min_insn_size (insn);
41904 int num_insn = window_list->num_insn;
41905 int imm_size;
41906 sched_insn_info *window = window_list->window;
41907 enum dispatch_group group = get_insn_group (insn);
41908 enum insn_path path = get_insn_path (insn);
41909 int num_imm_operand;
41910 int num_imm32_operand;
41911 int num_imm64_operand;
41913 if (!window_list->violation && group != disp_cmp
41914 && !fits_dispatch_window (insn))
41915 window_list->violation = true;
41917 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
41918 &num_imm64_operand);
41920 /* Initialize window with new instruction. */
41921 window[num_insn].insn = insn;
41922 window[num_insn].byte_len = byte_len;
41923 window[num_insn].group = group;
41924 window[num_insn].path = path;
41925 window[num_insn].imm_bytes = imm_size;
41927 window_list->window_size += byte_len;
41928 window_list->num_insn = num_insn + 1;
41929 window_list->num_uops = window_list->num_uops + num_uops;
41930 window_list->imm_size += imm_size;
41931 window_list->num_imm += num_imm_operand;
41932 window_list->num_imm_32 += num_imm32_operand;
41933 window_list->num_imm_64 += num_imm64_operand;
41935 if (group == disp_store)
41936 window_list->num_stores += 1;
41937 else if (group == disp_load
41938 || group == disp_prefetch)
41939 window_list->num_loads += 1;
41940 else if (group == disp_load_store)
41942 window_list->num_stores += 1;
41943 window_list->num_loads += 1;
41947 /* Adds a scheduled instruction, INSN, to the current dispatch window.
41948 If the total bytes of instructions or the number of instructions in
41949 the window exceeds the allowable limit, it allocates a new window.
41951 static void
41952 add_to_dispatch_window (rtx insn)
41954 int byte_len;
41955 dispatch_windows *window_list;
41956 dispatch_windows *next_list;
41957 dispatch_windows *window0_list;
41958 enum insn_path path;
41959 enum dispatch_group insn_group;
41960 bool insn_fits;
41961 int num_insn;
41962 int num_uops;
41963 int window_num;
41964 int insn_num_uops;
41965 int sum;
41967 if (INSN_CODE (insn) < 0)
41968 return;
41970 byte_len = min_insn_size (insn);
41971 window_list = dispatch_window_list;
41972 next_list = window_list->next;
41973 path = get_insn_path (insn);
41974 insn_group = get_insn_group (insn);
41976 /* Get the last dispatch window. */
41977 if (next_list)
41978 window_list = dispatch_window_list->next;
41980 if (path == path_single)
41981 insn_num_uops = 1;
41982 else if (path == path_double)
41983 insn_num_uops = 2;
41984 else
41985 insn_num_uops = (int) path;
41987 /* If the current window is full, get a new window.
41988 Window number zero is full if MAX_INSN uops are scheduled in it.
41989 Window number one is full if window zero's bytes plus window
41990 one's bytes reach 32, or if adding the bytes of the new instruction
41991 makes the total greater than 48, or if it already has MAX_INSN
41992 instructions in it. */
41993 num_insn = window_list->num_insn;
41994 num_uops = window_list->num_uops;
41995 window_num = window_list->window_num;
41996 insn_fits = fits_dispatch_window (insn);
41998 if (num_insn >= MAX_INSN
41999 || num_uops + insn_num_uops > MAX_INSN
42000 || !(insn_fits))
42002 window_num = ~window_num & 1;
42003 window_list = allocate_next_window (window_num);
42006 if (window_num == 0)
42008 add_insn_window (insn, window_list, insn_num_uops);
42009 if (window_list->num_insn >= MAX_INSN
42010 && insn_group == disp_branch)
42012 process_end_window ();
42013 return;
42016 else if (window_num == 1)
42018 window0_list = window_list->prev;
42019 sum = window0_list->window_size + window_list->window_size;
42020 if (sum == 32
42021 || (byte_len + sum) >= 48)
42023 process_end_window ();
42024 window_list = dispatch_window_list;
42027 add_insn_window (insn, window_list, insn_num_uops);
42029 else
42030 gcc_unreachable ();
42032 if (is_end_basic_block (insn_group))
42034 /* End of basic block is reached; do end-of-basic-block processing. */
42035 process_end_window ();
42036 return;
42040 /* Print the dispatch window, WINDOW_NUM, to FILE. */
42042 DEBUG_FUNCTION static void
42043 debug_dispatch_window_file (FILE *file, int window_num)
42045 dispatch_windows *list;
42046 int i;
42048 if (window_num == 0)
42049 list = dispatch_window_list;
42050 else
42051 list = dispatch_window_list1;
42053 fprintf (file, "Window #%d:\n", list->window_num);
42054 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
42055 list->num_insn, list->num_uops, list->window_size);
42056 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
42057 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
42059 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
42060 list->num_stores);
42061 fprintf (file, " insn info:\n");
42063 for (i = 0; i < MAX_INSN; i++)
42065 if (!list->window[i].insn)
42066 break;
42067 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
42068 i, group_name[list->window[i].group],
42069 i, (void *)list->window[i].insn,
42070 i, list->window[i].path,
42071 i, list->window[i].byte_len,
42072 i, list->window[i].imm_bytes);
42076 /* Print to stdout a dispatch window. */
42078 DEBUG_FUNCTION void
42079 debug_dispatch_window (int window_num)
42081 debug_dispatch_window_file (stdout, window_num);
42084 /* Print INSN dispatch information to FILE. */
42086 DEBUG_FUNCTION static void
42087 debug_insn_dispatch_info_file (FILE *file, rtx insn)
42089 int byte_len;
42090 enum insn_path path;
42091 enum dispatch_group group;
42092 int imm_size;
42093 int num_imm_operand;
42094 int num_imm32_operand;
42095 int num_imm64_operand;
42097 if (INSN_CODE (insn) < 0)
42098 return;
42100 byte_len = min_insn_size (insn);
42101 path = get_insn_path (insn);
42102 group = get_insn_group (insn);
42103 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
42104 &num_imm64_operand);
42106 fprintf (file, " insn info:\n");
42107 fprintf (file, " group = %s, path = %d, byte_len = %d\n",
42108 group_name[group], path, byte_len);
42109 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
42110 num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
42113 /* Print to stdout the status of the ready list with respect to
42114 dispatch windows. */
42116 DEBUG_FUNCTION void
42117 debug_ready_dispatch (void)
42119 int i;
42120 int no_ready = number_in_ready ();
42122 fprintf (stdout, "Number of ready: %d\n", no_ready);
42124 for (i = 0; i < no_ready; i++)
42125 debug_insn_dispatch_info_file (stdout, get_ready_element (i));
42128 /* This routine is the driver of the dispatch scheduler. */
42130 static void
42131 do_dispatch (rtx insn, int mode)
42133 if (mode == DISPATCH_INIT)
42134 init_dispatch_sched ();
42135 else if (mode == ADD_TO_DISPATCH_WINDOW)
42136 add_to_dispatch_window (insn);
42139 /* Return TRUE if Dispatch Scheduling is supported. */
42141 static bool
42142 has_dispatch (rtx insn, int action)
42144 if ((TARGET_BDVER1 || TARGET_BDVER2 || TARGET_BDVER3)
42145 && flag_dispatch_scheduler)
42146 switch (action)
42148 default:
42149 return false;
42151 case IS_DISPATCH_ON:
42152 return true;
42153 break;
42155 case IS_CMP:
42156 return is_cmp (insn);
42158 case DISPATCH_VIOLATION:
42159 return dispatch_violation ();
42161 case FITS_DISPATCH_WINDOW:
42162 return fits_dispatch_window (insn);
42165 return false;
42168 /* Implementation of reassociation_width target hook used by
42169 reassoc phase to identify parallelism level in reassociated
42170 tree. The statement's tree_code is passed in OPC. The argument's type
42171 is passed in MODE.
42173 Currently parallel reassociation is enabled for Atom
42174 processors only and we set reassociation width to be 2
42175 because Atom may issue up to 2 instructions per cycle.
42177 Return value should be fixed if parallel reassociation is
42178 enabled for other processors. */
42180 static int
42181 ix86_reassociation_width (unsigned int opc ATTRIBUTE_UNUSED,
42182 enum machine_mode mode)
42184 int res = 1;
42186 if (INTEGRAL_MODE_P (mode) && TARGET_REASSOC_INT_TO_PARALLEL)
42187 res = 2;
42188 else if (FLOAT_MODE_P (mode) && TARGET_REASSOC_FP_TO_PARALLEL)
42189 res = 2;
42191 return res;
42194 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
42195 place emms and femms instructions. */
42197 static enum machine_mode
42198 ix86_preferred_simd_mode (enum machine_mode mode)
42200 if (!TARGET_SSE)
42201 return word_mode;
42203 switch (mode)
42205 case QImode:
42206 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
42207 case HImode:
42208 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
42209 case SImode:
42210 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
42211 case DImode:
42212 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
42214 case SFmode:
42215 if (TARGET_AVX && !TARGET_PREFER_AVX128)
42216 return V8SFmode;
42217 else
42218 return V4SFmode;
42220 case DFmode:
42221 if (!TARGET_VECTORIZE_DOUBLE)
42222 return word_mode;
42223 else if (TARGET_AVX && !TARGET_PREFER_AVX128)
42224 return V4DFmode;
42225 else if (TARGET_SSE2)
42226 return V2DFmode;
42227 /* FALLTHRU */
42229 default:
42230 return word_mode;
42234 /* If AVX is enabled then try vectorizing with both 256-bit and 128-bit
42235 vectors. */
42237 static unsigned int
42238 ix86_autovectorize_vector_sizes (void)
42240 return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
42245 /* Return the class of registers which could be used for a pseudo of MODE
42246 and of class RCLASS for spilling instead of memory. Return NO_REGS
42247 if it is not possible or not profitable. */
42248 static reg_class_t
42249 ix86_spill_class (reg_class_t rclass, enum machine_mode mode)
42251 if (TARGET_SSE && TARGET_GENERAL_REGS_SSE_SPILL && ! TARGET_MMX
42252 && (mode == SImode || (TARGET_64BIT && mode == DImode))
42253 && INTEGER_CLASS_P (rclass))
42254 return SSE_REGS;
42255 return NO_REGS;
42258 /* Implement targetm.vectorize.init_cost. */
42260 static void *
42261 ix86_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
42263 unsigned *cost = XNEWVEC (unsigned, 3);
42264 cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
42265 return cost;
42268 /* Implement targetm.vectorize.add_stmt_cost. */
42270 static unsigned
42271 ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
42272 struct _stmt_vec_info *stmt_info, int misalign,
42273 enum vect_cost_model_location where)
42275 unsigned *cost = (unsigned *) data;
42276 unsigned retval = 0;
42278 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
42279 int stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
42281 /* Statements in an inner loop relative to the loop being
42282 vectorized are weighted more heavily. The value here is
42283 arbitrary and could potentially be improved with analysis. */
42284 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
42285 count *= 50; /* FIXME. */
42287 retval = (unsigned) (count * stmt_cost);
42288 cost[where] += retval;
42290 return retval;
42293 /* Implement targetm.vectorize.finish_cost. */
42295 static void
42296 ix86_finish_cost (void *data, unsigned *prologue_cost,
42297 unsigned *body_cost, unsigned *epilogue_cost)
42299 unsigned *cost = (unsigned *) data;
42300 *prologue_cost = cost[vect_prologue];
42301 *body_cost = cost[vect_body];
42302 *epilogue_cost = cost[vect_epilogue];
42305 /* Implement targetm.vectorize.destroy_cost_data. */
42307 static void
42308 ix86_destroy_cost_data (void *data)
42310 free (data);
42313 /* Validate target specific memory model bits in VAL. */
42315 static unsigned HOST_WIDE_INT
42316 ix86_memmodel_check (unsigned HOST_WIDE_INT val)
42318 unsigned HOST_WIDE_INT model = val & MEMMODEL_MASK;
42319 bool strong;
42321 if (val & ~(unsigned HOST_WIDE_INT)(IX86_HLE_ACQUIRE|IX86_HLE_RELEASE
42322 |MEMMODEL_MASK)
42323 || ((val & IX86_HLE_ACQUIRE) && (val & IX86_HLE_RELEASE)))
42325 warning (OPT_Winvalid_memory_model,
42326 "Unknown architecture specific memory model");
42327 return MEMMODEL_SEQ_CST;
42329 strong = (model == MEMMODEL_ACQ_REL || model == MEMMODEL_SEQ_CST);
42330 if (val & IX86_HLE_ACQUIRE && !(model == MEMMODEL_ACQUIRE || strong))
42332 warning (OPT_Winvalid_memory_model,
42333 "HLE_ACQUIRE not used with ACQUIRE or stronger memory model");
42334 return MEMMODEL_SEQ_CST | IX86_HLE_ACQUIRE;
42336 if (val & IX86_HLE_RELEASE && !(model == MEMMODEL_RELEASE || strong))
42338 warning (OPT_Winvalid_memory_model,
42339 "HLE_RELEASE not used with RELEASE or stronger memory model");
42340 return MEMMODEL_SEQ_CST | IX86_HLE_RELEASE;
42342 return val;
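/* Illustrative usage sketch, guarded out and not part of GCC, of the
   pairing this check validates.  It assumes GCC's documented x86 HLE
   extension (-mhle), which provides the __ATOMIC_HLE_ACQUIRE and
   __ATOMIC_HLE_RELEASE bits: HLE_ACQUIRE must accompany an ACQUIRE or
   stronger model and HLE_RELEASE a RELEASE or stronger model, otherwise
   the warnings above fire and the model falls back to SEQ_CST.  */
#if 0
#include <immintrin.h>

static int lockvar;

static void
elided_lock (void)
{
  while (__atomic_exchange_n (&lockvar, 1,
                              __ATOMIC_ACQUIRE | __ATOMIC_HLE_ACQUIRE))
    _mm_pause ();   /* spin; an elided transaction aborts back to here */
}

static void
elided_unlock (void)
{
  __atomic_store_n (&lockvar, 0, __ATOMIC_RELEASE | __ATOMIC_HLE_RELEASE);
}
#endif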
42345 /* Initialize the GCC target structure. */
42346 #undef TARGET_RETURN_IN_MEMORY
42347 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
42349 #undef TARGET_LEGITIMIZE_ADDRESS
42350 #define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address
42352 #undef TARGET_ATTRIBUTE_TABLE
42353 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
42354 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
42355 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P hook_bool_const_tree_true
42356 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42357 # undef TARGET_MERGE_DECL_ATTRIBUTES
42358 # define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
42359 #endif
42361 #undef TARGET_COMP_TYPE_ATTRIBUTES
42362 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
42364 #undef TARGET_INIT_BUILTINS
42365 #define TARGET_INIT_BUILTINS ix86_init_builtins
42366 #undef TARGET_BUILTIN_DECL
42367 #define TARGET_BUILTIN_DECL ix86_builtin_decl
42368 #undef TARGET_EXPAND_BUILTIN
42369 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
42371 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
42372 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
42373 ix86_builtin_vectorized_function
42375 #undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
42376 #define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
42378 #undef TARGET_VECTORIZE_BUILTIN_TM_STORE
42379 #define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
42381 #undef TARGET_VECTORIZE_BUILTIN_GATHER
42382 #define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
42384 #undef TARGET_BUILTIN_RECIPROCAL
42385 #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
42387 #undef TARGET_ASM_FUNCTION_EPILOGUE
42388 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
42390 #undef TARGET_ENCODE_SECTION_INFO
42391 #ifndef SUBTARGET_ENCODE_SECTION_INFO
42392 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
42393 #else
42394 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
42395 #endif
42397 #undef TARGET_ASM_OPEN_PAREN
42398 #define TARGET_ASM_OPEN_PAREN ""
42399 #undef TARGET_ASM_CLOSE_PAREN
42400 #define TARGET_ASM_CLOSE_PAREN ""
42402 #undef TARGET_ASM_BYTE_OP
42403 #define TARGET_ASM_BYTE_OP ASM_BYTE
42405 #undef TARGET_ASM_ALIGNED_HI_OP
42406 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
42407 #undef TARGET_ASM_ALIGNED_SI_OP
42408 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
42409 #ifdef ASM_QUAD
42410 #undef TARGET_ASM_ALIGNED_DI_OP
42411 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
42412 #endif
42414 #undef TARGET_PROFILE_BEFORE_PROLOGUE
42415 #define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue
42417 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
42418 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME ix86_mangle_decl_assembler_name
42420 #undef TARGET_ASM_UNALIGNED_HI_OP
42421 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
42422 #undef TARGET_ASM_UNALIGNED_SI_OP
42423 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
42424 #undef TARGET_ASM_UNALIGNED_DI_OP
42425 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
42427 #undef TARGET_PRINT_OPERAND
42428 #define TARGET_PRINT_OPERAND ix86_print_operand
42429 #undef TARGET_PRINT_OPERAND_ADDRESS
42430 #define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
42431 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
42432 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
42433 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
42434 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
42436 #undef TARGET_SCHED_INIT_GLOBAL
42437 #define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
42438 #undef TARGET_SCHED_ADJUST_COST
42439 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
42440 #undef TARGET_SCHED_ISSUE_RATE
42441 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
42442 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
42443 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
42444 ia32_multipass_dfa_lookahead
42446 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
42447 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
42449 #undef TARGET_MEMMODEL_CHECK
42450 #define TARGET_MEMMODEL_CHECK ix86_memmodel_check
42452 #ifdef HAVE_AS_TLS
42453 #undef TARGET_HAVE_TLS
42454 #define TARGET_HAVE_TLS true
42455 #endif
42456 #undef TARGET_CANNOT_FORCE_CONST_MEM
42457 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
42458 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
42459 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
42461 #undef TARGET_DELEGITIMIZE_ADDRESS
42462 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
42464 #undef TARGET_MS_BITFIELD_LAYOUT_P
42465 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
42467 #if TARGET_MACHO
42468 #undef TARGET_BINDS_LOCAL_P
42469 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
42470 #endif
42471 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
42472 #undef TARGET_BINDS_LOCAL_P
42473 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
42474 #endif
42476 #undef TARGET_ASM_OUTPUT_MI_THUNK
42477 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
42478 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
42479 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
42481 #undef TARGET_ASM_FILE_START
42482 #define TARGET_ASM_FILE_START x86_file_start
42484 #undef TARGET_OPTION_OVERRIDE
42485 #define TARGET_OPTION_OVERRIDE ix86_option_override
#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS ix86_rtx_costs
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST ix86_address_cost

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
#undef TARGET_CC_MODES_COMPATIBLE
#define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg

#undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
#define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value
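
/* Builtin, va_list, and function multiversioning dispatcher hooks.  */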
#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN ix86_fold_builtin

#undef TARGET_COMPARE_VERSION_PRIORITY
#define TARGET_COMPARE_VERSION_PRIORITY ix86_compare_version_priority

#undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
#define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
  ix86_generate_version_dispatcher_body

#undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
#define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
  ix86_get_function_versions_dispatcher

#undef TARGET_ENUM_VA_LIST_P
#define TARGET_ENUM_VA_LIST_P ix86_enum_va_list

#undef TARGET_FN_ABI_VA_LIST
#define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list

#undef TARGET_CANONICAL_VA_LIST_TYPE
#define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start

#undef TARGET_MD_ASM_CLOBBERS
#define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
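
/* Argument passing, varargs, and stack layout hooks.  */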
#undef TARGET_PROMOTE_PROTOTYPES
#define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG ix86_function_arg
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
#undef TARGET_INTERNAL_ARG_POINTER
#define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
#undef TARGET_UPDATE_STACK_BOUNDARY
#define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
#undef TARGET_STATIC_CHAIN
#define TARGET_STATIC_CHAIN ix86_static_chain
#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
#undef TARGET_RETURN_POPS_ARGS
#define TARGET_RETURN_POPS_ARGS ix86_return_pops_args

#undef TARGET_LEGITIMATE_COMBINED_INSN
#define TARGET_LEGITIMATE_COMBINED_INSN ix86_legitimate_combined_insn

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET ix86_asan_shadow_offset

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix

#ifdef HAVE_AS_TLS
#undef TARGET_ASM_OUTPUT_DWARF_DTPREL
#define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
#endif

#ifdef SUBTARGET_INSERT_ATTRIBUTES
#undef TARGET_INSERT_ATTRIBUTES
#define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
#endif

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE ix86_mangle_type

#if !TARGET_MACHO
#undef TARGET_STACK_PROTECT_FAIL
#define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
#endif
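
/* Function value and related ABI hooks.  */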
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE ix86_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode

#undef TARGET_MEMBER_TYPE_FORCES_BLK
#define TARGET_MEMBER_TYPE_FORCES_BLK ix86_member_type_forces_blk

#undef TARGET_INSTANTIATE_DECLS
#define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls
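
/* Register class and reload hooks.  */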
#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD ix86_secondary_reload

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS ix86_class_max_nregs

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
#undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
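
/* Vectorizer cost model and SIMD mode selection hooks.  */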
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  ix86_builtin_vectorization_cost
#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
  ix86_vectorize_vec_perm_const_ok
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  ix86_preferred_simd_mode
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  ix86_autovectorize_vector_sizes
#undef TARGET_VECTORIZE_INIT_COST
#define TARGET_VECTORIZE_INIT_COST ix86_init_cost
#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST ix86_add_stmt_cost
#undef TARGET_VECTORIZE_FINISH_COST
#define TARGET_VECTORIZE_FINISH_COST ix86_finish_cost
#undef TARGET_VECTORIZE_DESTROY_COST_DATA
#define TARGET_VECTORIZE_DESTROY_COST_DATA ix86_destroy_cost_data
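
/* Per-function target option handling: attribute ((target)) validation,
   option save/restore, and function multiversioning.  */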
#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE ix86_function_specific_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE ix86_function_specific_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT ix86_function_specific_print

#undef TARGET_OPTION_FUNCTION_VERSIONS
#define TARGET_OPTION_FUNCTION_VERSIONS ix86_function_versions

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P ix86_can_inline_p

#undef TARGET_EXPAND_TO_RTL_HOOK
#define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_true

#undef TARGET_REGISTER_PRIORITY
#define TARGET_REGISTER_PRIORITY ix86_register_priority

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE ix86_can_eliminate

#undef TARGET_EXTRA_LIVE_ON_ENTRY
#define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry

#undef TARGET_ASM_CODE_END
#define TARGET_ASM_CODE_END ix86_code_end

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage

#if TARGET_MACHO
#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS darwin_rename_builtins
#endif

#undef TARGET_SPILL_CLASS
#define TARGET_SPILL_CLASS ix86_spill_class
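
/* Build the complete target hook vector; TARGET_INITIALIZER picks up the
   TARGET_* overrides defined above and fills every remaining hook with its
   default.  */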
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-i386.h"